From aacb43e5fdd1b0f69f7d99bf5e91529c1f45f0c1 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 12 Dec 2024 13:59:08 -0800 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20change?= =?UTF-8?q?s=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.6-beta.1 [skip ci] --- bolt/docs/BinaryAnalysis.md | 20 + bolt/include/bolt/Rewrite/RewriteInstance.h | 3 + bolt/include/bolt/Utils/CommandLineOpts.h | 2 + bolt/lib/Rewrite/RewriteInstance.cpp | 7 + bolt/lib/Utils/CommandLineOpts.cpp | 2 + bolt/test/CMakeLists.txt | 1 + .../binary-analysis/AArch64/Inputs/dummy.txt | 1 + .../binary-analysis/AArch64/cmdline-args.test | 33 + .../binary-analysis/AArch64/lit.local.cfg | 7 + bolt/test/lit.cfg.py | 1 + bolt/tools/CMakeLists.txt | 1 + bolt/tools/binary-analysis/CMakeLists.txt | 19 + .../tools/binary-analysis/binary-analysis.cpp | 122 + clang/docs/ReleaseNotes.rst | 13 + clang/include/clang-c/CXString.h | 4 + clang/include/clang-c/Index.h | 20 +- clang/include/clang/AST/APValue.h | 6 +- clang/include/clang/AST/Decl.h | 11 +- .../include/clang/AST/DeclContextInternals.h | 10 +- clang/include/clang/AST/DeclTemplate.h | 17 +- clang/include/clang/AST/ExprConcepts.h | 8 +- clang/include/clang/AST/ExternalASTSource.h | 8 +- clang/include/clang/AST/RecursiveASTVisitor.h | 11 +- clang/include/clang/AST/Redeclarable.h | 2 +- clang/include/clang/AST/StmtOpenACC.h | 170 ++ clang/include/clang/AST/TextNodeDumper.h | 4 + clang/include/clang/AST/Type.h | 6 +- .../clang/Basic/DiagnosticSemaKinds.td | 4 +- clang/include/clang/Basic/OpenACCKinds.h | 8 + clang/include/clang/Basic/StmtNodes.td | 4 + clang/include/clang/Lex/PreprocessingRecord.h | 4 +- clang/include/clang/Lex/Preprocessor.h | 6 +- .../include/clang/Serialization/ASTBitCodes.h | 4 + clang/lib/AST/ByteCode/BitcastBuffer.h | 9 + clang/lib/AST/ByteCode/InterpBuiltin.cpp | 90 + 
.../lib/AST/ByteCode/InterpBuiltinBitCast.cpp | 6 +- clang/lib/AST/ByteCode/InterpBuiltinBitCast.h | 8 +- clang/lib/AST/StmtOpenACC.cpp | 86 + clang/lib/AST/StmtPrinter.cpp | 45 + clang/lib/AST/StmtProfile.cpp | 31 + clang/lib/AST/TextNodeDumper.cpp | 19 + clang/lib/Analysis/PathDiagnostic.cpp | 6 +- clang/lib/Analysis/UnsafeBufferUsage.cpp | 35 +- clang/lib/Basic/FileManager.cpp | 2 +- clang/lib/Basic/SourceManager.cpp | 3 +- clang/lib/CodeGen/CGStmt.cpp | 12 + clang/lib/CodeGen/CodeGenFunction.h | 24 + clang/lib/Driver/Driver.cpp | 53 +- clang/lib/Driver/ToolChains/Clang.cpp | 3 +- clang/lib/Driver/ToolChains/CommonArgs.cpp | 9 +- clang/lib/Driver/ToolChains/CommonArgs.h | 2 + clang/lib/Driver/ToolChains/Darwin.cpp | 10 +- clang/lib/Driver/ToolChains/FreeBSD.cpp | 6 +- clang/lib/Driver/ToolChains/Fuchsia.cpp | 10 +- clang/lib/Driver/ToolChains/Gnu.cpp | 7 +- clang/lib/Driver/ToolChains/Hexagon.cpp | 9 +- clang/lib/Driver/ToolChains/NetBSD.cpp | 7 +- clang/lib/Driver/ToolChains/OHOS.cpp | 60 +- clang/lib/Driver/ToolChains/OpenBSD.cpp | 6 +- clang/lib/Driver/ToolChains/Solaris.cpp | 6 +- clang/lib/Index/FileIndexRecord.cpp | 2 +- clang/lib/Index/IndexDecl.cpp | 4 +- clang/lib/Parse/ParseOpenACC.cpp | 15 +- clang/lib/Sema/CheckExprLifetime.cpp | 114 +- clang/lib/Sema/DeclSpec.cpp | 3 +- clang/lib/Sema/SemaExceptionSpec.cpp | 4 + clang/lib/Sema/SemaOpenACC.cpp | 148 +- clang/lib/Sema/TreeTransform.h | 118 + clang/lib/Serialization/ASTReaderStmt.cpp | 41 + clang/lib/Serialization/ASTWriterStmt.cpp | 25 + .../Checkers/WebKit/ASTUtils.cpp | 2 +- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 4 + clang/test/AST/ByteCode/builtin-functions.cpp | 48 + clang/test/AST/ByteCode/complex.cpp | 8 - .../AST/ast-print-openacc-data-construct.cpp | 82 + .../Analysis/Checkers/WebKit/call-args.cpp | 11 + clang/test/CodeGenCXX/ext-int.cpp | 18 - clang/test/Driver/config-file3.c | 23 + .../Driver/print-supported-extensions-riscv.c | 1 + clang/test/Driver/stack-clash-protection.c | 5 + 
clang/test/Driver/sysroot.c | 3 +- clang/test/ParserOpenACC/parse-clauses.c | 46 +- clang/test/ParserOpenACC/parse-clauses.cpp | 1 - clang/test/ParserOpenACC/parse-constructs.c | 12 +- clang/test/Sema/Inputs/lifetime-analysis.h | 5 + .../Sema/warn-lifetime-analysis-nocfg.cpp | 90 +- clang/test/SemaCXX/ext-int.cpp | 4 +- .../warn-unsafe-buffer-usage-array.cpp | 40 + .../warn-unsafe-buffer-usage-field-attr.cpp | 1 - ...e-buffer-usage-fixits-parm-unsupported.cpp | 2 +- .../test/SemaCXX/warn-unsafe-buffer-usage.cpp | 39 +- .../combined-construct-collapse-clause.cpp | 5 +- .../combined-construct-default-ast.cpp | 1 - .../combined-construct-default-clause.c | 2 - .../combined-construct-if-clause.c | 2 - .../compute-construct-default-clause.c | 2 - .../compute-construct-device_type-clause.c | 6 +- .../SemaOpenACC/compute-construct-if-clause.c | 2 - clang/test/SemaOpenACC/data-construct-ast.cpp | 91 + .../SemaOpenACC/data-construct-async-ast.cpp | 61 + .../SemaOpenACC/data-construct-async-clause.c | 44 + .../data-construct-default-ast.cpp | 68 + .../data-construct-default-clause.c | 24 + .../data-construct-device_type-ast.cpp | 39 + .../data-construct-device_type-clause.c | 54 + .../SemaOpenACC/data-construct-if-ast.cpp | 131 + .../SemaOpenACC/data-construct-if-clause.c | 35 + .../SemaOpenACC/data-construct-wait-ast.cpp | 230 ++ .../SemaOpenACC/data-construct-wait-clause.c | 50 + clang/test/SemaOpenACC/data-construct.cpp | 219 ++ .../loop-construct-collapse-clause.cpp | 5 +- clang/tools/libclang/CIndex.cpp | 41 +- clang/tools/libclang/CIndexCXX.cpp | 8 +- clang/tools/libclang/CXCursor.cpp | 12 + .../unittests/AST/ASTContextParentMapTest.cpp | 49 + .../Analysis/FlowSensitive/TransferTest.cpp | 18 +- .../Serialization/LoadSpecLazilyTest.cpp | 2 + .../TableGen/ClangDiagnosticsEmitter.cpp | 4 +- compiler-rt/lib/builtins/CMakeLists.txt | 4 +- .../lib/interception/interception_win.cpp | 5 +- .../tests/interception_win_test.cpp | 1 + .../lib/rtsan/rtsan_interceptors_posix.cpp | 1 
+ flang/examples/FeatureList/FeatureList.cpp | 3 +- .../include/flang/Optimizer/HLFIR/HLFIROps.td | 1 + flang/include/flang/Parser/dump-parse-tree.h | 8 +- .../include/flang/Parser/parse-tree-visitor.h | 34 - flang/include/flang/Parser/parse-tree.h | 75 +- .../flang/Semantics/openmp-modifiers.h | 2 + flang/lib/Lower/OpenMP/Clauses.cpp | 65 +- flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp | 9 + .../OpenMP/MapsForPrivatizedSymbols.cpp | 9 +- flang/lib/Parser/openmp-parsers.cpp | 82 +- flang/lib/Parser/unparse.cpp | 73 +- flang/lib/Semantics/check-omp-structure.cpp | 488 ++-- flang/lib/Semantics/check-omp-structure.h | 11 +- flang/lib/Semantics/openmp-modifiers.cpp | 33 + flang/lib/Semantics/resolve-directives.cpp | 15 +- flang/lib/Semantics/resolve-names.cpp | 20 + flang/test/HLFIR/shapeof.fir | 18 + .../Parser/OpenMP/in-reduction-clause.f90 | 12 +- flang/test/Parser/OpenMP/linear-clause.f90 | 117 + .../test/Parser/OpenMP/reduction-modifier.f90 | 2 +- .../Parser/OpenMP/task-reduction-clause.f90 | 23 + .../directive-contin-with-pp.F90 | 6 +- .../Semantics/OpenMP/clause-validity01.f90 | 8 + flang/test/Semantics/OpenMP/in-reduction.f90 | 70 + .../test/Semantics/OpenMP/linear-clause01.f90 | 12 +- .../test/Semantics/OpenMP/linear-clause02.f90 | 13 + flang/test/Semantics/OpenMP/linear-iter.f90 | 14 +- flang/test/Semantics/OpenMP/symbol08.f90 | 5 +- .../test/Semantics/OpenMP/task-reduction.f90 | 70 + flang/test/Semantics/OpenMP/taskgroup01.f90 | 2 + flang/test/Semantics/modfile55.cuf | 1 + flang/unittests/Runtime/AccessTest.cpp | 70 + libc/CMakeLists.txt | 6 + libc/docs/headers/arpa/inet.rst | 50 + libc/docs/headers/assert.rst | 4 +- libc/docs/headers/ctype.rst | 58 +- libc/docs/headers/errno.rst | 10 +- libc/docs/headers/fenv.rst | 52 +- libc/docs/headers/float.rst | 76 +- libc/docs/headers/index.rst | 2 + libc/docs/headers/inttypes.rst | 14 +- libc/docs/headers/locale.rst | 36 +- libc/docs/headers/signal.rst | 92 +- libc/docs/headers/stdlib.rst | 90 +- 
libc/docs/headers/string.rst | 64 +- libc/docs/headers/strings.rst | 26 +- libc/docs/headers/sys/mman.rst | 179 ++ libc/docs/headers/threads.rst | 62 +- libc/docs/headers/uchar.rst | 12 +- libc/docs/headers/wchar.rst | 4 +- libc/docs/headers/wctype.rst | 2 +- libc/utils/docgen/arpa/inet.yaml | 18 + libc/utils/docgen/assert.json | 10 - libc/utils/docgen/assert.yaml | 7 + libc/utils/docgen/ctype.json | 47 - libc/utils/docgen/ctype.yaml | 72 + libc/utils/docgen/docgen.py | 70 +- libc/utils/docgen/errno.json | 16 - libc/utils/docgen/errno.yaml | 14 + libc/utils/docgen/fenv.json | 114 - libc/utils/docgen/fenv.yaml | 97 + libc/utils/docgen/float.json | 163 -- libc/utils/docgen/float.yaml | 143 ++ libc/utils/docgen/header.py | 11 +- libc/utils/docgen/inttypes.json | 22 - libc/utils/docgen/inttypes.yaml | 20 + libc/utils/docgen/locale.json | 30 - libc/utils/docgen/locale.yaml | 37 + libc/utils/docgen/setjmp.json | 15 - libc/utils/docgen/setjmp.yaml | 15 + libc/utils/docgen/signal.json | 152 -- libc/utils/docgen/signal.yaml | 102 + libc/utils/docgen/stdbit.json | 270 -- libc/utils/docgen/stdbit.yaml | 179 ++ libc/utils/docgen/stdlib.json | 174 -- libc/utils/docgen/stdlib.yaml | 158 ++ libc/utils/docgen/string.json | 99 - libc/utils/docgen/string.yaml | 94 + libc/utils/docgen/strings.json | 40 - libc/utils/docgen/sys/mman.yaml | 77 + libc/utils/docgen/threads.json | 90 - libc/utils/docgen/threads.yaml | 88 + libc/utils/docgen/uchar.json | 27 - libc/utils/docgen/uchar.yaml | 21 + libc/utils/docgen/wchar.json | 198 -- libc/utils/docgen/wchar.yaml | 131 + libc/utils/docgen/wctype.json | 58 - libc/utils/docgen/wctype.yaml | 36 + .../algorithms.partition_point.bench.cpp | 4 +- ...exicographical_compare_three_way.bench.cpp | 0 .../{ => containers}/ContainerBenchmarks.h | 2 +- .../{ => containers}/deque.bench.cpp | 2 +- .../{ => containers}/deque_iterator.bench.cpp | 0 .../benchmarks/{ => containers}/map.bench.cpp | 2 +- .../{ => containers}/ordered_set.bench.cpp | 2 +- .../{ => 
containers}/string.bench.cpp | 4 +- .../unordered_set_operations.bench.cpp | 2 +- .../vector_operations.bench.cpp | 2 +- .../benchmarks/{ => format}/format.bench.cpp | 0 .../{ => format}/format_to.bench.cpp | 0 .../{ => format}/format_to_n.bench.cpp | 0 .../{ => format}/formatted_size.bench.cpp | 0 .../{ => format}/formatter_float.bench.cpp | 2 +- .../{ => format}/formatter_int.bench.cpp | 2 +- .../std_format_spec_string_unicode.bench.cpp | 0 ...ormat_spec_string_unicode_escape.bench.cpp | 0 lldb/include/lldb/Target/StackFrameList.h | 66 +- lldb/source/Target/StackFrameList.cpp | 237 +- lldb/source/Target/Thread.cpp | 2 +- .../api/multithreaded/TestMultithreaded.py | 17 +- .../test/API/api/multithreaded/deep_stack.cpp | 17 + .../test_concurrent_unwind.cpp.template | 91 + lldb/unittests/Host/PipeTest.cpp | 3 + llvm/Maintainers.md | 10 +- llvm/docs/RISCVUsage.rst | 3 + llvm/docs/ReleaseNotes.md | 2 + .../MyFirstLanguageFrontend/LangImpl07.rst | 8 +- llvm/examples/Kaleidoscope/Chapter7/toy.cpp | 4 +- llvm/include/llvm/ADT/StringTable.h | 91 + llvm/include/llvm/Analysis/IVDescriptors.h | 38 +- llvm/include/llvm/Analysis/PtrUseVisitor.h | 14 + .../include/llvm/Analysis/TargetLibraryInfo.h | 12 + llvm/include/llvm/CodeGen/MachineScheduler.h | 12 +- llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 6 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 2 +- .../include/llvm/Transforms/Utils/Evaluator.h | 3 - .../include/llvm/Transforms/Utils/LoopUtils.h | 6 + .../llvm/Transforms/Utils/SSAUpdater.h | 7 + llvm/lib/Analysis/IVDescriptors.cpp | 113 +- llvm/lib/Analysis/TargetLibraryInfo.cpp | 23 +- llvm/lib/Analysis/ValueTracking.cpp | 4 +- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 77 +- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 3 +- llvm/lib/CodeGen/MachineScheduler.cpp | 85 +- llvm/lib/CodeGen/SelectOptimize.cpp | 90 +- llvm/lib/CodeGen/VLIWMachineScheduler.cpp | 7 +- .../Parallel/DWARFLinkerCompileUnit.cpp | 8 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 7 +- 
llvm/lib/MC/MCParser/AsmParser.cpp | 26 +- llvm/lib/Support/Windows/Path.inc | 36 +- .../AArch64/AArch64ExpandPseudoInsts.cpp | 34 + .../Target/AArch64/AArch64ISelLowering.cpp | 71 + .../Target/AArch64/AArch64RegisterInfo.cpp | 52 + llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 5 + llvm/lib/Target/AArch64/SMEInstrFormats.td | 28 +- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 39 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 + llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 3 +- .../M68k/Disassembler/M68kDisassembler.cpp | 8 +- llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp | 18 +- llvm/lib/Target/M68k/M68kInstrAtomics.td | 89 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 22 +- .../RISCV/Disassembler/RISCVDisassembler.cpp | 2 + llvm/lib/Target/RISCV/RISCVFeatures.td | 8 + llvm/lib/Target/RISCV/RISCVGISel.td | 9 - llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 23 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 112 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 33 +- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 75 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 2 + llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 46 + llvm/lib/TargetParser/RISCVISAInfo.cpp | 2 +- llvm/lib/Transforms/IPO/AlwaysInliner.cpp | 42 +- .../Scalar/InductiveRangeCheckElimination.cpp | 100 +- llvm/lib/Transforms/Scalar/SROA.cpp | 108 +- llvm/lib/Transforms/Utils/Evaluator.cpp | 40 +- llvm/lib/Transforms/Utils/LoopUtils.cpp | 19 + llvm/lib/Transforms/Utils/SSAUpdater.cpp | 14 +- .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 82 +- .../Transforms/Vectorize/LoopVectorize.cpp | 80 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 36 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 87 +- llvm/lib/Transforms/Vectorize/VPlan.h | 88 +- .../Transforms/Vectorize/VPlanAnalysis.cpp | 2 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 39 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 7 +- 
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2 - llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +- .../Transforms/Vectorize/VPlanVerifier.cpp | 2 +- .../Transforms/Vectorize/VectorCombine.cpp | 114 +- .../AArch64/Atomics/aarch64-atomicrmw-lse2.ll | 80 +- .../Atomics/aarch64-atomicrmw-lse2_lse128.ll | 80 +- .../aarch64-atomicrmw-outline_atomics.ll | 80 +- .../AArch64/Atomics/aarch64-atomicrmw-rcpc.ll | 80 +- .../Atomics/aarch64-atomicrmw-rcpc3.ll | 80 +- .../Atomics/aarch64-atomicrmw-v8_1a.ll | 80 +- .../AArch64/Atomics/aarch64-atomicrmw-v8a.ll | 80 +- .../AArch64/GlobalISel/legalize-cmp.mir | 8 +- .../GlobalISel/legalize-threeway-cmp.mir | 16 +- llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll | 28 +- .../CodeGen/AArch64/dump-schedule-trace.mir | 4 +- .../AArch64/force-enable-intervals.mir | 4 +- llvm/test/CodeGen/AArch64/icmp.ll | 32 +- .../misched-detail-resource-booking-01.mir | 2 +- .../misched-detail-resource-booking-02.mir | 2 +- .../misched-sort-resource-in-trace.mir | 4 +- llvm/test/CodeGen/AArch64/scmp.ll | 14 +- llvm/test/CodeGen/AArch64/selectopt-cast.ll | 124 + .../AArch64/sme2-intrinsics-int-dots.ll | 492 +++- .../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 284 ++- llvm/test/CodeGen/AArch64/ucmp.ll | 28 +- .../AArch64/vecreduce-umax-legalization.ll | 7 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 192 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 210 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 97 + .../av-spill-expansion-with-machine-cp.mir | 28 +- llvm/test/CodeGen/AMDGPU/bypass-div.ll | 119 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 90 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 237 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 126 +- llvm/test/CodeGen/ARM/single-issue-r52.mir | 8 +- llvm/test/CodeGen/M68k/Atomics/load-store.ll | 48 + llvm/test/CodeGen/M68k/Atomics/rmw.ll | 141 ++ .../M68k/CodeModel/Large/Atomics/cmpxchg.ll | 316 +++ .../M68k/CodeModel/Large/Atomics/fence.ll | 41 + .../CodeModel/Large/Atomics/load-store.ll | 1161 +++++++++ 
.../M68k/CodeModel/Large/Atomics/rmw.ll | 1390 +++++++++++ .../M68k/CodeModel/{ => Large}/large-pic.ll | 0 .../{ => Large}/large-pie-global-access.ll | 0 .../M68k/CodeModel/{ => Large}/large-pie.ll | 0 .../CodeModel/{ => Large}/large-static.ll | 0 .../M68k/CodeModel/{ => Medium}/medium-pic.ll | 0 .../{ => Medium}/medium-pie-global-access.ll | 0 .../M68k/CodeModel/{ => Medium}/medium-pie.ll | 0 .../CodeModel/{ => Medium}/medium-static.ll | 0 .../M68k/CodeModel/{ => Small}/small-pic.ll | 0 .../{ => Small}/small-pie-global-access.ll | 0 .../M68k/CodeModel/{ => Small}/small-pie.ll | 0 .../CodeModel/{ => Small}/small-static.ll | 0 llvm/test/CodeGen/M68k/TLS/tls-arid.ll | 19 + .../Mips/GlobalISel/legalizer/icmp.mir | 64 +- .../CodeGen/Mips/GlobalISel/llvm-ir/icmp.ll | 24 +- llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll | 28 +- llvm/test/CodeGen/PowerPC/aix-cc-abi.ll | 109 +- .../legalizer/legalize-addo-subo-rv32.mir | 48 +- .../legalizer/legalize-icmp-rv32.mir | 178 +- .../legalizer/legalize-icmp-rv64.mir | 192 +- .../legalizer/legalize-sat-rv32.mir | 48 +- .../legalizer/legalize-smax-rv32.mir | 8 +- .../legalizer/legalize-smin-rv32.mir | 8 +- .../legalizer/legalize-umax-rv32.mir | 8 +- .../legalizer/legalize-umin-rv32.mir | 8 +- llvm/test/CodeGen/RISCV/and-shl.ll | 79 + llvm/test/CodeGen/RISCV/attributes.ll | 2 + .../RISCV/rvv/fixed-vectors-fp-shuffles.ll | 73 + .../CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll | 176 +- .../RISCV/rvv/fixed-vectors-int-buildvec.ll | 8 +- .../RISCV/rvv/fixed-vectors-int-shuffles.ll | 100 +- .../rvv/fixed-vectors-interleaved-access.ll | 1330 +++++----- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 516 ++-- .../RISCV/rvv/fixed-vectors-masked-scatter.ll | 472 ++-- .../rvv/fixed-vectors-shuffle-deinterleave.ll | 61 +- .../CodeGen/RISCV/rvv/sink-splat-operands.ll | 160 ++ llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 293 +++ .../test/CodeGen/RISCV/rvv/vl-opt-op-info.mir | 60 + .../RISCV/sifive7-enable-intervals.mir | 2 +- 
llvm/test/CodeGen/X86/handle-move.ll | 4 +- llvm/test/CodeGen/X86/isel-select-cmov.ll | 24 +- llvm/test/CodeGen/X86/misched-aa-colored.ll | 2 +- llvm/test/CodeGen/X86/misched-matrix.ll | 2 +- llvm/test/CodeGen/X86/misched-new.ll | 4 +- llvm/test/MC/Disassembler/M68k/control.txt | 2 +- .../test/MC/ELF/debug-hash-file-empty-dwarf.s | 26 + llvm/test/MC/ELF/debug-hash-file.s | 6 + llvm/test/MC/M68k/Atomics/cas.s | 42 +- llvm/test/MC/RISCV/xqcics-invalid.s | 121 + llvm/test/MC/RISCV/xqcics-valid.s | 147 ++ .../GlobalOpt/evaluate-call-errors.ll | 3 +- .../GlobalOpt/evaluate-constfold-call.ll | 11 +- .../GlobalOpt/evaluate-ret-void-mismatch.ll | 26 + llvm/test/Transforms/IRCE/low-iterations.ll | 4 +- llvm/test/Transforms/IRCE/profitability.ll | 38 + .../Transforms/Inline/always-inline-bfi.ll | 101 + .../InstCombine/fpclass-from-dom-cond.ll | 62 + .../Transforms/InstCombine/stdio-custom-dl.ll | 5 +- .../InstCombine/strcpy-nonzero-as.ll | 4 +- .../AArch64/sve-tail-folding-forced.ll | 2 +- .../LoopVectorize/AArch64/sve-widen-gep.ll | 3 + .../AArch64/sve2-histcnt-vplan.ll | 6 + .../AArch64/synthesize-mask-for-call.ll | 18 + .../widen-call-with-intrinsic-or-libfunc.ll | 6 + .../PowerPC/vplan-force-tail-with-evl.ll | 6 + .../RISCV/riscv-vector-reverse.ll | 98 +- .../RISCV/vplan-vp-intrinsics-reduction.ll | 5 +- .../RISCV/vplan-vp-select-intrinsics.ll | 13 +- .../first-order-recurrence-chains-vplan.ll | 12 + ...-order-recurrence-sink-replicate-region.ll | 17 +- .../Transforms/LoopVectorize/icmp-uniforms.ll | 3 + .../interleave-and-scalarize-only.ll | 7 +- .../LoopVectorize/iv-select-cmp-blend.ll | 87 + .../LoopVectorize/iv-select-cmp-no-wrap.ll | 88 +- .../LoopVectorize/iv-select-cmp-trunc.ll | 1216 +++++++-- .../Transforms/LoopVectorize/iv-select-cmp.ll | 2195 +++++++++++++++-- .../LoopVectorize/select-min-index.ll | 1035 ++++++-- .../uncountable-early-exit-vplan.ll | 6 +- .../LoopVectorize/vplan-dot-printing.ll | 3 +- .../LoopVectorize/vplan-iv-transforms.ll | 3 + 
.../LoopVectorize/vplan-predicate-switch.ll | 13 +- .../vplan-printing-before-execute.ll | 16 +- .../vplan-printing-outer-loop.ll | 3 + .../LoopVectorize/vplan-printing.ll | 46 +- .../vplan-sink-scalars-and-merge-vf1.ll | 3 + .../vplan-sink-scalars-and-merge.ll | 32 +- .../vplan-unused-interleave-group.ll | 3 + .../X86/distinct-index-width-crash.ll | 4 +- .../AArch64/hoist-runtime-checks.ll | 28 +- .../PhaseOrdering/X86/concat-boolmasks.ll | 252 -- .../X86/hoist-load-of-baseptr.ll | 8 +- .../X86/preserve-access-group.ll | 10 +- .../Transforms/SLPVectorizer/RISCV/revec.ll | 99 +- .../SROA/non-capturing-call-readonly.ll | 99 +- .../test/Transforms/SROA/readonlynocapture.ll | 31 +- .../VectorCombine/X86/concat-boolmasks.ll | 293 +++ .../test/tools/llc/new-pm/regalloc-amdgpu.mir | 4 +- llvm/unittests/ADT/CMakeLists.txt | 1 + llvm/unittests/ADT/StringTableTest.cpp | 41 + .../Frontend/OpenMPDecompositionTest.cpp | 15 +- .../Frontend/OpenMPIRBuilderTest.cpp | 17 +- llvm/unittests/IR/IRBuilderTest.cpp | 8 +- llvm/unittests/ProfileData/MemProfTest.cpp | 112 +- .../TargetParser/RISCVISAInfoTest.cpp | 1 + .../Transforms/Vectorize/VPlanHCFGTest.cpp | 3 +- .../Transforms/Vectorize/VPlanTest.cpp | 9 +- .../Vectorize/VPlanVerifierTest.cpp | 2 +- .../gn/secondary/llvm/unittests/ADT/BUILD.gn | 1 + mlir/CMakeLists.txt | 3 + mlir/cmake/modules/AddMLIR.cmake | 20 + mlir/include/mlir/Conversion/Passes.td | 6 +- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 8 + .../ComplexToStandard/ComplexToStandard.cpp | 155 -- .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 4 +- .../Tensor/IR/TensorTilingInterfaceImpl.cpp | 29 +- .../Tosa/Transforms/TosaValidation.cpp | 19 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 188 +- .../convert-to-standard.mlir | 539 +--- .../GPUToLLVMSPV/gpu-to-llvm-spv.mlir | 6 +- mlir/test/Dialect/Tosa/level_check.mlir | 8 + ...target-byref-bycopy-generation-device.mlir | 5 +- .../omptarget-declare-target-llvm-device.mlir | 2 +- .../openmp-target-multiple-private.mlir | 
80 + .../openmp-target-private-allocatable.mlir | 64 + .../Target/LLVMIR/openmp-target-private.mlir | 89 + .../openmp-target-use-device-nested.mlir | 6 +- mlir/test/Target/LLVMIR/openmp-todo.mlir | 18 - mlir/tools/mlir-cpu-runner/CMakeLists.txt | 8 +- mlir/tools/mlir-lsp-server/CMakeLists.txt | 7 +- mlir/tools/mlir-opt/CMakeLists.txt | 8 +- .../bytecode/CMakeLists.txt | 2 +- .../mlir-parser-fuzzer/text/CMakeLists.txt | 2 +- mlir/tools/mlir-query/CMakeLists.txt | 4 +- mlir/tools/mlir-reduce/CMakeLists.txt | 7 +- mlir/tools/mlir-rewrite/CMakeLists.txt | 5 +- mlir/tools/mlir-translate/CMakeLists.txt | 9 +- offload/DeviceRTL/include/Synchronization.h | 8 + offload/DeviceRTL/src/Synchronization.cpp | 44 +- 479 files changed, 20921 insertions(+), 8006 deletions(-) create mode 100644 bolt/docs/BinaryAnalysis.md create mode 100644 bolt/test/binary-analysis/AArch64/Inputs/dummy.txt create mode 100644 bolt/test/binary-analysis/AArch64/cmdline-args.test create mode 100644 bolt/test/binary-analysis/AArch64/lit.local.cfg create mode 100644 bolt/tools/binary-analysis/CMakeLists.txt create mode 100644 bolt/tools/binary-analysis/binary-analysis.cpp create mode 100644 clang/test/AST/ast-print-openacc-data-construct.cpp create mode 100644 clang/test/SemaOpenACC/data-construct-ast.cpp create mode 100644 clang/test/SemaOpenACC/data-construct-async-ast.cpp create mode 100644 clang/test/SemaOpenACC/data-construct-async-clause.c create mode 100644 clang/test/SemaOpenACC/data-construct-default-ast.cpp create mode 100644 clang/test/SemaOpenACC/data-construct-default-clause.c create mode 100644 clang/test/SemaOpenACC/data-construct-device_type-ast.cpp create mode 100644 clang/test/SemaOpenACC/data-construct-device_type-clause.c create mode 100644 clang/test/SemaOpenACC/data-construct-if-ast.cpp create mode 100644 clang/test/SemaOpenACC/data-construct-if-clause.c create mode 100644 clang/test/SemaOpenACC/data-construct-wait-ast.cpp create mode 100644 
clang/test/SemaOpenACC/data-construct-wait-clause.c create mode 100644 clang/test/SemaOpenACC/data-construct.cpp create mode 100644 flang/test/Parser/OpenMP/linear-clause.f90 create mode 100644 flang/test/Parser/OpenMP/task-reduction-clause.f90 create mode 100644 flang/test/Semantics/OpenMP/in-reduction.f90 create mode 100644 flang/test/Semantics/OpenMP/linear-clause02.f90 create mode 100644 flang/test/Semantics/OpenMP/task-reduction.f90 create mode 100644 libc/docs/headers/arpa/inet.rst create mode 100644 libc/docs/headers/sys/mman.rst create mode 100644 libc/utils/docgen/arpa/inet.yaml delete mode 100644 libc/utils/docgen/assert.json create mode 100644 libc/utils/docgen/assert.yaml delete mode 100644 libc/utils/docgen/ctype.json create mode 100644 libc/utils/docgen/ctype.yaml delete mode 100644 libc/utils/docgen/errno.json create mode 100644 libc/utils/docgen/errno.yaml delete mode 100644 libc/utils/docgen/fenv.json create mode 100644 libc/utils/docgen/fenv.yaml delete mode 100644 libc/utils/docgen/float.json create mode 100644 libc/utils/docgen/float.yaml delete mode 100644 libc/utils/docgen/inttypes.json create mode 100644 libc/utils/docgen/inttypes.yaml delete mode 100644 libc/utils/docgen/locale.json create mode 100644 libc/utils/docgen/locale.yaml delete mode 100644 libc/utils/docgen/setjmp.json create mode 100644 libc/utils/docgen/setjmp.yaml delete mode 100644 libc/utils/docgen/signal.json create mode 100644 libc/utils/docgen/signal.yaml delete mode 100644 libc/utils/docgen/stdbit.json create mode 100644 libc/utils/docgen/stdbit.yaml delete mode 100644 libc/utils/docgen/stdlib.json create mode 100644 libc/utils/docgen/stdlib.yaml delete mode 100644 libc/utils/docgen/string.json create mode 100644 libc/utils/docgen/string.yaml delete mode 100644 libc/utils/docgen/strings.json create mode 100644 libc/utils/docgen/sys/mman.yaml delete mode 100644 libc/utils/docgen/threads.json create mode 100644 libc/utils/docgen/threads.yaml delete mode 100644 
libc/utils/docgen/uchar.json create mode 100644 libc/utils/docgen/uchar.yaml delete mode 100644 libc/utils/docgen/wchar.json create mode 100644 libc/utils/docgen/wchar.yaml delete mode 100644 libc/utils/docgen/wctype.json create mode 100644 libc/utils/docgen/wctype.yaml rename libcxx/test/benchmarks/{ => algorithms}/algorithms.partition_point.bench.cpp (98%) rename libcxx/test/benchmarks/{ => algorithms}/lexicographical_compare_three_way.bench.cpp (100%) rename libcxx/test/benchmarks/{ => containers}/ContainerBenchmarks.h (99%) rename libcxx/test/benchmarks/{ => containers}/deque.bench.cpp (98%) rename libcxx/test/benchmarks/{ => containers}/deque_iterator.bench.cpp (100%) rename libcxx/test/benchmarks/{ => containers}/map.bench.cpp (99%) rename libcxx/test/benchmarks/{ => containers}/ordered_set.bench.cpp (99%) rename libcxx/test/benchmarks/{ => containers}/string.bench.cpp (99%) rename libcxx/test/benchmarks/{ => containers}/unordered_set_operations.bench.cpp (99%) rename libcxx/test/benchmarks/{ => containers}/vector_operations.bench.cpp (99%) rename libcxx/test/benchmarks/{ => format}/format.bench.cpp (100%) rename libcxx/test/benchmarks/{ => format}/format_to.bench.cpp (100%) rename libcxx/test/benchmarks/{ => format}/format_to_n.bench.cpp (100%) rename libcxx/test/benchmarks/{ => format}/formatted_size.bench.cpp (100%) rename libcxx/test/benchmarks/{ => format}/formatter_float.bench.cpp (99%) rename libcxx/test/benchmarks/{ => format}/formatter_int.bench.cpp (99%) rename libcxx/test/benchmarks/{ => format}/std_format_spec_string_unicode.bench.cpp (100%) rename libcxx/test/benchmarks/{ => format}/std_format_spec_string_unicode_escape.bench.cpp (100%) create mode 100644 lldb/test/API/api/multithreaded/deep_stack.cpp create mode 100644 lldb/test/API/api/multithreaded/test_concurrent_unwind.cpp.template create mode 100644 llvm/include/llvm/ADT/StringTable.h create mode 100644 llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/cmpxchg.ll create mode 100644 
llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/fence.ll create mode 100644 llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/load-store.ll create mode 100644 llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/rmw.ll rename llvm/test/CodeGen/M68k/CodeModel/{ => Large}/large-pic.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Large}/large-pie-global-access.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Large}/large-pie.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Large}/large-static.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Medium}/medium-pic.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Medium}/medium-pie-global-access.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Medium}/medium-pie.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Medium}/medium-static.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Small}/small-pic.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Small}/small-pie-global-access.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Small}/small-pie.ll (100%) rename llvm/test/CodeGen/M68k/CodeModel/{ => Small}/small-static.ll (100%) create mode 100644 llvm/test/CodeGen/M68k/TLS/tls-arid.ll create mode 100644 llvm/test/CodeGen/RISCV/and-shl.ll create mode 100644 llvm/test/MC/ELF/debug-hash-file-empty-dwarf.s create mode 100644 llvm/test/MC/RISCV/xqcics-invalid.s create mode 100644 llvm/test/MC/RISCV/xqcics-valid.s create mode 100644 llvm/test/Transforms/GlobalOpt/evaluate-ret-void-mismatch.ll create mode 100644 llvm/test/Transforms/IRCE/profitability.ll create mode 100644 llvm/test/Transforms/Inline/always-inline-bfi.ll create mode 100644 llvm/test/Transforms/LoopVectorize/iv-select-cmp-blend.ll delete mode 100644 llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll create mode 100644 llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll create mode 100644 llvm/unittests/ADT/StringTableTest.cpp create mode 100644 mlir/test/Target/LLVMIR/openmp-target-multiple-private.mlir 
create mode 100644 mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir diff --git a/bolt/docs/BinaryAnalysis.md b/bolt/docs/BinaryAnalysis.md new file mode 100644 index 0000000000000..f91b77d046de8 --- /dev/null +++ b/bolt/docs/BinaryAnalysis.md @@ -0,0 +1,20 @@ +# BOLT-based binary analysis + +As part of post-link-time optimizing, BOLT needs to perform a range of analyses +on binaries such as recontructing control flow graphs, and more. + +The `llvm-bolt-binary-analysis` tool enables running requested binary analyses +on binaries, and generating reports. It does this by building on top of the +analyses implemented in the BOLT libraries. + +## Which binary analyses are implemented? + +At the moment, no binary analyses are implemented. + +The goal is to make it easy using a plug-in framework to add your own analyses. + +## How to add your own binary analysis + +_TODO: this section needs to be written. Ideally, we should have a simple +"example" or "template" analysis that can be the starting point for implementing +custom analyses_ diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 73d2857f946cc..42094cb732107 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -164,6 +164,9 @@ class RewriteInstance { void preregisterSections(); + /// run analyses requested in binary analysis mode. + void runBinaryAnalyses(); + /// Run optimizations that operate at the binary, or post-linker, level. 
void runOptimizationPasses(); diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h index 04bf7db5de952..111eb650c3746 100644 --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -18,6 +18,7 @@ namespace opts { extern bool HeatmapMode; +extern bool BinaryAnalysisMode; extern llvm::cl::OptionCategory BoltCategory; extern llvm::cl::OptionCategory BoltDiffCategory; @@ -27,6 +28,7 @@ extern llvm::cl::OptionCategory BoltOutputCategory; extern llvm::cl::OptionCategory AggregatorCategory; extern llvm::cl::OptionCategory BoltInstrCategory; extern llvm::cl::OptionCategory HeatmapCategory; +extern llvm::cl::OptionCategory BinaryAnalysisCategory; extern llvm::cl::opt AlignText; extern llvm::cl::opt AlignFunctions; diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 76e1f0156f828..dfac662aebb6f 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -699,6 +699,11 @@ Error RewriteInstance::run() { if (opts::DiffOnly) return Error::success(); + if (opts::BinaryAnalysisMode) { + runBinaryAnalyses(); + return Error::success(); + } + preregisterSections(); runOptimizationPasses(); @@ -3475,6 +3480,8 @@ void RewriteInstance::runOptimizationPasses() { BC->logBOLTErrorsAndQuitOnFatal(BinaryFunctionPassManager::runAllPasses(*BC)); } +void RewriteInstance::runBinaryAnalyses() {} + void RewriteInstance::preregisterSections() { // Preregister sections before emission to set their order in the output. 
const unsigned ROFlags = BinarySection::getFlags(/*IsReadOnly*/ true, diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index de82420a16713..17f090aa61ee9 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -29,6 +29,7 @@ const char *BoltRevision = namespace opts { bool HeatmapMode = false; +bool BinaryAnalysisMode = false; cl::OptionCategory BoltCategory("BOLT generic options"); cl::OptionCategory BoltDiffCategory("BOLTDIFF generic options"); @@ -38,6 +39,7 @@ cl::OptionCategory BoltOutputCategory("Output options"); cl::OptionCategory AggregatorCategory("Data aggregation options"); cl::OptionCategory BoltInstrCategory("BOLT instrumentation options"); cl::OptionCategory HeatmapCategory("Heatmap options"); +cl::OptionCategory BinaryAnalysisCategory("BinaryAnalysis options"); cl::opt AlignText("align-text", cl::desc("alignment of .text section"), cl::Hidden, diff --git a/bolt/test/CMakeLists.txt b/bolt/test/CMakeLists.txt index d468ff984840f..6e18b028bddfc 100644 --- a/bolt/test/CMakeLists.txt +++ b/bolt/test/CMakeLists.txt @@ -37,6 +37,7 @@ list(APPEND BOLT_TEST_DEPS lld llvm-config llvm-bolt + llvm-bolt-binary-analysis llvm-bolt-heatmap llvm-bat-dump llvm-dwarfdump diff --git a/bolt/test/binary-analysis/AArch64/Inputs/dummy.txt b/bolt/test/binary-analysis/AArch64/Inputs/dummy.txt new file mode 100644 index 0000000000000..2995a4d0e7491 --- /dev/null +++ b/bolt/test/binary-analysis/AArch64/Inputs/dummy.txt @@ -0,0 +1 @@ +dummy \ No newline at end of file diff --git a/bolt/test/binary-analysis/AArch64/cmdline-args.test b/bolt/test/binary-analysis/AArch64/cmdline-args.test new file mode 100644 index 0000000000000..e414818644a3b --- /dev/null +++ b/bolt/test/binary-analysis/AArch64/cmdline-args.test @@ -0,0 +1,33 @@ +# This file tests error messages produced on invalid command line arguments. +# It also checks that help messages are generated as expected. 
+ +# Verify that an error message is provided if an input file is missing or incorrect + +RUN: not llvm-bolt-binary-analysis 2>&1 | FileCheck -check-prefix=NOFILEARG %s +NOFILEARG: llvm-bolt-binary-analysis: Not enough positional command line arguments specified! +NOFILEARG-NEXT: Must specify at least 1 positional argument: See: {{.*}}llvm-bolt-binary-analysis --help + +RUN: not llvm-bolt-binary-analysis non-existing-file 2>&1 | FileCheck -check-prefix=NONEXISTINGFILEARG %s +NONEXISTINGFILEARG: llvm-bolt-binary-analysis: 'non-existing-file': No such file or directory. + +RUN: not llvm-bolt-binary-analysis %p/Inputs/dummy.txt 2>&1 | FileCheck -check-prefix=NOELFFILEARG %s +NOELFFILEARG: llvm-bolt-binary-analysis: '{{.*}}/Inputs/dummy.txt': The file was not recognized as a valid object file. + +RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe +RUN: llvm-bolt-binary-analysis %t.exe 2>&1 | FileCheck -check-prefix=VALIDELFFILEARG --allow-empty %s +# Check that there are no BOLT-WARNING or BOLT-ERROR output lines +VALIDELFFILEARG: BOLT-INFO: +VALIDELFFILEARG-NOT: BOLT-WARNING: +VALIDELFFILEARG-NOT: BOLT-ERROR: + +# Check --help output + +RUN: llvm-bolt-binary-analysis --help 2>&1 | FileCheck -check-prefix=HELP %s + +HELP: OVERVIEW: BinaryAnalysis +HELP-EMPTY: +HELP-NEXT: USAGE: llvm-bolt-binary-analysis [options] +HELP-EMPTY: +HELP-NEXT: OPTIONS: +HELP-EMPTY: +HELP-NEXT: Generic Options: diff --git a/bolt/test/binary-analysis/AArch64/lit.local.cfg b/bolt/test/binary-analysis/AArch64/lit.local.cfg new file mode 100644 index 0000000000000..6f247dd52e82f --- /dev/null +++ b/bolt/test/binary-analysis/AArch64/lit.local.cfg @@ -0,0 +1,7 @@ +if "AArch64" not in config.root.targets: + config.unsupported = True + +flags = "--target=aarch64-linux-gnu -nostartfiles -nostdlib -ffreestanding -Wl,--emit-relocs" + +config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) +config.substitutions.insert(0, ("%cxxflags", f"%cxxflags {flags}")) diff 
--git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py index da3ae34ba3bdd..0d05229be2bf3 100644 --- a/bolt/test/lit.cfg.py +++ b/bolt/test/lit.cfg.py @@ -110,6 +110,7 @@ ), ToolSubst("llvm-boltdiff", unresolved="fatal"), ToolSubst("llvm-bolt-heatmap", unresolved="fatal"), + ToolSubst("llvm-bolt-binary-analysis", unresolved="fatal"), ToolSubst("llvm-bat-dump", unresolved="fatal"), ToolSubst("perf2bolt", unresolved="fatal"), ToolSubst("yaml2obj", unresolved="fatal"), diff --git a/bolt/tools/CMakeLists.txt b/bolt/tools/CMakeLists.txt index 22ea3b9bd805f..3383902cffc40 100644 --- a/bolt/tools/CMakeLists.txt +++ b/bolt/tools/CMakeLists.txt @@ -7,3 +7,4 @@ add_subdirectory(llvm-bolt-fuzzer) add_subdirectory(bat-dump) add_subdirectory(merge-fdata) add_subdirectory(heatmap) +add_subdirectory(binary-analysis) diff --git a/bolt/tools/binary-analysis/CMakeLists.txt b/bolt/tools/binary-analysis/CMakeLists.txt new file mode 100644 index 0000000000000..841fc5b371185 --- /dev/null +++ b/bolt/tools/binary-analysis/CMakeLists.txt @@ -0,0 +1,19 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + MC + Object + Support + ) + +add_bolt_tool(llvm-bolt-binary-analysis + binary-analysis.cpp + DISABLE_LLVM_LINK_LLVM_DYLIB + ) + +target_link_libraries(llvm-bolt-binary-analysis + PRIVATE + LLVMBOLTRewrite + LLVMBOLTUtils + ) + +add_dependencies(bolt llvm-bolt-binary-analysis) diff --git a/bolt/tools/binary-analysis/binary-analysis.cpp b/bolt/tools/binary-analysis/binary-analysis.cpp new file mode 100644 index 0000000000000..b03fee3e025ae --- /dev/null +++ b/bolt/tools/binary-analysis/binary-analysis.cpp @@ -0,0 +1,122 @@ +//===- bolt/tools/binary-analysis/binary-analysis.cpp ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a generic binary analysis tool, where multiple different specific +// binary analyses can be plugged in to. The binary analyses are mostly built +// on top of BOLT components. +// +//===----------------------------------------------------------------------===// + +#include "bolt/Rewrite/RewriteInstance.h" +#include "bolt/Utils/CommandLineOpts.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/VirtualFileSystem.h" + +#define DEBUG_TYPE "bolt" + +using namespace llvm; +using namespace object; +using namespace bolt; + +namespace opts { + +static cl::OptionCategory *BinaryAnalysisCategories[] = { + &BinaryAnalysisCategory}; + +static cl::opt InputFilename(cl::Positional, + cl::desc(""), + cl::Required, + cl::cat(BinaryAnalysisCategory), + cl::sub(cl::SubCommand::getAll())); + +} // namespace opts + +static StringRef ToolName = "llvm-bolt-binary-analysis"; + +static void report_error(StringRef Message, std::error_code EC) { + assert(EC); + errs() << ToolName << ": '" << Message << "': " << EC.message() << ".\n"; + exit(1); +} + +static void report_error(StringRef Message, Error E) { + assert(E); + errs() << ToolName << ": '" << Message << "': " << toString(std::move(E)) + << ".\n"; + exit(1); +} + +void ParseCommandLine(int argc, char **argv) { + cl::HideUnrelatedOptions(ArrayRef(opts::BinaryAnalysisCategories)); + // Register the target printer for --version. 
+ cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + + cl::ParseCommandLineOptions(argc, argv, "BinaryAnalysis\n"); +} + +static std::string GetExecutablePath(const char *Argv0) { + SmallString<256> ExecutablePath(Argv0); + // Do a PATH lookup if Argv0 isn't a valid path. + if (!llvm::sys::fs::exists(ExecutablePath)) + if (llvm::ErrorOr P = + llvm::sys::findProgramByName(ExecutablePath)) + ExecutablePath = *P; + return std::string(ExecutablePath.str()); +} + +int main(int argc, char **argv) { + // Print a stack trace if we signal out. + sys::PrintStackTraceOnErrorSignal(argv[0]); + PrettyStackTraceProgram X(argc, argv); + + std::string ToolPath = GetExecutablePath(argv[0]); + + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. + + // Initialize targets and assembly printers/parsers. + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllDisassemblers(); + + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + + ParseCommandLine(argc, argv); + + opts::BinaryAnalysisMode = true; + + if (!sys::fs::exists(opts::InputFilename)) + report_error(opts::InputFilename, errc::no_such_file_or_directory); + + Expected> BinaryOrErr = + createBinary(opts::InputFilename); + if (Error E = BinaryOrErr.takeError()) + report_error(opts::InputFilename, std::move(E)); + Binary &Binary = *BinaryOrErr.get().getBinary(); + + if (auto *e = dyn_cast(&Binary)) { + auto RIOrErr = RewriteInstance::create(e, argc, argv, ToolPath); + if (Error E = RIOrErr.takeError()) + report_error(opts::InputFilename, std::move(E)); + RewriteInstance &RI = *RIOrErr.get(); + if (Error E = RI.run()) + report_error(opts::InputFilename, std::move(E)); + } + + return EXIT_SUCCESS; +} diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6802c0c50b8f0..befa411e882b4 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -61,6 +61,8 @@ code bases. 
C/C++ Language Potentially Breaking Changes ------------------------------------------- +- Clang now rejects ``_Complex _BitInt`` types. + C++ Specific Potentially Breaking Changes ----------------------------------------- @@ -612,6 +614,8 @@ Improvements to Clang's diagnostics - Clang now diagnoses ``[[deprecated]]`` attribute usage on local variables (#GH90073). +- Fix false positives when `[[gsl::Owner/Pointer]]` and `[[clang::lifetimebound]]` are used together. + - Improved diagnostic message for ``__builtin_bit_cast`` size mismatch (#GH115870). - Clang now omits shadow warnings for enum constants in separate class scopes (#GH62588). @@ -664,6 +668,15 @@ Improvements to Clang's diagnostics bool operator==(const C&) = default; }; +- Clang now emits `-Wdangling-capture` diagnostic when an STL container captures a dangling reference. + + .. code-block:: c++ + + void test() { + std::vector views; + views.push_back(std::string("123")); // warning + } + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang-c/CXString.h b/clang/include/clang-c/CXString.h index f117010c71a46..63dce4d140ce2 100644 --- a/clang/include/clang-c/CXString.h +++ b/clang/include/clang-c/CXString.h @@ -46,6 +46,10 @@ typedef struct { /** * Retrieve the character data associated with the given string. + * + * The returned data is a reference and not owned by the user. This data + * is only valid while the `CXString` is valid. This function is similar + * to `std::string::c_str()`. */ CINDEX_LINKAGE const char *clang_getCString(CXString string); diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 8fc06328f0bce..29858f00fad74 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2166,9 +2166,27 @@ enum CXCursorKind { */ CXCursor_OpenACCLoopConstruct = 321, + /** OpenACC Combined Constructs.
+ */ CXCursor_OpenACCCombinedConstruct = 322, - CXCursor_LastStmt = CXCursor_OpenACCCombinedConstruct, + /** OpenACC data Construct. + */ + CXCursor_OpenACCDataConstruct = 323, + + /** OpenACC enter data Construct. + */ + CXCursor_OpenACCEnterDataConstruct = 324, + + /** OpenACC exit data Construct. + */ + CXCursor_OpenACCExitDataConstruct = 325, + + /** OpenACC host_data Construct. + */ + CXCursor_OpenACCHostDataConstruct = 326, + + CXCursor_LastStmt = CXCursor_OpenACCHostDataConstruct, /** * Cursor that represents the translation unit itself. diff --git a/clang/include/clang/AST/APValue.h b/clang/include/clang/AST/APValue.h index 7869ee386689d..4401f3a8ff482 100644 --- a/clang/include/clang/AST/APValue.h +++ b/clang/include/clang/AST/APValue.h @@ -157,11 +157,9 @@ class APValue { void Profile(llvm::FoldingSetNodeID &ID) const; - template - bool is() const { return Ptr.is(); } + template bool is() const { return isa(Ptr); } - template - T get() const { return Ptr.get(); } + template T get() const { return cast(Ptr); } template T dyn_cast() const { return Ptr.dyn_cast(); } diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 88d93a79d00f8..67ee0bb412692 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -3457,18 +3457,17 @@ class TypedefNameDecl : public TypeDecl, public Redeclarable { using redeclarable_base::isFirstDecl; bool isModed() const { - return MaybeModedTInfo.getPointer().is(); + return isa(MaybeModedTInfo.getPointer()); } TypeSourceInfo *getTypeSourceInfo() const { - return isModed() ? MaybeModedTInfo.getPointer().get()->first - : MaybeModedTInfo.getPointer().get(); + return isModed() ? cast(MaybeModedTInfo.getPointer())->first + : cast(MaybeModedTInfo.getPointer()); } QualType getUnderlyingType() const { - return isModed() ? MaybeModedTInfo.getPointer().get()->second - : MaybeModedTInfo.getPointer() - .get() + return isModed() ? 
cast(MaybeModedTInfo.getPointer())->second + : cast(MaybeModedTInfo.getPointer()) ->getType(); } diff --git a/clang/include/clang/AST/DeclContextInternals.h b/clang/include/clang/AST/DeclContextInternals.h index e169c48592192..b17b7627ac90c 100644 --- a/clang/include/clang/AST/DeclContextInternals.h +++ b/clang/include/clang/AST/DeclContextInternals.h @@ -70,7 +70,7 @@ class StoredDeclsList { // want to keep (if any) will be of the form DeclListNode(D, ); // replace it with just D. if (NewLast) { - DeclListNode *Node = NewLast->get(); + DeclListNode *Node = cast(*NewLast); *NewLast = Node->D; C.DeallocateDeclListNode(Node); } @@ -84,11 +84,11 @@ class StoredDeclsList { if (!Data.getPointer()) // All declarations are erased. return nullptr; - else if (NewHead.is()) + else if (isa(NewHead)) // The list only contains a declaration, the header itself. return (DeclListNode::Decls *)&Data; else { - assert(NewLast && NewLast->is() && "Not the tail?"); + assert(NewLast && isa(*NewLast) && "Not the tail?"); return NewLast; } } @@ -207,7 +207,7 @@ class StoredDeclsList { } // Append the Decls. 
- DeclListNode *Node = C.AllocateDeclListNode(Tail->get()); + DeclListNode *Node = C.AllocateDeclListNode(cast(*Tail)); Node->Rest = DeclsAsList; *Tail = Node; } @@ -293,7 +293,7 @@ class StoredDeclsList { llvm::errs() << '[' << Node->D << "] -> "; D = Node->Rest; } else { - llvm::errs() << '[' << D.get() << "]\n"; + llvm::errs() << '[' << cast(D) << "]\n"; return; } } diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 44ccf8932a183..d3a466a8617bb 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -319,8 +319,7 @@ class DefaultArgStorage { const DefaultArgStorage &Storage = Parm->getDefaultArgStorage(); if (auto *Prev = Storage.ValueOrInherited.template dyn_cast()) Parm = Prev; - assert(!Parm->getDefaultArgStorage() - .ValueOrInherited.template is() && + assert(!isa(Parm->getDefaultArgStorage().ValueOrInherited) && "should only be one level of indirection"); return Parm; } @@ -333,7 +332,7 @@ class DefaultArgStorage { /// Determine whether the default argument for this parameter was inherited /// from a previous declaration of the same entity. - bool isInherited() const { return ValueOrInherited.template is(); } + bool isInherited() const { return isa(ValueOrInherited); } /// Get the default argument's value. This does not consider whether the /// default argument is visible. @@ -343,7 +342,7 @@ class DefaultArgStorage { Storage = &Prev->getDefaultArgStorage(); if (const auto *C = Storage->ValueOrInherited.template dyn_cast()) return C->Value; - return Storage->ValueOrInherited.template get(); + return cast(Storage->ValueOrInherited); } /// Get the parameter from which we inherit the default argument, if any. 
@@ -379,7 +378,7 @@ class DefaultArgStorage { Inherited->PrevDeclWithDefaultArg = InheritedFrom; } else ValueOrInherited = new (allocateDefaultArgStorageChain(C)) - Chain{InheritedFrom, ValueOrInherited.template get()}; + Chain{InheritedFrom, cast(ValueOrInherited)}; } /// Remove the default argument, even if it was inherited. @@ -1992,7 +1991,7 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// template arguments have been deduced. void setInstantiationOf(ClassTemplatePartialSpecializationDecl *PartialSpec, const TemplateArgumentList *TemplateArgs) { - assert(!SpecializedTemplate.is() && + assert(!isa(SpecializedTemplate) && "Already set to a class template partial specialization!"); auto *PS = new (getASTContext()) SpecializedPartialSpecialization(); PS->PartialSpecialization = PartialSpec; @@ -2003,7 +2002,7 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// Note that this class template specialization is an instantiation /// of the given class template. void setInstantiationOf(ClassTemplateDecl *TemplDecl) { - assert(!SpecializedTemplate.is() && + assert(!isa(SpecializedTemplate) && "Previously set to a class template partial specialization!"); SpecializedTemplate = TemplDecl; } @@ -2761,7 +2760,7 @@ class VarTemplateSpecializationDecl : public VarDecl, /// template arguments have been deduced. void setInstantiationOf(VarTemplatePartialSpecializationDecl *PartialSpec, const TemplateArgumentList *TemplateArgs) { - assert(!SpecializedTemplate.is() && + assert(!isa(SpecializedTemplate) && "Already set to a variable template partial specialization!"); auto *PS = new (getASTContext()) SpecializedPartialSpecialization(); PS->PartialSpecialization = PartialSpec; @@ -2772,7 +2771,7 @@ class VarTemplateSpecializationDecl : public VarDecl, /// Note that this variable template specialization is an instantiation /// of the given variable template. 
void setInstantiationOf(VarTemplateDecl *TemplDecl) { - assert(!SpecializedTemplate.is() && + assert(!isa(SpecializedTemplate) && "Previously set to a variable template partial specialization!"); SpecializedTemplate = TemplDecl; } diff --git a/clang/include/clang/AST/ExprConcepts.h b/clang/include/clang/AST/ExprConcepts.h index 86c4155b6a853..f988d40cf73c3 100644 --- a/clang/include/clang/AST/ExprConcepts.h +++ b/clang/include/clang/AST/ExprConcepts.h @@ -329,24 +329,24 @@ class ExprRequirement : public Requirement { bool isSubstitutionFailure() const { return !isEmpty() && - TypeConstraintInfo.getPointer().is(); + isa(TypeConstraintInfo.getPointer()); } bool isTypeConstraint() const { return !isEmpty() && - TypeConstraintInfo.getPointer().is(); + isa(TypeConstraintInfo.getPointer()); } SubstitutionDiagnostic *getSubstitutionDiagnostic() const { assert(isSubstitutionFailure()); - return TypeConstraintInfo.getPointer().get(); + return cast(TypeConstraintInfo.getPointer()); } const TypeConstraint *getTypeConstraint() const; TemplateParameterList *getTypeConstraintTemplateParameterList() const { assert(isTypeConstraint()); - return TypeConstraintInfo.getPointer().get(); + return cast(TypeConstraintInfo.getPointer()); } }; private: diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h index 9f968ba05b446..4d7ff822fceb7 100644 --- a/clang/include/clang/AST/ExternalASTSource.h +++ b/clang/include/clang/AST/ExternalASTSource.h @@ -462,9 +462,7 @@ struct LazyGenerationalUpdatePtr { : Value(Value) {} /// Forcibly set this pointer (which must be lazy) as needing updates. - void markIncomplete() { - Value.template get()->LastGeneration = 0; - } + void markIncomplete() { cast(Value)->LastGeneration = 0; } /// Set the value of this pointer, in the current generation. 
void set(T NewValue) { @@ -487,14 +485,14 @@ struct LazyGenerationalUpdatePtr { } return LazyVal->LastValue; } - return Value.template get(); + return cast(Value); } /// Get the most recently computed value of this pointer without updating it. T getNotUpdated() const { if (auto *LazyVal = Value.template dyn_cast()) return LazyVal->LastValue; - return Value.template get(); + return cast(Value); } void *getOpaqueValue() { return Value.getOpaqueValue(); } diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 76b598a5db238..5d5c91ff91d55 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2151,8 +2151,11 @@ DEF_TRAVERSE_DECL(DecompositionDecl, { }) DEF_TRAVERSE_DECL(BindingDecl, { - if (getDerived().shouldVisitImplicitCode()) + if (getDerived().shouldVisitImplicitCode()) { TRY_TO(TraverseStmt(D->getBinding())); + if (const auto HoldingVar = D->getHoldingVar()) + TRY_TO(TraverseDecl(HoldingVar)); + } }) DEF_TRAVERSE_DECL(MSPropertyDecl, { TRY_TO(TraverseDeclaratorHelper(D)); }) @@ -4058,6 +4061,12 @@ DEF_TRAVERSE_STMT(OpenACCLoopConstruct, { TRY_TO(TraverseOpenACCAssociatedStmtConstruct(S)); }) DEF_TRAVERSE_STMT(OpenACCCombinedConstruct, { TRY_TO(TraverseOpenACCAssociatedStmtConstruct(S)); }) +DEF_TRAVERSE_STMT(OpenACCDataConstruct, + { TRY_TO(TraverseOpenACCAssociatedStmtConstruct(S)); }) +DEF_TRAVERSE_STMT(OpenACCEnterDataConstruct, {}) +DEF_TRAVERSE_STMT(OpenACCExitDataConstruct, {}) +DEF_TRAVERSE_STMT(OpenACCHostDataConstruct, + { TRY_TO(TraverseOpenACCAssociatedStmtConstruct(S)); }) // Traverse HLSL: Out argument expression DEF_TRAVERSE_STMT(HLSLOutArgExpr, {}) diff --git a/clang/include/clang/AST/Redeclarable.h b/clang/include/clang/AST/Redeclarable.h index bba789375cb6e..ee21f11e5f707 100644 --- a/clang/include/clang/AST/Redeclarable.h +++ b/clang/include/clang/AST/Redeclarable.h @@ -116,7 +116,7 @@ class Redeclarable { return isa(Link) || // 
FIXME: 'template' is required on the next line due to an // apparent clang bug. - cast(Link).template is(); + isa(cast(Link)); } decl_type *getPrevious(const decl_type *D) const { diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index fa8793e740822..df73980822c7b 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -292,5 +292,175 @@ class OpenACCCombinedConstruct final return const_cast(this)->getLoop(); } }; + +// This class represents a 'data' construct, which has an associated statement +// and clauses, but is otherwise pretty simple. +class OpenACCDataConstruct final + : public OpenACCAssociatedStmtConstruct, + public llvm::TrailingObjects { + OpenACCDataConstruct(unsigned NumClauses) + : OpenACCAssociatedStmtConstruct( + OpenACCDataConstructClass, OpenACCDirectiveKind::Data, + SourceLocation{}, SourceLocation{}, SourceLocation{}, + /*AssociatedStmt=*/nullptr) { + std::uninitialized_value_construct( + getTrailingObjects(), + getTrailingObjects() + NumClauses); + setClauseList(MutableArrayRef(getTrailingObjects(), + NumClauses)); + } + + OpenACCDataConstruct(SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, + ArrayRef Clauses, + Stmt *StructuredBlock) + : OpenACCAssociatedStmtConstruct(OpenACCDataConstructClass, + OpenACCDirectiveKind::Data, Start, + DirectiveLoc, End, StructuredBlock) { + std::uninitialized_copy(Clauses.begin(), Clauses.end(), + getTrailingObjects()); + setClauseList(MutableArrayRef(getTrailingObjects(), + Clauses.size())); + } + void setStructuredBlock(Stmt *S) { setAssociatedStmt(S); } + +public: + static bool classof(const Stmt *T) { + return T->getStmtClass() == OpenACCDataConstructClass; + } + + static OpenACCDataConstruct *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + static OpenACCDataConstruct *Create(const ASTContext &C, SourceLocation Start, + SourceLocation DirectiveLoc, + SourceLocation End, + ArrayRef Clauses, 
+ Stmt *StructuredBlock); + Stmt *getStructuredBlock() { return getAssociatedStmt(); } + const Stmt *getStructuredBlock() const { + return const_cast(this)->getStructuredBlock(); + } +}; +// This class represents an 'enter data' construct, which JUST has clauses. +class OpenACCEnterDataConstruct final + : public OpenACCConstructStmt, + public llvm::TrailingObjects { + OpenACCEnterDataConstruct(unsigned NumClauses) + : OpenACCConstructStmt(OpenACCEnterDataConstructClass, + OpenACCDirectiveKind::EnterData, SourceLocation{}, + SourceLocation{}, SourceLocation{}) { + std::uninitialized_value_construct( + getTrailingObjects(), + getTrailingObjects() + NumClauses); + setClauseList(MutableArrayRef(getTrailingObjects(), + NumClauses)); + } + OpenACCEnterDataConstruct(SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, + ArrayRef Clauses) + : OpenACCConstructStmt(OpenACCEnterDataConstructClass, + OpenACCDirectiveKind::EnterData, Start, + DirectiveLoc, End) { + std::uninitialized_copy(Clauses.begin(), Clauses.end(), + getTrailingObjects()); + setClauseList(MutableArrayRef(getTrailingObjects(), + Clauses.size())); + } + +public: + static bool classof(const Stmt *T) { + return T->getStmtClass() == OpenACCEnterDataConstructClass; + } + static OpenACCEnterDataConstruct *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + static OpenACCEnterDataConstruct * + Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, ArrayRef Clauses); +}; +// This class represents an 'exit data' construct, which JUST has clauses.
+class OpenACCExitDataConstruct final + : public OpenACCConstructStmt, + public llvm::TrailingObjects { + OpenACCExitDataConstruct(unsigned NumClauses) + : OpenACCConstructStmt(OpenACCExitDataConstructClass, + OpenACCDirectiveKind::ExitData, SourceLocation{}, + SourceLocation{}, SourceLocation{}) { + std::uninitialized_value_construct( + getTrailingObjects(), + getTrailingObjects() + NumClauses); + setClauseList(MutableArrayRef(getTrailingObjects(), + NumClauses)); + } + OpenACCExitDataConstruct(SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, + ArrayRef Clauses) + : OpenACCConstructStmt(OpenACCExitDataConstructClass, + OpenACCDirectiveKind::ExitData, Start, + DirectiveLoc, End) { + std::uninitialized_copy(Clauses.begin(), Clauses.end(), + getTrailingObjects()); + setClauseList(MutableArrayRef(getTrailingObjects(), + Clauses.size())); + } + +public: + static bool classof(const Stmt *T) { + return T->getStmtClass() == OpenACCExitDataConstructClass; + } + static OpenACCExitDataConstruct *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + static OpenACCExitDataConstruct * + Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, ArrayRef Clauses); +}; +// This class represents a 'host_data' construct, which has an associated +// statement and clauses, but is otherwise pretty simple. 
+class OpenACCHostDataConstruct final + : public OpenACCAssociatedStmtConstruct, + public llvm::TrailingObjects { + OpenACCHostDataConstruct(unsigned NumClauses) + : OpenACCAssociatedStmtConstruct( + OpenACCHostDataConstructClass, OpenACCDirectiveKind::HostData, + SourceLocation{}, SourceLocation{}, SourceLocation{}, + /*AssociatedStmt=*/nullptr) { + std::uninitialized_value_construct( + getTrailingObjects(), + getTrailingObjects() + NumClauses); + setClauseList(MutableArrayRef(getTrailingObjects(), + NumClauses)); + } + OpenACCHostDataConstruct(SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, + ArrayRef Clauses, + Stmt *StructuredBlock) + : OpenACCAssociatedStmtConstruct(OpenACCHostDataConstructClass, + OpenACCDirectiveKind::HostData, Start, + DirectiveLoc, End, StructuredBlock) { + std::uninitialized_copy(Clauses.begin(), Clauses.end(), + getTrailingObjects()); + setClauseList(MutableArrayRef(getTrailingObjects(), + Clauses.size())); + } + void setStructuredBlock(Stmt *S) { setAssociatedStmt(S); } + +public: + static bool classof(const Stmt *T) { + return T->getStmtClass() == OpenACCHostDataConstructClass; + } + static OpenACCHostDataConstruct *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + static OpenACCHostDataConstruct * + Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, ArrayRef Clauses, + Stmt *StructuredBlock); + Stmt *getStructuredBlock() { return getAssociatedStmt(); } + const Stmt *getStructuredBlock() const { + return const_cast(this)->getStructuredBlock(); + } +}; } // namespace clang #endif // LLVM_CLANG_AST_STMTOPENACC_H diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index 988b142a7672a..e54e7e527b8a3 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -411,6 +411,10 @@ class TextNodeDumper void VisitOpenACCConstructStmt(const OpenACCConstructStmt *S); void 
VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S); void VisitOpenACCCombinedConstruct(const OpenACCCombinedConstruct *S); + void VisitOpenACCDataConstruct(const OpenACCDataConstruct *S); + void VisitOpenACCEnterDataConstruct(const OpenACCEnterDataConstruct *S); + void VisitOpenACCExitDataConstruct(const OpenACCExitDataConstruct *S); + void VisitOpenACCHostDataConstruct(const OpenACCHostDataConstruct *S); void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S); void VisitEmbedExpr(const EmbedExpr *S); void VisitAtomicExpr(const AtomicExpr *AE); diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index f0eee77c73ef0..09c98f642852f 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -934,11 +934,11 @@ class QualType { Qualifiers::FastWidth> Value; const ExtQuals *getExtQualsUnsafe() const { - return Value.getPointer().get(); + return cast(Value.getPointer()); } const Type *getTypePtrUnsafe() const { - return Value.getPointer().get(); + return cast(Value.getPointer()); } const ExtQualsTypeCommonBase *getCommonPtr() const { @@ -1064,7 +1064,7 @@ class QualType { /// "non-fast" qualifiers, e.g., those that are stored in an ExtQualType /// instance. 
bool hasLocalNonFastQualifiers() const { - return Value.getPointer().is(); + return isa(Value.getPointer()); } /// Retrieve the set of qualifiers local to this particular QualType diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0a245e2077f68..811265151fa0d 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10237,10 +10237,10 @@ def warn_dangling_pointer_assignment : Warning< InGroup; def warn_dangling_reference_captured : Warning< "object whose reference is captured by '%0' will be destroyed at the end of " - "the full-expression">, InGroup, DefaultIgnore; + "the full-expression">, InGroup; def warn_dangling_reference_captured_by_unknown : Warning< "object whose reference is captured will be destroyed at the end of " - "the full-expression">, InGroup, DefaultIgnore; + "the full-expression">, InGroup; // For non-floating point, expressions of the form x == x or x != x // should result in a warning, since these always evaluate to a constant. diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h index ea0bf23468cb8..7fb76271826a6 100644 --- a/clang/include/clang/Basic/OpenACCKinds.h +++ b/clang/include/clang/Basic/OpenACCKinds.h @@ -158,6 +158,14 @@ inline bool isOpenACCCombinedDirectiveKind(OpenACCDirectiveKind K) { K == OpenACCDirectiveKind::KernelsLoop; } +// Tests 'K' to see if it is 'data', 'host_data', 'enter data', or 'exit data'. 
+inline bool isOpenACCDataDirectiveKind(OpenACCDirectiveKind K) { + return K == OpenACCDirectiveKind::Data || + K == OpenACCDirectiveKind::EnterData || + K == OpenACCDirectiveKind::ExitData || + K == OpenACCDirectiveKind::HostData; +} + enum class OpenACCAtomicKind : uint8_t { Read, Write, diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index 89f5a76eb1131..0c3c580c218fd 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -308,6 +308,10 @@ def OpenACCAssociatedStmtConstruct def OpenACCComputeConstruct : StmtNode; def OpenACCLoopConstruct : StmtNode; def OpenACCCombinedConstruct : StmtNode; +def OpenACCDataConstruct : StmtNode; +def OpenACCEnterDataConstruct : StmtNode; +def OpenACCExitDataConstruct : StmtNode; +def OpenACCHostDataConstruct : StmtNode; // OpenACC Additional Expressions. def OpenACCAsteriskSizeExpr : StmtNode; diff --git a/clang/include/clang/Lex/PreprocessingRecord.h b/clang/include/clang/Lex/PreprocessingRecord.h index 437d8e4cc174e..7886aef7f0c7f 100644 --- a/clang/include/clang/Lex/PreprocessingRecord.h +++ b/clang/include/clang/Lex/PreprocessingRecord.h @@ -180,13 +180,13 @@ class Token; } /// True if it is a builtin macro. - bool isBuiltinMacro() const { return NameOrDef.is(); } + bool isBuiltinMacro() const { return isa(NameOrDef); } /// The name of the macro being expanded. const IdentifierInfo *getName() const { if (MacroDefinitionRecord *Def = getDefinition()) return Def->getName(); - return NameOrDef.get(); + return cast(NameOrDef); } /// The definition of the macro being expanded. 
May return null if diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 3312d4ed1d798..3d223c345ea15 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -859,7 +859,7 @@ class Preprocessor { auto *Info = State.dyn_cast(); if (!Info) { Info = new (PP.getPreprocessorAllocator()) - ModuleMacroInfo(State.get()); + ModuleMacroInfo(cast(State)); State = Info; } @@ -892,7 +892,7 @@ class Preprocessor { MacroDirective *getLatest() const { if (auto *Info = State.dyn_cast()) return Info->MD; - return State.get(); + return cast(State); } void setLatest(MacroDirective *MD) { @@ -945,7 +945,7 @@ class Preprocessor { if (Overrides.empty()) return; Info = new (PP.getPreprocessorAllocator()) - ModuleMacroInfo(State.get()); + ModuleMacroInfo(cast(State)); State = Info; } Info->OverriddenMacros.clear(); diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index af0e08d800bf2..2be9ade08cac3 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -2019,6 +2019,10 @@ enum StmtCode { STMT_OPENACC_LOOP_CONSTRUCT, STMT_OPENACC_COMBINED_CONSTRUCT, EXPR_OPENACC_ASTERISK_SIZE, + STMT_OPENACC_DATA_CONSTRUCT, + STMT_OPENACC_ENTER_DATA_CONSTRUCT, + STMT_OPENACC_EXIT_DATA_CONSTRUCT, + STMT_OPENACC_HOST_DATA_CONSTRUCT, // HLSL Constructs EXPR_HLSL_OUT_ARG, diff --git a/clang/lib/AST/ByteCode/BitcastBuffer.h b/clang/lib/AST/ByteCode/BitcastBuffer.h index b1b6b9e5173a7..d1d6ee39ad17b 100644 --- a/clang/lib/AST/ByteCode/BitcastBuffer.h +++ b/clang/lib/AST/ByteCode/BitcastBuffer.h @@ -18,6 +18,8 @@ namespace interp { enum class Endian { Little, Big }; +struct Bytes; + /// A quantity in bits. 
struct Bits { size_t N = 0; @@ -30,6 +32,7 @@ struct Bits { bool isFullByte() const { return N % 8 == 0; } bool nonZero() const { return N != 0; } bool isZero() const { return N == 0; } + Bytes toBytes() const; Bits operator-(Bits Other) const { return Bits(N - Other.N); } Bits operator+(Bits Other) const { return Bits(N + Other.N); } @@ -56,6 +59,11 @@ struct Bytes { Bits toBits() const { return Bits(N * 8); } }; +inline Bytes Bits::toBytes() const { + assert(isFullByte()); + return Bytes(N / 8); +} + /// A bit range. Both Start and End are inclusive. struct BitRange { Bits Start; @@ -83,6 +91,7 @@ struct BitcastBuffer { /// Returns the buffer size in bits. Bits size() const { return FinalBitSize; } + Bytes byteSize() const { return FinalBitSize.toBytes(); } /// Returns \c true if all bits in the buffer have been initialized. bool allInitialized() const; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 57175da32b31c..21baedf832eea 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1830,6 +1830,7 @@ static bool interp__builtin_elementwise_popcount(InterpState &S, CodePtr OpPC, return true; } + static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { @@ -1900,6 +1901,87 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, return true; } +/// Determine if T is a character type for which we guarantee that +/// sizeof(T) == 1. 
+static bool isOneByteCharacterType(QualType T) { + return T->isCharType() || T->isChar8Type(); +} + +static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, const CallExpr *Call) { + assert(Call->getNumArgs() == 3); + unsigned ID = Func->getBuiltinID(); + const Pointer &PtrA = getParam(Frame, 0); + const Pointer &PtrB = getParam(Frame, 1); + const APSInt &Size = + peekToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(2))); + + if (ID == Builtin::BImemcmp || ID == Builtin::BIbcmp) + diagnoseNonConstexprBuiltin(S, OpPC, ID); + + if (Size.isZero()) { + pushInteger(S, 0, Call->getType()); + return true; + } + + // FIXME: This is an arbitrary limitation the current constant interpreter + // had. We could remove this. + if (!isOneByteCharacterType(PtrA.getType()) || + !isOneByteCharacterType(PtrB.getType())) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_memcmp_unsupported) + << ("'" + S.getASTContext().BuiltinInfo.getName(ID) + "'").str() + << PtrA.getType() << PtrB.getType(); + return false; + } + + if (PtrA.isDummy() || PtrB.isDummy()) + return false; + + // Now, read both pointers to a buffer and compare those. 
+ BitcastBuffer BufferA( + Bits(S.getASTContext().getTypeSize(PtrA.getFieldDesc()->getType()))); + readPointerToBuffer(S.getContext(), PtrA, BufferA, false); + + BitcastBuffer BufferB( + Bits(S.getASTContext().getTypeSize(PtrB.getFieldDesc()->getType()))); + readPointerToBuffer(S.getContext(), PtrB, BufferB, false); + + size_t MinBufferSize = std::min(BufferA.byteSize().getQuantity(), + BufferB.byteSize().getQuantity()); + size_t CmpSize = + std::min(MinBufferSize, static_cast(Size.getZExtValue())); + + for (size_t I = 0; I != CmpSize; ++I) { + std::byte A = BufferA.Data[I]; + std::byte B = BufferB.Data[I]; + + if (A < B) { + pushInteger(S, -1, Call->getType()); + return true; + } else if (A > B) { + pushInteger(S, 1, Call->getType()); + return true; + } + } + + // We compared CmpSize bytes above. If the limiting factor was the Size + // passed, we're done and the result is equality (0). + if (Size.getZExtValue() <= CmpSize) { + pushInteger(S, 0, Call->getType()); + return true; + } + + // However, if we read all the available bytes but were instructed to read + // even more, diagnose this as a "read of dereferenced one-past-the-end + // pointer". This is what would happen if we called CheckRead() on every array + // element. 
+ S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_access_past_end) + << AK_Read << S.Current->getRange(OpPC); + return false; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *Call, uint32_t BuiltinID) { const InterpFrame *Frame = S.Current; @@ -2373,6 +2455,14 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__builtin_memcmp: + case Builtin::BImemcmp: + case Builtin::BI__builtin_bcmp: + case Builtin::BIbcmp: + if (!interp__builtin_memcmp(S, OpPC, Frame, F, Call)) + return false; + break; + default: S.FFDiag(S.Current->getLocation(OpPC), diag::note_invalid_subexpr_in_const_expr) diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp index c9141c0fad2f5..c87993b8739a7 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp @@ -259,8 +259,10 @@ static bool CheckBitcastType(InterpState &S, CodePtr OpPC, QualType T, return true; } -static bool readPointerToBuffer(const Context &Ctx, const Pointer &FromPtr, - BitcastBuffer &Buffer, bool ReturnOnUninit) { +bool clang::interp::readPointerToBuffer(const Context &Ctx, + const Pointer &FromPtr, + BitcastBuffer &Buffer, + bool ReturnOnUninit) { const ASTContext &ASTCtx = Ctx.getASTContext(); Endian TargetEndianness = ASTCtx.getTargetInfo().isLittleEndian() ? 
Endian::Little : Endian::Big; diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.h b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.h index 92e6ffc79fc4f..08c207c7415df 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.h +++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_AST_INTERP_BUILITN_BIT_CAST_H -#define LLVM_CLANG_AST_INTERP_BUILITN_BIT_CAST_H +#ifndef LLVM_CLANG_AST_INTERP_BUILTIN_BIT_CAST_H +#define LLVM_CLANG_AST_INTERP_BUILTIN_BIT_CAST_H #include "BitcastBuffer.h" #include @@ -17,6 +17,7 @@ namespace interp { class Pointer; class InterpState; class CodePtr; +class Context; bool DoBitCast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, std::byte *Buff, Bits BitWidth, Bits FullBitWidth, @@ -25,7 +26,8 @@ bool DoBitCastPtr(InterpState &S, CodePtr OpPC, const Pointer &FromPtr, Pointer &ToPtr); bool DoBitCastPtr(InterpState &S, CodePtr OpPC, const Pointer &FromPtr, Pointer &ToPtr, size_t Size); - +bool readPointerToBuffer(const Context &Ctx, const Pointer &FromPtr, + BitcastBuffer &Buffer, bool ReturnOnUninit); } // namespace interp } // namespace clang diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index 23dd57d235813..fb73dfb3fa9de 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -110,3 +110,89 @@ OpenACCCombinedConstruct *OpenACCCombinedConstruct::Create( OpenACCCombinedConstruct(DK, BeginLoc, DirLoc, EndLoc, Clauses, Loop); return Inst; } + +OpenACCDataConstruct *OpenACCDataConstruct::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + void *Mem = + C.Allocate(OpenACCDataConstruct::totalSizeToAlloc( + NumClauses)); + auto *Inst = new (Mem) OpenACCDataConstruct(NumClauses); + return Inst; +} + +OpenACCDataConstruct * +OpenACCDataConstruct::Create(const ASTContext &C, SourceLocation Start, + SourceLocation DirectiveLoc, SourceLocation End, + ArrayRef Clauses, 
+ Stmt *StructuredBlock) { + void *Mem = + C.Allocate(OpenACCDataConstruct::totalSizeToAlloc( + Clauses.size())); + auto *Inst = new (Mem) + OpenACCDataConstruct(Start, DirectiveLoc, End, Clauses, StructuredBlock); + return Inst; +} + +OpenACCEnterDataConstruct * +OpenACCEnterDataConstruct::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + void *Mem = C.Allocate( + OpenACCEnterDataConstruct::totalSizeToAlloc( + NumClauses)); + auto *Inst = new (Mem) OpenACCEnterDataConstruct(NumClauses); + return Inst; +} + +OpenACCEnterDataConstruct *OpenACCEnterDataConstruct::Create( + const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, ArrayRef Clauses) { + void *Mem = C.Allocate( + OpenACCEnterDataConstruct::totalSizeToAlloc( + Clauses.size())); + auto *Inst = + new (Mem) OpenACCEnterDataConstruct(Start, DirectiveLoc, End, Clauses); + return Inst; +} + +OpenACCExitDataConstruct * +OpenACCExitDataConstruct::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + void *Mem = C.Allocate( + OpenACCExitDataConstruct::totalSizeToAlloc( + NumClauses)); + auto *Inst = new (Mem) OpenACCExitDataConstruct(NumClauses); + return Inst; +} + +OpenACCExitDataConstruct *OpenACCExitDataConstruct::Create( + const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, ArrayRef Clauses) { + void *Mem = C.Allocate( + OpenACCExitDataConstruct::totalSizeToAlloc( + Clauses.size())); + auto *Inst = + new (Mem) OpenACCExitDataConstruct(Start, DirectiveLoc, End, Clauses); + return Inst; +} + +OpenACCHostDataConstruct * +OpenACCHostDataConstruct::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + void *Mem = C.Allocate( + OpenACCHostDataConstruct::totalSizeToAlloc( + NumClauses)); + auto *Inst = new (Mem) OpenACCHostDataConstruct(NumClauses); + return Inst; +} + +OpenACCHostDataConstruct *OpenACCHostDataConstruct::Create( + const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + 
SourceLocation End, ArrayRef Clauses, + Stmt *StructuredBlock) { + void *Mem = C.Allocate( + OpenACCHostDataConstruct::totalSizeToAlloc( + Clauses.size())); + auto *Inst = new (Mem) OpenACCHostDataConstruct(Start, DirectiveLoc, End, + Clauses, StructuredBlock); + return Inst; +} diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 7507c9d14327a..488419add5e79 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -1193,6 +1193,51 @@ void StmtPrinter::VisitOpenACCCombinedConstruct(OpenACCCombinedConstruct *S) { PrintStmt(S->getLoop()); } +void StmtPrinter::VisitOpenACCDataConstruct(OpenACCDataConstruct *S) { + Indent() << "#pragma acc data"; + + if (!S->clauses().empty()) { + OS << ' '; + OpenACCClausePrinter Printer(OS, Policy); + Printer.VisitClauseList(S->clauses()); + } + OS << '\n'; + + PrintStmt(S->getStructuredBlock()); +} +void StmtPrinter::VisitOpenACCEnterDataConstruct(OpenACCEnterDataConstruct *S) { + Indent() << "#pragma acc enter data"; + + if (!S->clauses().empty()) { + OS << ' '; + OpenACCClausePrinter Printer(OS, Policy); + Printer.VisitClauseList(S->clauses()); + } + OS << '\n'; +} +void StmtPrinter::VisitOpenACCExitDataConstruct(OpenACCExitDataConstruct *S) { + Indent() << "#pragma acc exit data"; + + if (!S->clauses().empty()) { + OS << ' '; + OpenACCClausePrinter Printer(OS, Policy); + Printer.VisitClauseList(S->clauses()); + } + OS << '\n'; +} +void StmtPrinter::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) { + Indent() << "#pragma acc host_data"; + + if (!S->clauses().empty()) { + OS << ' '; + OpenACCClausePrinter Printer(OS, Policy); + Printer.VisitClauseList(S->clauses()); + } + OS << '\n'; + + PrintStmt(S->getStructuredBlock()); +} + //===----------------------------------------------------------------------===// // Expr printing methods. 
//===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 3dfbef1cdb712..e9ff674097c8f 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2697,6 +2697,37 @@ void StmtProfiler::VisitOpenACCCombinedConstruct( P.VisitOpenACCClauseList(S->clauses()); } +void StmtProfiler::VisitOpenACCDataConstruct(const OpenACCDataConstruct *S) { + VisitStmt(S); + + OpenACCClauseProfiler P{*this}; + P.VisitOpenACCClauseList(S->clauses()); +} + +void StmtProfiler::VisitOpenACCEnterDataConstruct( + const OpenACCEnterDataConstruct *S) { + VisitStmt(S); + + OpenACCClauseProfiler P{*this}; + P.VisitOpenACCClauseList(S->clauses()); +} + +void StmtProfiler::VisitOpenACCExitDataConstruct( + const OpenACCExitDataConstruct *S) { + VisitStmt(S); + + OpenACCClauseProfiler P{*this}; + P.VisitOpenACCClauseList(S->clauses()); +} + +void StmtProfiler::VisitOpenACCHostDataConstruct( + const OpenACCHostDataConstruct *S) { + VisitStmt(S); + + OpenACCClauseProfiler P{*this}; + P.VisitOpenACCClauseList(S->clauses()); +} + void StmtProfiler::VisitHLSLOutArgExpr(const HLSLOutArgExpr *S) { VisitStmt(S); } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 2552c11a39532..209ad3a5f10ac 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -2936,6 +2936,25 @@ void TextNodeDumper::VisitOpenACCCombinedConstruct( OS << " " << S->getDirectiveKind(); } +void TextNodeDumper::VisitOpenACCDataConstruct(const OpenACCDataConstruct *S) { + OS << " " << S->getDirectiveKind(); +} + +void TextNodeDumper::VisitOpenACCEnterDataConstruct( + const OpenACCEnterDataConstruct *S) { + OS << " " << S->getDirectiveKind(); +} + +void TextNodeDumper::VisitOpenACCExitDataConstruct( + const OpenACCExitDataConstruct *S) { + OS << " " << S->getDirectiveKind(); +} + +void TextNodeDumper::VisitOpenACCHostDataConstruct( + const 
OpenACCHostDataConstruct *S) { + OS << " " << S->getDirectiveKind(); +} + void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) { AddChild("begin", [=] { OS << S->getStartingElementPos(); }); AddChild("number of elements", [=] { OS << S->getDataElementCount(); }); diff --git a/clang/lib/Analysis/PathDiagnostic.cpp b/clang/lib/Analysis/PathDiagnostic.cpp index 35472e705cfd8..5b14d138b6e28 100644 --- a/clang/lib/Analysis/PathDiagnostic.cpp +++ b/clang/lib/Analysis/PathDiagnostic.cpp @@ -484,10 +484,10 @@ SourceLocation PathDiagnosticLocation::getValidSourceLocation( // source code, so find an enclosing statement and use its location. if (!L.isValid()) { AnalysisDeclContext *ADC; - if (LAC.is()) - ADC = LAC.get()->getAnalysisDeclContext(); + if (auto *LC = dyn_cast(LAC)) + ADC = LC->getAnalysisDeclContext(); else - ADC = LAC.get(); + ADC = cast(LAC); ParentMap &PM = ADC->getParentMap(); diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 40f529e52b44a..a9aff39df6474 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -439,36 +439,25 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { // already duplicated // - call both from Sema and from here - const auto *BaseDRE = - dyn_cast(Node.getBase()->IgnoreParenImpCasts()); - const auto *SLiteral = - dyn_cast(Node.getBase()->IgnoreParenImpCasts()); - uint64_t size; - - if (!BaseDRE && !SLiteral) + uint64_t limit; + if (const auto *CATy = + dyn_cast(Node.getBase() + ->IgnoreParenImpCasts() + ->getType() + ->getUnqualifiedDesugaredType())) { + limit = CATy->getLimitedSize(); + } else if (const auto *SLiteral = dyn_cast( + Node.getBase()->IgnoreParenImpCasts())) { + limit = SLiteral->getLength() + 1; + } else { return false; - - if (BaseDRE) { - if (!BaseDRE->getDecl()) - return false; - const auto *CATy = Finder->getASTContext().getAsConstantArrayType( - BaseDRE->getDecl()->getType()); - if (!CATy) { - return false; 
- } - size = CATy->getLimitedSize(); - } else if (SLiteral) { - size = SLiteral->getLength() + 1; } if (const auto *IdxLit = dyn_cast(Node.getIdx())) { const APInt ArrIdx = IdxLit->getValue(); - // FIXME: ArrIdx.isNegative() we could immediately emit an error as that's a - // bug - if (ArrIdx.isNonNegative() && ArrIdx.getLimitedValue() < size) + if (ArrIdx.isNonNegative() && ArrIdx.getLimitedValue() < limit) return true; } - return false; } diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 2876c290a26b1..f44b5e4c4b638 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -398,7 +398,7 @@ FileEntryRef FileManager::getVirtualFileRef(StringRef Filename, off_t Size, {Filename, std::errc::no_such_file_or_directory}).first; if (NamedFileEnt.second) { FileEntryRef::MapValue Value = *NamedFileEnt.second; - if (LLVM_LIKELY(Value.V.is())) + if (LLVM_LIKELY(isa(Value.V))) return FileEntryRef(NamedFileEnt); return FileEntryRef(*Value.V.get()); } diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 849c18f171f6e..44e982d3ee67f 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -592,8 +592,7 @@ bool needConversion(StringRef Filename) { #ifdef __MVS__ llvm::ErrorOr NeedConversion = llvm::needzOSConversion(Filename.str().c_str()); - assert(NeedConversion && "Filename was not found"); - return *NeedConversion; + return NeedConversion && *NeedConversion; #else return false; #endif diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 698baf853507f..6c7a594fb10c4 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -458,6 +458,18 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OpenACCCombinedConstructClass: EmitOpenACCCombinedConstruct(cast(*S)); break; + case Stmt::OpenACCDataConstructClass: + EmitOpenACCDataConstruct(cast(*S)); + break; + case 
Stmt::OpenACCEnterDataConstructClass: + EmitOpenACCEnterDataConstruct(cast(*S)); + break; + case Stmt::OpenACCExitDataConstructClass: + EmitOpenACCExitDataConstruct(cast(*S)); + break; + case Stmt::OpenACCHostDataConstructClass: + EmitOpenACCHostDataConstruct(cast(*S)); + break; } } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index eaea0d8a08ac0..092d55355a0a1 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4094,6 +4094,30 @@ class CodeGenFunction : public CodeGenTypeCache { EmitStmt(S.getLoop()); } + void EmitOpenACCDataConstruct(const OpenACCDataConstruct &S) { + // TODO OpenACC: Implement this. It is currently implemented as a 'no-op', + // simply emitting its structured block, but in the future we will implement + // some sort of IR. + EmitStmt(S.getStructuredBlock()); + } + + void EmitOpenACCEnterDataConstruct(const OpenACCEnterDataConstruct &S) { + // TODO OpenACC: Implement this. It is currently implemented as a 'no-op', + // but in the future we will implement some sort of IR. + } + + void EmitOpenACCExitDataConstruct(const OpenACCExitDataConstruct &S) { + // TODO OpenACC: Implement this. It is currently implemented as a 'no-op', + // but in the future we will implement some sort of IR. + } + + void EmitOpenACCHostDataConstruct(const OpenACCHostDataConstruct &S) { + // TODO OpenACC: Implement this. It is currently implemented as a 'no-op', + // simply emitting its structured block, but in the future we will implement + // some sort of IR. 
+ EmitStmt(S.getStructuredBlock()); + } + //===--------------------------------------------------------------------===// // LValue Expression Emission //===--------------------------------------------------------------------===// diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index fb73b62cf2dae..dc84c1b9d1cc4 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1165,6 +1165,34 @@ bool Driver::loadConfigFiles() { return false; } +static bool findTripleConfigFile(llvm::cl::ExpansionContext &ExpCtx, + SmallString<128> &ConfigFilePath, + llvm::Triple Triple, std::string Suffix) { + // First, try the full unmodified triple. + if (ExpCtx.findConfigFile(Triple.str() + Suffix, ConfigFilePath)) + return true; + + // Don't continue if we didn't find a parsable version in the triple. + VersionTuple OSVersion = Triple.getOSVersion(); + if (!OSVersion.getMinor().has_value()) + return false; + + std::string BaseOSName = Triple.getOSTypeName(Triple.getOS()).str(); + + // Next try strip the version to only include the major component. + // e.g. arm64-apple-darwin23.6.0 -> arm64-apple-darwin23 + if (OSVersion.getMajor() != 0) { + Triple.setOSName(BaseOSName + llvm::utostr(OSVersion.getMajor())); + if (ExpCtx.findConfigFile(Triple.str() + Suffix, ConfigFilePath)) + return true; + } + + // Finally, try without any version suffix at all. + // e.g. arm64-apple-darwin23.6.0 -> arm64-apple-darwin + Triple.setOSName(BaseOSName); + return ExpCtx.findConfigFile(Triple.str() + Suffix, ConfigFilePath); +} + bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) { // Disable default config if CLANG_NO_DEFAULT_CONFIG is set to a non-empty // value. 
@@ -1176,7 +1204,7 @@ bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) { return false; std::string RealMode = getExecutableForDriverMode(Mode); - std::string Triple; + llvm::Triple Triple; // If name prefix is present, no --target= override was passed via CLOptions // and the name prefix is not a valid triple, force it for backwards @@ -1187,15 +1215,13 @@ bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) { llvm::Triple PrefixTriple{ClangNameParts.TargetPrefix}; if (PrefixTriple.getArch() == llvm::Triple::UnknownArch || PrefixTriple.isOSUnknown()) - Triple = PrefixTriple.str(); + Triple = PrefixTriple; } // Otherwise, use the real triple as used by the driver. - if (Triple.empty()) { - llvm::Triple RealTriple = - computeTargetTriple(*this, TargetTriple, *CLOptions); - Triple = RealTriple.str(); - assert(!Triple.empty()); + if (Triple.str().empty()) { + Triple = computeTargetTriple(*this, TargetTriple, *CLOptions); + assert(!Triple.str().empty()); } // Search for config files in the following order: @@ -1210,21 +1236,21 @@ bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) { // Try loading -.cfg, and return if we find a match. SmallString<128> CfgFilePath; - std::string CfgFileName = Triple + '-' + RealMode + ".cfg"; - if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) + if (findTripleConfigFile(ExpCtx, CfgFilePath, Triple, + "-" + RealMode + ".cfg")) return readConfigFile(CfgFilePath, ExpCtx); bool TryModeSuffix = !ClangNameParts.ModeSuffix.empty() && ClangNameParts.ModeSuffix != RealMode; if (TryModeSuffix) { - CfgFileName = Triple + '-' + ClangNameParts.ModeSuffix + ".cfg"; - if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) + if (findTripleConfigFile(ExpCtx, CfgFilePath, Triple, + "-" + ClangNameParts.ModeSuffix + ".cfg")) return readConfigFile(CfgFilePath, ExpCtx); } // Try loading .cfg, and return if loading failed. If a matching file // was not found, still proceed on to try .cfg. 
- CfgFileName = RealMode + ".cfg"; + std::string CfgFileName = RealMode + ".cfg"; if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) { if (readConfigFile(CfgFilePath, ExpCtx)) return true; @@ -1236,8 +1262,7 @@ bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) { } // Try loading .cfg and return if we find a match. - CfgFileName = Triple + ".cfg"; - if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) + if (findTripleConfigFile(ExpCtx, CfgFilePath, Triple, ".cfg")) return readConfigFile(CfgFilePath, ExpCtx); // If we were unable to find a config file deduced from executable name, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d3206c3e8e25e..a020e00cd1739 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3773,7 +3773,8 @@ static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple(); - if (!EffectiveTriple.isOSFreeBSD() && !EffectiveTriple.isOSLinux()) + if (!EffectiveTriple.isOSFreeBSD() && !EffectiveTriple.isOSLinux() && + !EffectiveTriple.isOSFuchsia()) return; if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() && diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 3c78b12b0741e..0d851314a8953 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1417,6 +1417,7 @@ void tools::addAsNeededOption(const ToolChain &TC, void tools::linkSanitizerRuntimeDeps(const ToolChain &TC, const llvm::opt::ArgList &Args, + const SanitizerArgs &SanArgs, ArgStringList &CmdArgs) { // Force linking against the system libraries sanitizers depends on // (see PR15823 why this is necessary). 
@@ -1443,18 +1444,18 @@ void tools::linkSanitizerRuntimeDeps(const ToolChain &TC, // libresolv.a, even if exists, is an empty archive to satisfy POSIX -lresolv // requirement. if (TC.getTriple().isOSLinux() && !TC.getTriple().isAndroid() && - !TC.getTriple().isMusl() && TC.getSanitizerArgs(Args).needsMsanRt()) + !TC.getTriple().isMusl() && SanArgs.needsMsanRt()) CmdArgs.push_back("-lresolv"); } static void collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, + const SanitizerArgs &SanArgs, SmallVectorImpl &SharedRuntimes, SmallVectorImpl &StaticRuntimes, SmallVectorImpl &NonWholeStaticRuntimes, SmallVectorImpl &HelperStaticRuntimes, SmallVectorImpl &RequiredSymbols) { - const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); // Collect shared runtimes. if (SanArgs.needsSharedRt()) { if (SanArgs.needsAsanRt()) { @@ -1588,12 +1589,12 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, // Should be called before we add system libraries (C++ ABI, libstdc++/libc++, // C runtime, etc). Returns true if sanitizer system deps need to be linked in. 
bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, + const SanitizerArgs &SanArgs, ArgStringList &CmdArgs) { - const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); SmallVector SharedRuntimes, StaticRuntimes, NonWholeStaticRuntimes, HelperStaticRuntimes, RequiredSymbols; if (SanArgs.linkRuntimes()) { - collectSanitizerRuntimes(TC, Args, SharedRuntimes, StaticRuntimes, + collectSanitizerRuntimes(TC, Args, SanArgs, SharedRuntimes, StaticRuntimes, NonWholeStaticRuntimes, HelperStaticRuntimes, RequiredSymbols); } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index b6ddd99b87279..de2d143b90479 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -38,10 +38,12 @@ void addLinkerCompressDebugSectionsOption(const ToolChain &TC, void claimNoWarnArgs(const llvm::opt::ArgList &Args); bool addSanitizerRuntimes(const ToolChain &TC, const llvm::opt::ArgList &Args, + const SanitizerArgs &SanArgs, llvm::opt::ArgStringList &CmdArgs); void linkSanitizerRuntimeDeps(const ToolChain &TC, const llvm::opt::ArgList &Args, + const SanitizerArgs &SanArgs, llvm::opt::ArgStringList &CmdArgs); bool addXRayRuntime(const ToolChain &TC, const llvm::opt::ArgList &Args, diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 87380869f6fda..cdb6d21a0148b 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -430,13 +430,17 @@ void darwin::Linker::AddLinkArgs(Compilation &C, const ArgList &Args, // Give --sysroot= preference, over the Apple specific behavior to also use // --isysroot as the syslibroot. - StringRef sysroot = C.getSysRoot(); - if (sysroot != "") { + // We check `OPT__sysroot_EQ` directly instead of `getSysRoot` to make sure we + // prioritise command line arguments over configuration of `DEFAULT_SYSROOT`. 
+ if (const Arg *A = Args.getLastArg(options::OPT__sysroot_EQ)) { CmdArgs.push_back("-syslibroot"); - CmdArgs.push_back(C.getArgs().MakeArgString(sysroot)); + CmdArgs.push_back(A->getValue()); } else if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) { CmdArgs.push_back("-syslibroot"); CmdArgs.push_back(A->getValue()); + } else if (StringRef sysroot = C.getSysRoot(); sysroot != "") { + CmdArgs.push_back("-syslibroot"); + CmdArgs.push_back(C.getArgs().MakeArgString(sysroot)); } Args.AddLastArg(CmdArgs, options::OPT_twolevel__namespace); diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index 3d744bc087f46..c78f4e26a2f10 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -283,7 +283,9 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, D.getLTOMode() == LTOK_Thin); } - bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs); + const SanitizerArgs &SanArgs = ToolChain.getSanitizerArgs(Args); + bool NeedsSanitizerDeps = + addSanitizerRuntimes(ToolChain, Args, SanArgs, CmdArgs); bool NeedsXRayDeps = addXRayRuntime(ToolChain, Args, CmdArgs); addLinkerCompressDebugSectionsOption(ToolChain, Args, CmdArgs); AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); @@ -324,7 +326,7 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, } if (NeedsSanitizerDeps) - linkSanitizerRuntimeDeps(ToolChain, Args, CmdArgs); + linkSanitizerRuntimeDeps(ToolChain, Args, SanArgs, CmdArgs); if (NeedsXRayDeps) linkXRayRuntimeDeps(ToolChain, Args, CmdArgs); // FIXME: For some reason GCC passes -lgcc and -lgcc_s before adding diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index c2badc80a7b45..52d58431d4ae1 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -184,7 +184,8 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const 
JobAction &JA, // Note that Fuchsia never needs to link in sanitizer runtime deps. Any // sanitizer runtimes with system dependencies use the `.deplibs` feature // instead. - addSanitizerRuntimes(ToolChain, Args, CmdArgs); + const SanitizerArgs &SanArgs = ToolChain.getSanitizerArgs(Args); + addSanitizerRuntimes(ToolChain, Args, SanArgs, CmdArgs); addXRayRuntime(ToolChain, Args, CmdArgs); @@ -317,10 +318,9 @@ Fuchsia::Fuchsia(const Driver &D, const llvm::Triple &Triple, Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, true); addMultilibFlag(Exceptions, "-fexceptions", Flags); addMultilibFlag(!Exceptions, "-fno-exceptions", Flags); - addMultilibFlag(getSanitizerArgs(Args).needsAsanRt(), "-fsanitize=address", - Flags); - addMultilibFlag(getSanitizerArgs(Args).needsHwasanRt(), - "-fsanitize=hwaddress", Flags); + const SanitizerArgs &SanArgs = getSanitizerArgs(Args); + addMultilibFlag(SanArgs.needsAsanRt(), "-fsanitize=address", Flags); + addMultilibFlag(SanArgs.needsHwasanRt(), "-fsanitize=hwaddress", Flags); addMultilibFlag(Args.getLastArgValue(options::OPT_fcxx_abi_EQ) == "itanium", "-fc++-abi=itanium", Flags); diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 8397f1121ec2c..9eb3d3125772c 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -23,6 +23,7 @@ #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/MultilibBuilder.h" #include "clang/Driver/Options.h" +#include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" #include "llvm/ADT/StringSet.h" @@ -538,7 +539,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (Args.hasArg(options::OPT_Z_Xlinker__no_demangle)) CmdArgs.push_back("--no-demangle"); - bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs); + const SanitizerArgs &SanArgs = ToolChain.getSanitizerArgs(Args); + bool NeedsSanitizerDeps = + 
addSanitizerRuntimes(ToolChain, Args, SanArgs, CmdArgs); bool NeedsXRayDeps = addXRayRuntime(ToolChain, Args, CmdArgs); addLinkerCompressDebugSectionsOption(ToolChain, Args, CmdArgs); AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); @@ -583,7 +586,7 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--start-group"); if (NeedsSanitizerDeps) - linkSanitizerRuntimeDeps(ToolChain, Args, CmdArgs); + linkSanitizerRuntimeDeps(ToolChain, Args, SanArgs, CmdArgs); if (NeedsXRayDeps) linkXRayRuntimeDeps(ToolChain, Args, CmdArgs); diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 76cedf312d68a..18900430313b5 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -13,6 +13,7 @@ #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Options.h" +#include "clang/Driver/SanitizerArgs.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" @@ -215,7 +216,8 @@ void hexagon::Assembler::ConstructJob(Compilation &C, const JobAction &JA, "-mcpu=hexagon" + toolchains::HexagonToolChain::GetTargetCPUVersion(Args))); - addSanitizerRuntimes(HTC, Args, CmdArgs); + SanitizerArgs SanArgs = HTC.getSanitizerArgs(Args); + addSanitizerRuntimes(HTC, Args, SanArgs, CmdArgs); assert((Output.isFilename() || Output.isNothing()) && "Invalid output."); if (Output.isFilename()) { @@ -301,7 +303,8 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, bool UseShared = IsShared && !IsStatic; StringRef CpuVer = toolchains::HexagonToolChain::GetTargetCPUVersion(Args); - bool NeedsSanitizerDeps = addSanitizerRuntimes(HTC, Args, CmdArgs); + const SanitizerArgs &SanArgs = HTC.getSanitizerArgs(Args); + bool NeedsSanitizerDeps = addSanitizerRuntimes(HTC, Args, SanArgs, CmdArgs); bool NeedsXRayDeps = addXRayRuntime(HTC, Args, CmdArgs); 
//---------------------------------------------------------------------------- @@ -371,7 +374,7 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { if (NeedsSanitizerDeps) { - linkSanitizerRuntimeDeps(HTC, Args, CmdArgs); + linkSanitizerRuntimeDeps(HTC, Args, SanArgs, CmdArgs); if (UNW != ToolChain::UNW_None) CmdArgs.push_back("-lunwind"); diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp index abd5e1aa003b3..0ead1c610ba0d 100644 --- a/clang/lib/Driver/ToolChains/NetBSD.cpp +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp @@ -274,11 +274,12 @@ void netbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_s, options::OPT_t}); ToolChain.AddFilePathLibArgs(Args, CmdArgs); - bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs); + const SanitizerArgs &SanArgs = ToolChain.getSanitizerArgs(Args); + bool NeedsSanitizerDeps = + addSanitizerRuntimes(ToolChain, Args, SanArgs, CmdArgs); bool NeedsXRayDeps = addXRayRuntime(ToolChain, Args, CmdArgs); AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); - const SanitizerArgs &SanArgs = ToolChain.getSanitizerArgs(Args); if (SanArgs.needsSharedRt()) { CmdArgs.push_back("-rpath"); CmdArgs.push_back(Args.MakeArgString(ToolChain.getCompilerRTPath())); @@ -334,7 +335,7 @@ void netbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, } if (NeedsSanitizerDeps) - linkSanitizerRuntimeDeps(ToolChain, Args, CmdArgs); + linkSanitizerRuntimeDeps(ToolChain, Args, SanArgs, CmdArgs); if (NeedsXRayDeps) linkXRayRuntimeDeps(ToolChain, Args, CmdArgs); if (Args.hasArg(options::OPT_pthread)) diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index 6e1a09ae908b2..c9a532771b99e 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -19,8 +19,8 @@ #include "llvm/ProfileData/InstrProf.h" #include 
"llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" -#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/VirtualFileSystem.h" using namespace clang::driver; using namespace clang::driver::toolchains; @@ -58,11 +58,9 @@ static bool findOHOSMuslMultilibs(const Driver &D, return false; } -static bool findOHOSMultilibs(const Driver &D, - const ToolChain &TC, - const llvm::Triple &TargetTriple, - StringRef Path, const ArgList &Args, - DetectedMultilibs &Result) { +static bool findOHOSMultilibs(const Driver &D, const ToolChain &TC, + const llvm::Triple &TargetTriple, StringRef Path, + const ArgList &Args, DetectedMultilibs &Result) { Multilib::flags_list Flags; bool IsA7 = false; if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) @@ -172,8 +170,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) Paths); } -ToolChain::RuntimeLibType OHOS::GetRuntimeLibType( - const ArgList &Args) const { +ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(const ArgList &Args) const { if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") @@ -184,20 +181,19 @@ ToolChain::RuntimeLibType OHOS::GetRuntimeLibType( return ToolChain::RLT_CompilerRT; } -ToolChain::CXXStdlibType -OHOS::GetCXXStdlibType(const ArgList &Args) const { +ToolChain::CXXStdlibType OHOS::GetCXXStdlibType(const ArgList &Args) const { if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) { StringRef Value = A->getValue(); if (Value != "libc++") getDriver().Diag(diag::err_drv_invalid_stdlib_name) - << A->getAsString(Args); + << A->getAsString(Args); } return ToolChain::CST_Libcxx; } void OHOS::AddClangSystemIncludeArgs(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args) const { const Driver &D = getDriver(); const llvm::Triple &Triple = getTriple(); std::string SysRoot = computeSysRoot(); @@ -258,7 +254,7 @@ void 
OHOS::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, } void OHOS::AddCXXStdlibLibArgs(const ArgList &Args, - ArgStringList &CmdArgs) const { + ArgStringList &CmdArgs) const { switch (GetCXXStdlibType(Args)) { case ToolChain::CST_Libcxx: CmdArgs.push_back("-lc++"); @@ -291,7 +287,8 @@ ToolChain::path_list OHOS::getRuntimePaths() const { // First try the triple passed to driver as --target=. P.assign(D.ResourceDir); - llvm::sys::path::append(P, "lib", D.getTargetTriple(), SelectedMultilib.gccSuffix()); + llvm::sys::path::append(P, "lib", D.getTargetTriple(), + SelectedMultilib.gccSuffix()); Paths.push_back(P.c_str()); // Second try the normalized triple. @@ -340,26 +337,20 @@ std::string OHOS::getDynamicLinker(const ArgList &Args) const { std::string OHOS::getCompilerRT(const ArgList &Args, StringRef Component, FileType Type) const { + std::string CRTBasename = + buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false); + SmallString<128> Path(getDriver().ResourceDir); llvm::sys::path::append(Path, "lib", getMultiarchTriple(getTriple()), - SelectedMultilib.gccSuffix()); - const char *Prefix = - Type == ToolChain::FT_Object ? "" : "lib"; - const char *Suffix; - switch (Type) { - case ToolChain::FT_Object: - Suffix = ".o"; - break; - case ToolChain::FT_Static: - Suffix = ".a"; - break; - case ToolChain::FT_Shared: - Suffix = ".so"; - break; - } - llvm::sys::path::append( - Path, Prefix + Twine("clang_rt.") + Component + Suffix); - return static_cast(Path.str()); + SelectedMultilib.gccSuffix(), CRTBasename); + if (getVFS().exists(Path)) + return std::string(Path); + + std::string NewPath = ToolChain::getCompilerRT(Args, Component, Type); + if (getVFS().exists(NewPath)) + return NewPath; + + return std::string(Path); } void OHOS::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const { @@ -396,7 +387,7 @@ SanitizerMask OHOS::getSupportedSanitizers() const { // TODO: Make a base class for Linux and OHOS and move this there. 
void OHOS::addProfileRTLibs(const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) const { + llvm::opt::ArgStringList &CmdArgs) const { // Add linker option -u__llvm_profile_runtime to cause runtime // initialization module to be linked in. if (needsProfileRT(Args)) @@ -413,7 +404,8 @@ ToolChain::path_list OHOS::getArchSpecificLibPaths() const { return Paths; } -ToolChain::UnwindLibType OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const { +ToolChain::UnwindLibType +OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const { if (Args.getLastArg(options::OPT_unwindlib_EQ)) return Generic_ELF::GetUnwindLibType(Args); return GetDefaultUnwindLibType(); diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index f668a11e78f81..b7dcffb3f5ee0 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -213,7 +213,9 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, D.getLTOMode() == LTOK_Thin); } - bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs); + const SanitizerArgs &SanArgs = ToolChain.getSanitizerArgs(Args); + bool NeedsSanitizerDeps = + addSanitizerRuntimes(ToolChain, Args, SanArgs, CmdArgs); bool NeedsXRayDeps = addXRayRuntime(ToolChain, Args, CmdArgs); AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); @@ -251,7 +253,7 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (NeedsSanitizerDeps) { CmdArgs.push_back(ToolChain.getCompilerRTArgString(Args, "builtins")); - linkSanitizerRuntimeDeps(ToolChain, Args, CmdArgs); + linkSanitizerRuntimeDeps(ToolChain, Args, SanArgs, CmdArgs); } if (NeedsXRayDeps) { CmdArgs.push_back(ToolChain.getCompilerRTArgString(Args, "builtins")); diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index fd3232b7c1b06..83e83835eb915 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ 
b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -203,7 +203,8 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group}); - bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs); + const SanitizerArgs &SA = ToolChain.getSanitizerArgs(Args); + bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, SA, CmdArgs); AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs, @@ -250,9 +251,8 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (!Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-lgcc"); } - const SanitizerArgs &SA = ToolChain.getSanitizerArgs(Args); if (NeedsSanitizerDeps) { - linkSanitizerRuntimeDeps(ToolChain, Args, CmdArgs); + linkSanitizerRuntimeDeps(ToolChain, Args, SA, CmdArgs); // Work around Solaris/amd64 ld bug when calling __tls_get_addr directly. // However, ld -z relax=transtls is available since Solaris 11.2, but not diff --git a/clang/lib/Index/FileIndexRecord.cpp b/clang/lib/Index/FileIndexRecord.cpp index f3a5e6b63bbc2..449c33637eb7e 100644 --- a/clang/lib/Index/FileIndexRecord.cpp +++ b/clang/lib/Index/FileIndexRecord.cpp @@ -65,7 +65,7 @@ void FileIndexRecord::print(llvm::raw_ostream &OS, SourceManager &SM) const { OS << ' ' << ND->getDeclName(); } } else { - const auto *MI = DclInfo.DeclOrMacro.get(); + const auto *MI = cast(DclInfo.DeclOrMacro); SourceLocation Loc = SM.getFileLoc(MI->getDefinitionLoc()); PresumedLoc PLoc = SM.getPresumedLoc(Loc); OS << llvm::sys::path::filename(PLoc.getFilename()) << ':' diff --git a/clang/lib/Index/IndexDecl.cpp b/clang/lib/Index/IndexDecl.cpp index a7fa6c5e6898e..19cff0398e21e 100644 --- a/clang/lib/Index/IndexDecl.cpp +++ b/clang/lib/Index/IndexDecl.cpp @@ -665,9 +665,9 @@ class IndexingDeclVisitor : public ConstDeclVisitor { ClassTemplatePartialSpecializationDecl *> Template = 
D->getSpecializedTemplateOrPartial(); const Decl *SpecializationOf = - Template.is() + isa(Template) ? (Decl *)Template.get() - : Template.get(); + : cast(Template); if (!D->isThisDeclarationADefinition()) IndexCtx.indexNestedNameSpecifierLoc(D->getQualifierLoc(), D); IndexCtx.indexTagDecl( diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index bc59de3c1a0ad..8c81936b35296 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -571,6 +571,8 @@ void SkipUntilEndOfDirective(Parser &P) { bool doesDirectiveHaveAssociatedStmt(OpenACCDirectiveKind DirKind) { switch (DirKind) { default: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: return false; case OpenACCDirectiveKind::Parallel: case OpenACCDirectiveKind::Serial: @@ -579,6 +581,8 @@ bool doesDirectiveHaveAssociatedStmt(OpenACCDirectiveKind DirKind) { case OpenACCDirectiveKind::SerialLoop: case OpenACCDirectiveKind::KernelsLoop: case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::HostData: return true; } llvm_unreachable("Unhandled directive->assoc stmt"); @@ -596,6 +600,11 @@ unsigned getOpenACCScopeFlags(OpenACCDirectiveKind DirKind) { // so that we can diagnose trying to 'break'/'continue' inside of one. 
return Scope::BreakScope | Scope::ContinueScope | Scope::OpenACCComputeConstructScope; + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::HostData: + return 0; case OpenACCDirectiveKind::Invalid: llvm_unreachable("Shouldn't be creating a scope for an invalid construct"); default: @@ -1508,10 +1517,10 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { return StmtError(); StmtResult AssocStmt; - SemaOpenACC::AssociatedStmtRAII AssocStmtRAII(getActions().OpenACC(), - DirInfo.DirKind, DirInfo.DirLoc, - {}, DirInfo.Clauses); if (doesDirectiveHaveAssociatedStmt(DirInfo.DirKind)) { + SemaOpenACC::AssociatedStmtRAII AssocStmtRAII( + getActions().OpenACC(), DirInfo.DirKind, DirInfo.DirLoc, {}, + DirInfo.Clauses); ParsingOpenACCDirectiveRAII DirScope(*this, /*Value=*/false); ParseScope ACCScope(this, getOpenACCScopeFlags(DirInfo.DirKind)); diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 868081292bc32..843fdb4a65cd7 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -367,6 +367,8 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { if (Callee->getReturnType()->isReferenceType()) { if (!Callee->getIdentifier()) { auto OO = Callee->getOverloadedOperator(); + if (!Callee->getParent()->hasAttr()) + return false; return OO == OverloadedOperatorKind::OO_Subscript || OO == OverloadedOperatorKind::OO_Star; } @@ -1152,6 +1154,86 @@ static bool pathOnlyHandlesGslPointer(const IndirectLocalPath &Path) { } return false; } +// Result of analyzing the Path for GSLPointer. +enum AnalysisResult { + // Path does not correspond to a GSLPointer. + NotGSLPointer, + + // A relevant case was identified. + Report, + // Stop the entire traversal. + Abandon, + // Skip this step and continue traversing inner AST nodes. 
+ Skip, +}; +// Analyze cases where a GSLPointer is initialized or assigned from a +// temporary owner object. +static AnalysisResult analyzePathForGSLPointer(const IndirectLocalPath &Path, + Local L) { + if (!pathOnlyHandlesGslPointer(Path)) + return NotGSLPointer; + + // At this point, Path represents a series of operations involving a + // GSLPointer, either in the process of initialization or assignment. + + // Note: A LifetimeBoundCall can appear interleaved in this sequence. + // For example: + // const std::string& Ref(const std::string& a [[clang::lifetimebound]]); + // string_view abc = Ref(std::string()); + // The "Path" is [GSLPointerInit, LifetimeboundCall], where "L" is the + // temporary "std::string()" object. We need to check the return type of the + // function with the lifetimebound attribute. + if (Path.back().Kind == IndirectLocalPathEntry::LifetimeBoundCall) { + // The lifetimebound applies to the implicit object parameter of a method. + const FunctionDecl *FD = + llvm::dyn_cast_or_null(Path.back().D); + // The lifetimebound applies to a function parameter. + if (const auto *PD = llvm::dyn_cast(Path.back().D)) + FD = llvm::dyn_cast(PD->getDeclContext()); + + if (isa_and_present(FD)) { + // Constructor case: the parameter is annotated with lifetimebound + // e.g., GSLPointer(const S& s [[clang::lifetimebound]]) + // We still respect this case even the type S is not an owner. + return Report; + } + // Check the return type, e.g. 
+ // const GSLOwner& func(const Foo& foo [[clang::lifetimebound]]) + // GSLPointer func(const Foo& foo [[clang::lifetimebound]]) + if (FD && + ((FD->getReturnType()->isReferenceType() && + isRecordWithAttr(FD->getReturnType()->getPointeeType())) || + isPointerLikeType(FD->getReturnType()))) + return Report; + + return Abandon; + } + + if (isa(L)) { + // We do not want to follow the references when returning a pointer + // originating from a local owner to avoid the following false positive: + // int &p = *localUniquePtr; + // someContainer.add(std::move(localUniquePtr)); + // return p; + if (!pathContainsInit(Path) && isRecordWithAttr(L->getType())) + return Report; + return Abandon; + } + + // The GSLPointer is from a temporary object. + auto *MTE = dyn_cast(L); + + bool IsGslPtrValueFromGslTempOwner = + MTE && !MTE->getExtendingDecl() && + isRecordWithAttr(MTE->getType()); + // Skipping a chain of initializing gsl::Pointer annotated objects. + // We are looking only for the final source to find out if it was + // a local or temporary owner or the address of a local + // variable/param. 
+ if (!IsGslPtrValueFromGslTempOwner) + return Skip; + return Report; +} static bool isAssignmentOperatorLifetimeBound(CXXMethodDecl *CMD) { return CMD && isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 && @@ -1189,27 +1271,17 @@ checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, auto *MTE = dyn_cast(L); - bool IsGslPtrValueFromGslTempOwner = false; - if (pathOnlyHandlesGslPointer(Path)) { - if (isa(L)) { - // We do not want to follow the references when returning a pointer - // originating from a local owner to avoid the following false positive: - // int &p = *localUniquePtr; - // someContainer.add(std::move(localUniquePtr)); - // return p; - if (pathContainsInit(Path) || - !isRecordWithAttr(L->getType())) - return false; - } else { - IsGslPtrValueFromGslTempOwner = - MTE && !MTE->getExtendingDecl() && - isRecordWithAttr(MTE->getType()); - // Skipping a chain of initializing gsl::Pointer annotated objects. - // We are looking only for the final source to find out if it was - // a local or temporary owner or the address of a local variable/param. - if (!IsGslPtrValueFromGslTempOwner) - return true; - } + bool IsGslPtrValueFromGslTempOwner = true; + switch (analyzePathForGSLPointer(Path, L)) { + case Abandon: + return false; + case Skip: + return true; + case NotGSLPointer: + IsGslPtrValueFromGslTempOwner = false; + LLVM_FALLTHROUGH; + case Report: + break; } switch (LK) { diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp index ee237ffc4d2b9..47644680b720b 100644 --- a/clang/lib/Sema/DeclSpec.cpp +++ b/clang/lib/Sema/DeclSpec.cpp @@ -1343,8 +1343,7 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) { S.getLocForEndOfToken(getTypeSpecComplexLoc()), " double"); TypeSpecType = TST_double; // _Complex -> _Complex double. 
- } else if (TypeSpecType == TST_int || TypeSpecType == TST_char || - TypeSpecType == TST_bitint) { + } else if (TypeSpecType == TST_int || TypeSpecType == TST_char) { // Note that this intentionally doesn't include _Complex _Bool. if (!S.getLangOpts().CPlusPlus) S.Diag(TSTLoc, diag::ext_integer_complex); diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 6a9f43d6f5215..2be6af293ed54 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1396,6 +1396,8 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Expr::ConceptSpecializationExprClass: case Expr::RequiresExprClass: case Expr::HLSLOutArgExprClass: + case Stmt::OpenACCEnterDataConstructClass: + case Stmt::OpenACCExitDataConstructClass: // These expressions can never throw. return CT_Cannot; @@ -1407,6 +1409,8 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OpenACCComputeConstructClass: case Stmt::OpenACCLoopConstructClass: case Stmt::OpenACCCombinedConstructClass: + case Stmt::OpenACCDataConstructClass: + case Stmt::OpenACCHostDataConstructClass: case Stmt::AttributedStmtClass: case Stmt::BreakStmtClass: case Stmt::CapturedStmtClass: diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 62c3e778ab178..99daa3eba6287 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -37,6 +37,10 @@ bool diagnoseConstructAppertainment(SemaOpenACC &S, OpenACCDirectiveKind K, case OpenACCDirectiveKind::Serial: case OpenACCDirectiveKind::Kernels: case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::HostData: if (!IsStmt) return S.Diag(StartLoc, diag::err_acc_construct_appertainment) << K; break; @@ -431,11 +435,12 @@ bool checkAlreadyHasClauseOfKind( bool checkValidAfterDeviceType( SemaOpenACC &S, const OpenACCDeviceTypeClause &DeviceTypeClause, const 
SemaOpenACC::OpenACCParsedClause &NewClause) { - // This is only a requirement on compute and loop constructs so far, so this - // is fine otherwise. + // This is only a requirement on compute, combined, data and loop constructs + // so far, so this is fine otherwise. if (!isOpenACCComputeDirectiveKind(NewClause.getDirectiveKind()) && !isOpenACCCombinedDirectiveKind(NewClause.getDirectiveKind()) && - NewClause.getDirectiveKind() != OpenACCDirectiveKind::Loop) + NewClause.getDirectiveKind() != OpenACCDirectiveKind::Loop && + NewClause.getDirectiveKind() != OpenACCDirectiveKind::Data) return false; // OpenACC3.3: Section 2.4: Clauses that precede any device_type clause are @@ -500,6 +505,16 @@ bool checkValidAfterDeviceType( default: break; } + } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Data) { + // OpenACC3.3 section 2.6.5: Only the async and wait clauses may follow a + // device_type clause. + switch (NewClause.getClauseKind()) { + case OpenACCClauseKind::Async: + case OpenACCClauseKind::Wait: + return false; + default: + break; + } } S.Diag(NewClause.getBeginLoc(), diag::err_acc_clause_after_device_type) << NewClause.getClauseKind() << DeviceTypeClause.getClauseKind() @@ -572,14 +587,6 @@ class SemaOpenACCClauseVisitor { OpenACCClause *SemaOpenACCClauseVisitor::VisitDefaultClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined' constructs, - // and 'compute'/'combined' constructs are the only construct that can do - // anything with this yet, so skip/treat as unimplemented in this case. - // Only 'data' is left. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && - !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) - return isNotImplemented(); - // Don't add an invalid clause to the AST. 
if (Clause.getDefaultClauseKind() == OpenACCDefaultClauseKind::Invalid) return nullptr; @@ -626,16 +633,20 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitTileClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined' constructs, - // and 'compute'/'combined' constructs are the only construct that can do - // anything with this yet, so skip/treat as unimplemented in this case. + // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // constructs that can do anything with this yet, so skip/treat as + // unimplemented in this case. if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && - !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) + !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()) && + !isOpenACCDataDirectiveKind(Clause.getDirectiveKind())) return isNotImplemented(); // There is no prose in the standard that says duplicates aren't allowed, // but this diagnostic is present in other compilers, as well as makes - // sense. + // sense. Prose DOES exist for 'data' and 'host_data', 'enter data' and 'exit + // data' both don't, but other implmementations do this. OpenACC issue 519 + // filed for the latter two. if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) return nullptr; @@ -857,11 +868,13 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined' constructs, - // and 'compute'/'combined' constructs are the only construct that can do - // anything with this yet, so skip/treat as unimplemented in this case. 
+ // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // construct that can do anything with this yet, so skip/treat as + // unimplemented in this case. if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && - !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) + !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()) && + !isOpenACCDataDirectiveKind(Clause.getDirectiveKind())) return isNotImplemented(); // There is no prose in the standard that says duplicates aren't allowed, @@ -1049,11 +1062,13 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined' constructs, - // and 'compute'/'combined' constructs are the only construct that can do - // anything with this yet, so skip/treat as unimplemented in this case. + // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // construct that can do anything with this yet, so skip/treat as + // unimplemented in this case. 
if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && - !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) + !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()) && + !isOpenACCDataDirectiveKind(Clause.getDirectiveKind())) return isNotImplemented(); return OpenACCWaitClause::Create( @@ -1063,12 +1078,13 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute', 'combined', and - // 'loop' constructs, and 'compute'/'combined'/'loop' constructs are the only - // construct that can do anything with this yet, so skip/treat as + // Restrictions only properly implemented on 'compute', 'combined', 'data' and + // 'loop' constructs, and 'compute'/'combined'/'data'/'loop' constructs are + // the only construct that can do anything with this yet, so skip/treat as // unimplemented in this case. if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop && + Clause.getDirectiveKind() != OpenACCDirectiveKind::Data && !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) return isNotImplemented(); @@ -1760,6 +1776,31 @@ void CollectActiveReductionClauses( } } +// Depth needs to be preserved for all associated statements that aren't +// supposed to modify the compute/combined/loop construct information. 
+bool PreserveLoopRAIIDepthInAssociatedStmtRAII(OpenACCDirectiveKind DK) { + switch (DK) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Loop: + return false; + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::HostData: + return true; + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + llvm_unreachable("Doesn't have an associated stmt"); + default: + case OpenACCDirectiveKind::Invalid: + llvm_unreachable("Unhandled directive kind?"); + } + llvm_unreachable("Unhandled directive kind?"); +} + } // namespace SemaOpenACC::SemaOpenACC(Sema &S) : SemaBase(S) {} @@ -1774,7 +1815,7 @@ SemaOpenACC::AssociatedStmtRAII::AssociatedStmtRAII( OldLoopVectorClauseLoc(S.LoopVectorClauseLoc), OldLoopWithoutSeqInfo(S.LoopWithoutSeqInfo), ActiveReductionClauses(S.ActiveReductionClauses), - LoopRAII(SemaRef, /*PreserveDepth=*/false) { + LoopRAII(SemaRef, PreserveLoopRAIIDepthInAssociatedStmtRAII(DirKind)) { // Compute constructs end up taking their 'loop'. 
if (DirKind == OpenACCDirectiveKind::Parallel || @@ -1950,24 +1991,23 @@ void SemaOpenACC::AssociatedStmtRAII::SetTileInfoBeforeAssociatedStmt( } SemaOpenACC::AssociatedStmtRAII::~AssociatedStmtRAII() { - SemaRef.ActiveComputeConstructInfo = OldActiveComputeConstructInfo; - SemaRef.LoopGangClauseOnKernel = OldLoopGangClauseOnKernel; - SemaRef.LoopWorkerClauseLoc = OldLoopWorkerClauseLoc; - SemaRef.LoopVectorClauseLoc = OldLoopVectorClauseLoc; - SemaRef.LoopWithoutSeqInfo = OldLoopWithoutSeqInfo; - SemaRef.ActiveReductionClauses.swap(ActiveReductionClauses); - if (DirKind == OpenACCDirectiveKind::Parallel || DirKind == OpenACCDirectiveKind::Serial || DirKind == OpenACCDirectiveKind::Kernels || + DirKind == OpenACCDirectiveKind::Loop || DirKind == OpenACCDirectiveKind::ParallelLoop || DirKind == OpenACCDirectiveKind::SerialLoop || DirKind == OpenACCDirectiveKind::KernelsLoop) { - // Nothing really to do here, the restorations above should be enough for - // now. - } else if (DirKind == OpenACCDirectiveKind::Loop) { - // Nothing really to do here, the LoopInConstruct should handle restorations - // correctly. + SemaRef.ActiveComputeConstructInfo = OldActiveComputeConstructInfo; + SemaRef.LoopGangClauseOnKernel = OldLoopGangClauseOnKernel; + SemaRef.LoopWorkerClauseLoc = OldLoopWorkerClauseLoc; + SemaRef.LoopVectorClauseLoc = OldLoopVectorClauseLoc; + SemaRef.LoopWithoutSeqInfo = OldLoopWithoutSeqInfo; + SemaRef.ActiveReductionClauses.swap(ActiveReductionClauses); + } else if (DirKind == OpenACCDirectiveKind::Data || + DirKind == OpenACCDirectiveKind::HostData) { + // Intentionally doesn't reset the Loop, Compute Construct, or reduction + // effects. 
} } @@ -2175,6 +2215,10 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, case OpenACCDirectiveKind::SerialLoop: case OpenACCDirectiveKind::KernelsLoop: case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::HostData: // Nothing to do here, there is no real legalization that needs to happen // here as these constructs do not take any arguments. break; @@ -3441,6 +3485,24 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, getASTContext(), ActiveComputeConstructInfo.Kind, StartLoc, DirLoc, EndLoc, Clauses, AssocStmt.isUsable() ? AssocStmt.get() : nullptr); } + case OpenACCDirectiveKind::Data: { + return OpenACCDataConstruct::Create( + getASTContext(), StartLoc, DirLoc, EndLoc, Clauses, + AssocStmt.isUsable() ? AssocStmt.get() : nullptr); + } + case OpenACCDirectiveKind::EnterData: { + return OpenACCEnterDataConstruct::Create(getASTContext(), StartLoc, DirLoc, + EndLoc, Clauses); + } + case OpenACCDirectiveKind::ExitData: { + return OpenACCExitDataConstruct::Create(getASTContext(), StartLoc, DirLoc, + EndLoc, Clauses); + } + case OpenACCDirectiveKind::HostData: { + return OpenACCHostDataConstruct::Create( + getASTContext(), StartLoc, DirLoc, EndLoc, Clauses, + AssocStmt.isUsable() ? 
AssocStmt.get() : nullptr); + } } llvm_unreachable("Unhandled case in directive handling?"); } @@ -3451,9 +3513,15 @@ StmtResult SemaOpenACC::ActOnAssociatedStmt( switch (K) { default: llvm_unreachable("Unimplemented associated statement application"); + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + llvm_unreachable( + "these don't have associated statements, so shouldn't get here"); case OpenACCDirectiveKind::Parallel: case OpenACCDirectiveKind::Serial: case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::HostData: // There really isn't any checking here that could happen. As long as we // have a statement to associate, this should be fine. // OpenACC 3.3 Section 6: diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 02d2fc018e3c3..f2dbf4086a13d 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4110,6 +4110,42 @@ class TreeTransform { EndLoc, Clauses, Loop); } + StmtResult RebuildOpenACCDataConstruct(SourceLocation BeginLoc, + SourceLocation DirLoc, + SourceLocation EndLoc, + ArrayRef Clauses, + StmtResult StrBlock) { + return getSema().OpenACC().ActOnEndStmtDirective(OpenACCDirectiveKind::Data, + BeginLoc, DirLoc, EndLoc, + Clauses, StrBlock); + } + + StmtResult + RebuildOpenACCEnterDataConstruct(SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, + ArrayRef Clauses) { + return getSema().OpenACC().ActOnEndStmtDirective( + OpenACCDirectiveKind::EnterData, BeginLoc, DirLoc, EndLoc, Clauses, {}); + } + + StmtResult + RebuildOpenACCExitDataConstruct(SourceLocation BeginLoc, + SourceLocation DirLoc, SourceLocation EndLoc, + ArrayRef Clauses) { + return getSema().OpenACC().ActOnEndStmtDirective( + OpenACCDirectiveKind::ExitData, BeginLoc, DirLoc, EndLoc, Clauses, {}); + } + + StmtResult RebuildOpenACCHostDataConstruct(SourceLocation BeginLoc, + SourceLocation DirLoc, + SourceLocation EndLoc, + ArrayRef 
Clauses, + StmtResult StrBlock) { + return getSema().OpenACC().ActOnEndStmtDirective( + OpenACCDirectiveKind::HostData, BeginLoc, DirLoc, EndLoc, Clauses, + StrBlock); + } + ExprResult RebuildOpenACCAsteriskSizeExpr(SourceLocation AsteriskLoc) { return getSema().OpenACC().ActOnOpenACCAsteriskSizeExpr(AsteriskLoc); } @@ -12153,6 +12189,88 @@ StmtResult TreeTransform::TransformOpenACCCombinedConstruct( C->getEndLoc(), TransformedClauses, Loop); } +template +StmtResult +TreeTransform::TransformOpenACCDataConstruct(OpenACCDataConstruct *C) { + getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc()); + + llvm::SmallVector TransformedClauses = + getDerived().TransformOpenACCClauseList(C->getDirectiveKind(), + C->clauses()); + if (getSema().OpenACC().ActOnStartStmtDirective(C->getDirectiveKind(), + C->getBeginLoc())) + return StmtError(); + + SemaOpenACC::AssociatedStmtRAII AssocStmtRAII( + getSema().OpenACC(), C->getDirectiveKind(), C->getDirectiveLoc(), + C->clauses(), TransformedClauses); + StmtResult StrBlock = getDerived().TransformStmt(C->getStructuredBlock()); + StrBlock = getSema().OpenACC().ActOnAssociatedStmt( + C->getBeginLoc(), C->getDirectiveKind(), TransformedClauses, StrBlock); + + return getDerived().RebuildOpenACCDataConstruct( + C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(), + TransformedClauses, StrBlock); +} + +template +StmtResult TreeTransform::TransformOpenACCEnterDataConstruct( + OpenACCEnterDataConstruct *C) { + getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc()); + + llvm::SmallVector TransformedClauses = + getDerived().TransformOpenACCClauseList(C->getDirectiveKind(), + C->clauses()); + if (getSema().OpenACC().ActOnStartStmtDirective(C->getDirectiveKind(), + C->getBeginLoc())) + return StmtError(); + + return getDerived().RebuildOpenACCEnterDataConstruct( + C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(), + TransformedClauses); +} + +template +StmtResult 
TreeTransform::TransformOpenACCExitDataConstruct( + OpenACCExitDataConstruct *C) { + getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc()); + + llvm::SmallVector TransformedClauses = + getDerived().TransformOpenACCClauseList(C->getDirectiveKind(), + C->clauses()); + if (getSema().OpenACC().ActOnStartStmtDirective(C->getDirectiveKind(), + C->getBeginLoc())) + return StmtError(); + + return getDerived().RebuildOpenACCExitDataConstruct( + C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(), + TransformedClauses); +} + +template +StmtResult TreeTransform::TransformOpenACCHostDataConstruct( + OpenACCHostDataConstruct *C) { + getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc()); + + llvm::SmallVector TransformedClauses = + getDerived().TransformOpenACCClauseList(C->getDirectiveKind(), + C->clauses()); + if (getSema().OpenACC().ActOnStartStmtDirective(C->getDirectiveKind(), + C->getBeginLoc())) + return StmtError(); + + SemaOpenACC::AssociatedStmtRAII AssocStmtRAII( + getSema().OpenACC(), C->getDirectiveKind(), C->getDirectiveLoc(), + C->clauses(), TransformedClauses); + StmtResult StrBlock = getDerived().TransformStmt(C->getStructuredBlock()); + StrBlock = getSema().OpenACC().ActOnAssociatedStmt( + C->getBeginLoc(), C->getDirectiveKind(), TransformedClauses, StrBlock); + + return getDerived().RebuildOpenACCHostDataConstruct( + C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(), + TransformedClauses, StrBlock); +} + template ExprResult TreeTransform::TransformOpenACCAsteriskSizeExpr( OpenACCAsteriskSizeExpr *E) { diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 9f4877b19d870..21ad6c5a9faa3 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2849,6 +2849,27 @@ void ASTStmtReader::VisitOpenACCCombinedConstruct(OpenACCCombinedConstruct *S) { VisitOpenACCAssociatedStmtConstruct(S); } +void 
ASTStmtReader::VisitOpenACCDataConstruct(OpenACCDataConstruct *S) { + VisitStmt(S); + VisitOpenACCAssociatedStmtConstruct(S); +} + +void ASTStmtReader::VisitOpenACCEnterDataConstruct( + OpenACCEnterDataConstruct *S) { + VisitStmt(S); + VisitOpenACCConstructStmt(S); +} + +void ASTStmtReader::VisitOpenACCExitDataConstruct(OpenACCExitDataConstruct *S) { + VisitStmt(S); + VisitOpenACCConstructStmt(S); +} + +void ASTStmtReader::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) { + VisitStmt(S); + VisitOpenACCAssociatedStmtConstruct(S); +} + //===----------------------------------------------------------------------===// // HLSL Constructs/Directives. //===----------------------------------------------------------------------===// @@ -4324,6 +4345,26 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = OpenACCCombinedConstruct::CreateEmpty(Context, NumClauses); break; } + case STMT_OPENACC_DATA_CONSTRUCT: { + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields]; + S = OpenACCDataConstruct::CreateEmpty(Context, NumClauses); + break; + } + case STMT_OPENACC_ENTER_DATA_CONSTRUCT: { + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields]; + S = OpenACCEnterDataConstruct::CreateEmpty(Context, NumClauses); + break; + } + case STMT_OPENACC_EXIT_DATA_CONSTRUCT: { + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields]; + S = OpenACCExitDataConstruct::CreateEmpty(Context, NumClauses); + break; + } + case STMT_OPENACC_HOST_DATA_CONSTRUCT: { + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields]; + S = OpenACCHostDataConstruct::CreateEmpty(Context, NumClauses); + break; + } case EXPR_REQUIRES: { unsigned numLocalParameters = Record[ASTStmtReader::NumExprFields]; unsigned numRequirement = Record[ASTStmtReader::NumExprFields + 1]; diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 603aa5707ce9b..e55cbe1f6ecce 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ 
b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2926,6 +2926,31 @@ void ASTStmtWriter::VisitOpenACCCombinedConstruct(OpenACCCombinedConstruct *S) { Code = serialization::STMT_OPENACC_COMBINED_CONSTRUCT; } +void ASTStmtWriter::VisitOpenACCDataConstruct(OpenACCDataConstruct *S) { + VisitStmt(S); + VisitOpenACCAssociatedStmtConstruct(S); + Code = serialization::STMT_OPENACC_DATA_CONSTRUCT; +} + +void ASTStmtWriter::VisitOpenACCEnterDataConstruct( + OpenACCEnterDataConstruct *S) { + VisitStmt(S); + VisitOpenACCConstructStmt(S); + Code = serialization::STMT_OPENACC_ENTER_DATA_CONSTRUCT; +} + +void ASTStmtWriter::VisitOpenACCExitDataConstruct(OpenACCExitDataConstruct *S) { + VisitStmt(S); + VisitOpenACCConstructStmt(S); + Code = serialization::STMT_OPENACC_EXIT_DATA_CONSTRUCT; +} + +void ASTStmtWriter::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) { + VisitStmt(S); + VisitOpenACCAssociatedStmtConstruct(S); + Code = serialization::STMT_OPENACC_HOST_DATA_CONSTRUCT; +} + //===----------------------------------------------------------------------===// // HLSL Constructs/Directives. 
//===----------------------------------------------------------------------===// diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index b3cd594a0f352..69b63240d2075 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -33,7 +33,7 @@ bool tryToFindPtrOrigin( E = tempExpr->getSubExpr(); continue; } - if (auto *tempExpr = dyn_cast(E)) { + if (auto *tempExpr = dyn_cast(E)) { if (auto *C = tempExpr->getConstructor()) { if (auto *Class = C->getParent(); Class && isSafePtr(Class)) return callback(E, true); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index b46cd9fe86fc1..ae43c59511bfa 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1825,6 +1825,10 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OpenACCComputeConstructClass: case Stmt::OpenACCLoopConstructClass: case Stmt::OpenACCCombinedConstructClass: + case Stmt::OpenACCDataConstructClass: + case Stmt::OpenACCEnterDataConstructClass: + case Stmt::OpenACCExitDataConstructClass: + case Stmt::OpenACCHostDataConstructClass: case Stmt::OMPUnrollDirectiveClass: case Stmt::OMPMetaDirectiveClass: case Stmt::HLSLOutArgExprClass: { diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index ef6faae030a8f..4ee24646286fa 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1223,3 +1223,51 @@ namespace BuiltinMemcpy { static_assert(test_memcpy(0, 1, sizeof(int) * 2) == 2334); // both-error {{not an integral constant expression}} \ // both-note {{in call}} } + +namespace Memcmp { + constexpr unsigned char ku00fe00[] = {0x00, 0xfe, 0x00}; + constexpr unsigned char ku00feff[] = {0x00, 0xfe, 0xff}; + constexpr signed char 
ks00fe00[] = {0, -2, 0}; + constexpr signed char ks00feff[] = {0, -2, -1}; + static_assert(__builtin_memcmp(ku00feff, ks00fe00, 2) == 0); + static_assert(__builtin_memcmp(ku00feff, ks00fe00, 99) == 1); + static_assert(__builtin_memcmp(ku00fe00, ks00feff, 99) == -1); + static_assert(__builtin_memcmp(ks00feff, ku00fe00, 2) == 0); + static_assert(__builtin_memcmp(ks00feff, ku00fe00, 99) == 1); + static_assert(__builtin_memcmp(ks00fe00, ku00feff, 99) == -1); + static_assert(__builtin_memcmp(ks00fe00, ks00feff, 2) == 0); + static_assert(__builtin_memcmp(ks00feff, ks00fe00, 99) == 1); + static_assert(__builtin_memcmp(ks00fe00, ks00feff, 99) == -1); + + struct Bool3Tuple { bool bb[3]; }; + constexpr Bool3Tuple kb000100 = {{false, true, false}}; + static_assert(sizeof(bool) != 1u || __builtin_memcmp(ks00fe00, kb000100.bb, 1) == 0); // both-error {{constant}} \ + // both-note {{not supported}} + + constexpr char a = 'a'; + constexpr char b = 'a'; + static_assert(__builtin_memcmp(&a, &b, 1) == 0); + + extern struct Incomplete incomplete; + static_assert(__builtin_memcmp(&incomplete, "", 0u) == 0); + static_assert(__builtin_memcmp("", &incomplete, 0u) == 0); + static_assert(__builtin_memcmp(&incomplete, "", 1u) == 42); // both-error {{not an integral constant}} \ + // both-note {{not supported}} + static_assert(__builtin_memcmp("", &incomplete, 1u) == 42); // both-error {{not an integral constant}} \ + // both-note {{not supported}} + + static_assert(__builtin_memcmp(u8"abab\0banana", u8"abab\0banana", 100) == 0); // both-error {{not an integral constant}} \ + // both-note {{dereferenced one-past-the-end}} + + static_assert(__builtin_bcmp("abaa", "abba", 3) != 0); + static_assert(__builtin_bcmp("abaa", "abba", 2) == 0); + static_assert(__builtin_bcmp("a\203", "a", 2) != 0); + static_assert(__builtin_bcmp("a\203", "a\003", 2) != 0); + static_assert(__builtin_bcmp(0, 0, 0) == 0); + static_assert(__builtin_bcmp("abab\0banana", "abab\0banana", 100) == 0); // both-error {{not an 
integral constant}}\ + // both-note {{dereferenced one-past-the-end}} + static_assert(__builtin_bcmp("abab\0banana", "abab\0canada", 100) != 0); // FIXME: Should we reject this? + static_assert(__builtin_bcmp("abab\0banana", "abab\0canada", 7) != 0); + static_assert(__builtin_bcmp("abab\0banana", "abab\0canada", 6) != 0); + static_assert(__builtin_bcmp("abab\0banana", "abab\0canada", 5) == 0); +} diff --git a/clang/test/AST/ByteCode/complex.cpp b/clang/test/AST/ByteCode/complex.cpp index ee11c6214b70c..2c0111c53d3bf 100644 --- a/clang/test/AST/ByteCode/complex.cpp +++ b/clang/test/AST/ByteCode/complex.cpp @@ -146,11 +146,6 @@ constexpr _Complex int I3 = {15}; static_assert(__real(I3) == 15, ""); static_assert(__imag(I3) == 0, ""); -constexpr _Complex _BitInt(8) A = {4}; -static_assert(__real(A) == 4, ""); -static_assert(__imag(A) == 0, ""); - - constexpr _Complex double Doubles[4] = {{1.0, 2.0}}; static_assert(__real(Doubles[0]) == 1.0, ""); static_assert(__imag(Doubles[0]) == 2.0, ""); @@ -163,9 +158,6 @@ static_assert(__imag(Doubles[3]) == 0.0, ""); static_assert(~(0.5 + 1.5j) == (0.5 + -1.5j), ""); -static_assert(__extension__ __imag(A) == 0, ""); -static_assert(__imag(__extension__ A) == 0, ""); - void func(void) { __complex__ int arr; _Complex int result; diff --git a/clang/test/AST/ast-print-openacc-data-construct.cpp b/clang/test/AST/ast-print-openacc-data-construct.cpp new file mode 100644 index 0000000000000..3e828b594bc9c --- /dev/null +++ b/clang/test/AST/ast-print-openacc-data-construct.cpp @@ -0,0 +1,82 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-deprecated-clause-alias -Wno-source-uses-openacc -ast-print %s -o - | FileCheck %s + +void foo() { + int Var; + // TODO OpenACC: These are only legal if they have one of a list of clauses on + // them, so the 'check' lines should start to include those once we implement + // them. For now, they don't emit those because they are 'not implemented'. 
+ +// CHECK: #pragma acc data default(none) +#pragma acc data default(none) + ; + +// CHECK: #pragma acc data device_type(int) +#pragma acc data device_type(int) + ; + +// CHECK: #pragma acc enter data +// CHECK-NOT: copyin(Var) +#pragma acc enter data copyin(Var) + ; +// CHECK: #pragma acc exit data +// CHECK-NOT: copyout(Var) +#pragma acc exit data copyout(Var) + ; +// CHECK: #pragma acc host_data +// CHECK-NOT: use_device(Var) +#pragma acc host_data use_device(Var) + ; + + int i; + int *iPtr; + int array[5]; + +// CHECK: #pragma acc data default(none) if(i == array[1]) +#pragma acc data default(none) if(i == array[1]) + ; +// CHECK: #pragma acc enter data if(i == array[1]) +#pragma acc enter data copyin(Var) if(i == array[1]) + ; +// CHECK: #pragma acc exit data if(i == array[1]) +#pragma acc exit data copyout(Var) if(i == array[1]) + ; +// CHECK: #pragma acc host_data if(i == array[1]) +#pragma acc host_data use_device(Var) if(i == array[1]) + ; + +// CHECK: #pragma acc data default(none) async(i) +#pragma acc data default(none) async(i) + ; +// CHECK: #pragma acc enter data async(i) +#pragma acc enter data copyin(i) async(i) +// CHECK: #pragma acc exit data async +#pragma acc exit data copyout(i) async + +// CHECK: #pragma acc data default(none) wait +#pragma acc data default(none) wait() + ; + +// CHECK: #pragma acc enter data wait() +#pragma acc enter data copyin(Var) wait() + +// CHECK: #pragma acc exit data wait(*iPtr, i) +#pragma acc exit data copyout(Var) wait(*iPtr, i) + +// CHECK: #pragma acc data default(none) wait(queues: *iPtr, i) +#pragma acc data default(none) wait(queues:*iPtr, i) + ; + +// CHECK: #pragma acc enter data wait(devnum: i : *iPtr, i) +#pragma acc enter data copyin(Var) wait(devnum:i:*iPtr, i) + +// CHECK: #pragma acc exit data wait(devnum: i : queues: *iPtr, i) +#pragma acc exit data copyout(Var) wait(devnum:i:queues:*iPtr, i) + +// CHECK: #pragma acc data default(none) +#pragma acc data default(none) + ; + +// CHECK: #pragma acc 
data default(present) +#pragma acc data default(present) + ; +} diff --git a/clang/test/Analysis/Checkers/WebKit/call-args.cpp b/clang/test/Analysis/Checkers/WebKit/call-args.cpp index 94efddeaf66cd..9920690746daf 100644 --- a/clang/test/Analysis/Checkers/WebKit/call-args.cpp +++ b/clang/test/Analysis/Checkers/WebKit/call-args.cpp @@ -364,4 +364,15 @@ namespace call_with_explicit_temporary_obj { Ref { *provide() }->method(); RefPtr { provide() }->method(); } + template + void bar() { + Ref(*provide())->method(); + RefPtr(provide())->method(); + } + void baz() { + bar(); + } +} + +namespace call_with_explicit_construct { } diff --git a/clang/test/CodeGenCXX/ext-int.cpp b/clang/test/CodeGenCXX/ext-int.cpp index 97b5d6ce16b88..f470398ec2095 100644 --- a/clang/test/CodeGenCXX/ext-int.cpp +++ b/clang/test/CodeGenCXX/ext-int.cpp @@ -549,24 +549,6 @@ void Shift(_BitInt(28) Ext, _BitInt(65) LargeExt, int i) { // CHECK: ashr i65 {{.+}}, %[[PROMO]] } -void ComplexTest(_Complex _BitInt(12) first, _Complex _BitInt(33) second) { - // LIN: define{{.*}} void @_Z11ComplexTestCDB12_CDB33_ - // WIN: define dso_local void @"?ComplexTest@@YAXU?$_Complex@U?$_BitInt@$0M@@__clang@@@__clang@@U?$_Complex@U?$_BitInt@$0CB@@__clang@@@2@@Z" - first + second; - // CHECK: %[[FIRST_REALP:.+]] = getelementptr inbounds nuw { i12, i12 }, ptr %{{.+}}, i32 0, i32 0 - // CHECK: %[[FIRST_REAL:.+]] = load i12, ptr %[[FIRST_REALP]] - // CHECK: %[[FIRST_IMAGP:.+]] = getelementptr inbounds nuw { i12, i12 }, ptr %{{.+}}, i32 0, i32 1 - // CHECK: %[[FIRST_IMAG:.+]] = load i12, ptr %[[FIRST_IMAGP]] - // CHECK: %[[FIRST_REAL_CONV:.+]] = sext i12 %[[FIRST_REAL]] - // CHECK: %[[FIRST_IMAG_CONV:.+]] = sext i12 %[[FIRST_IMAG]] - // CHECK: %[[SECOND_REALP:.+]] = getelementptr inbounds nuw { i33, i33 }, ptr %{{.+}}, i32 0, i32 0 - // CHECK: %[[SECOND_REAL:.+]] = load i33, ptr %[[SECOND_REALP]] - // CHECK: %[[SECOND_IMAGP:.+]] = getelementptr inbounds nuw { i33, i33 }, ptr %{{.+}}, i32 0, i32 1 - // CHECK: 
%[[SECOND_IMAG:.+]] = load i33, ptr %[[SECOND_IMAGP]] - // CHECK: %[[REAL:.+]] = add i33 %[[FIRST_REAL_CONV]], %[[SECOND_REAL]] - // CHECK: %[[IMAG:.+]] = add i33 %[[FIRST_IMAG_CONV]], %[[SECOND_IMAG]] -} - typedef _BitInt(64) vint64_t16 __attribute__((vector_size(16))); void VectorTest(vint64_t16 first, vint64_t16 second) { // LIN: define{{.*}} void @_Z10VectorTestDv2_DB64_S0_(<2 x i64> %{{.+}}, <2 x i64> %{{.+}}) diff --git a/clang/test/Driver/config-file3.c b/clang/test/Driver/config-file3.c index a0b8062c60ce5..395c31ce04b6b 100644 --- a/clang/test/Driver/config-file3.c +++ b/clang/test/Driver/config-file3.c @@ -226,3 +226,26 @@ // // RUN: HOME=%S/Inputs/config %clang -### --config-user-dir=~ -v 2>&1 | FileCheck %s --check-prefix=CHECK-TILDE // CHECK-TILDE: User configuration file directory: {{.*}}/Inputs/config + +//--- Fallback to stripping OS versions +// +// RUN: touch %t/testdmode/x86_64-apple-darwin23.6.0-clang.cfg +// RUN: touch %t/testdmode/x86_64-apple-darwin23-clang.cfg +// RUN: touch %t/testdmode/x86_64-apple-darwin-clang.cfg +// RUN: %clang -target x86_64-apple-darwin23.6.0 --config-system-dir=%t/testdmode --config-user-dir= -no-canonical-prefixes --version 2>&1 | FileCheck %s -check-prefix DARWIN --implicit-check-not 'Configuration file:' +// +// DARWIN: Configuration file: {{.*}}/testdmode/x86_64-apple-darwin23.6.0-clang.cfg + +//--- DARWIN + no full version +// +// RUN: rm %t/testdmode/x86_64-apple-darwin23.6.0-clang.cfg +// RUN: %clang -target x86_64-apple-darwin23.6.0 --config-system-dir=%t/testdmode --config-user-dir= -no-canonical-prefixes --version 2>&1 | FileCheck %s -check-prefix DARWIN-MAJOR --implicit-check-not 'Configuration file:' +// +// DARWIN-MAJOR: Configuration file: {{.*}}/testdmode/x86_64-apple-darwin23-clang.cfg + +//--- DARWIN + no version +// +// RUN: rm %t/testdmode/x86_64-apple-darwin23-clang.cfg +// RUN: %clang -target x86_64-apple-darwin23.6.0 --config-system-dir=%t/testdmode --config-user-dir= -no-canonical-prefixes 
--version 2>&1 | FileCheck %s -check-prefix DARWIN-VERSIONLESS --implicit-check-not 'Configuration file:' +// +// DARWIN-VERSIONLESS: Configuration file: {{.*}}/testdmode/x86_64-apple-darwin-clang.cfg diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 9df903115b57c..c268f75a0fd8d 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -189,6 +189,7 @@ // CHECK-NEXT: ssctr 1.0 'Ssctr' (Control Transfer Records Supervisor Level) // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses) // CHECK-NEXT: xqcia 0.2 'Xqcia' (Qualcomm uC Arithmetic Extension) +// CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) // CHECK-NEXT: xqcicsr 0.2 'Xqcicsr' (Qualcomm uC CSR Extension) // CHECK-NEXT: xqcisls 0.2 'Xqcisls' (Qualcomm uC Scaled Load Store Extension) // CHECK-EMPTY: diff --git a/clang/test/Driver/stack-clash-protection.c b/clang/test/Driver/stack-clash-protection.c index 222452f7897a6..3b0476db9d3cb 100644 --- a/clang/test/Driver/stack-clash-protection.c +++ b/clang/test/Driver/stack-clash-protection.c @@ -22,6 +22,11 @@ // SCP-ll-win64-NOT: attributes {{.*}} "probe-stack"="inline-asm" // SCP-ll-win64: argument unused during compilation: '-fstack-clash-protection' +// RUN: %clang -target x86_64-unknown-fuchsia -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-FUCHSIA +// RUN: %clang -target aarch64-unknown-fuchsia -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-FUCHSIA +// RUN: %clang -target riscv64-unknown-fuchsia -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-FUCHSIA +// SCP-FUCHSIA: "-fstack-clash-protection" + int foo(int c) { int r; __asm__("sub %0, %%rsp" diff --git a/clang/test/Driver/sysroot.c b/clang/test/Driver/sysroot.c index 85da2499090af..3080f76e03168 100644 --- 
a/clang/test/Driver/sysroot.c +++ b/clang/test/Driver/sysroot.c @@ -4,10 +4,9 @@ // CHECK-SYSROOTEQ: "-cc1"{{.*}} "-isysroot" "{{[^"]*}}/FOO" // Apple Darwin uses -isysroot as the syslib root, too. -// We pass --sysroot="" to defeat any -DDEFAULT_SYSROOT parameter. // RUN: touch %t2.o // RUN: %clang -target i386-apple-darwin10 \ -// RUN: -isysroot /FOO --sysroot="" -### %t2.o 2> %t2 +// RUN: -isysroot /FOO -### %t2.o 2> %t2 // RUN: FileCheck --check-prefix=CHECK-APPLE-ISYSROOT < %t2 %s // CHECK-APPLE-ISYSROOT: "-arch" "i386"{{.*}} "-syslibroot" "{{[^"]*}}/FOO" diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index 3741ed099cf5c..558e61d666fd2 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -4,37 +4,29 @@ void func() { - // expected-warning@+2{{OpenACC clause 'finalize' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented, pragma ignored}} + // expected-warning@+1{{OpenACC clause 'finalize' not yet implemented, clause ignored}} #pragma acc enter data finalize - // expected-warning@+3{{OpenACC clause 'finalize' not yet implemented, clause ignored}} // expected-warning@+2{{OpenACC clause 'finalize' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented, pragma ignored}} + // expected-warning@+1{{OpenACC clause 'finalize' not yet implemented, clause ignored}} #pragma acc enter data finalize finalize - // expected-warning@+3{{OpenACC clause 'finalize' not yet implemented, clause ignored}} - // expected-error@+2{{invalid OpenACC clause 'invalid'}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented, pragma ignored}} + // expected-warning@+2{{OpenACC clause 'finalize' not yet implemented, clause ignored}} + // expected-error@+1{{invalid OpenACC clause 'invalid'}} #pragma acc enter data finalize invalid - // 
expected-warning@+3{{OpenACC clause 'finalize' not yet implemented, clause ignored}} - // expected-error@+2{{invalid OpenACC clause 'invalid'}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented, pragma ignored}} + // expected-warning@+2{{OpenACC clause 'finalize' not yet implemented, clause ignored}} + // expected-error@+1{{invalid OpenACC clause 'invalid'}} #pragma acc enter data finalize invalid invalid finalize - // expected-warning@+3{{OpenACC clause 'wait' not yet implemented, clause ignored}} - // expected-warning@+2{{OpenACC clause 'finalize' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented, pragma ignored}} + // expected-warning@+1{{OpenACC clause 'finalize' not yet implemented, clause ignored}} #pragma acc enter data wait finalize - // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC construct 'host_data' not yet implemented, pragma ignored}} + // expected-warning@+1{{OpenACC clause 'if_present' not yet implemented, clause ignored}} #pragma acc host_data if_present - // expected-warning@+3{{OpenACC clause 'if_present' not yet implemented, clause ignored}} // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC construct 'host_data' not yet implemented, pragma ignored}} + // expected-warning@+1{{OpenACC clause 'if_present' not yet implemented, clause ignored}} #pragma acc host_data if_present, if_present // expected-error@+4{{OpenACC clause 'independent' on 'loop' construct conflicts with previous data dependence clause}} @@ -528,29 +520,21 @@ void VarListClauses() { #pragma acc serial firstprivate(s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-warning@+4{{OpenACC construct 'exit data' not yet implemented}} - // expected-error@+3{{expected ','}} - // expected-warning@+2{{OpenACC clause 'delete' 
not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}} + // expected-error@+2{{expected ','}} + // expected-warning@+1{{OpenACC clause 'delete' not yet implemented, clause ignored}} #pragma acc exit data delete(s.array[s.value] s.array[s.value :5] ) async for(int i = 0; i < 5;++i) {} - // expected-warning@+3{{OpenACC construct 'exit data' not yet implemented}} - // expected-warning@+2{{OpenACC clause 'delete' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}} + // expected-warning@+1{{OpenACC clause 'delete' not yet implemented, clause ignored}} #pragma acc exit data delete(s.array[s.value : 5], s.value),async for(int i = 0; i < 5;++i) {} - // expected-warning@+4{{OpenACC construct 'exit data' not yet implemented}} - // expected-error@+3{{expected ','}} - // expected-warning@+2{{OpenACC clause 'use_device' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}} + // expected-error@+2{{expected ','}} + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented, clause ignored}} #pragma acc exit data use_device(s.array[s.value] s.array[s.value :5] ),async for(int i = 0; i < 5;++i) {} - // expected-warning@+3{{OpenACC construct 'exit data' not yet implemented}} - // expected-warning@+2{{OpenACC clause 'use_device' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}} + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented, clause ignored}} #pragma acc exit data use_device(s.array[s.value : 5], s.value), async for(int i = 0; i < 5;++i) {} diff --git a/clang/test/ParserOpenACC/parse-clauses.cpp b/clang/test/ParserOpenACC/parse-clauses.cpp index 4dc966ea9879f..1781a27940754 100644 --- a/clang/test/ParserOpenACC/parse-clauses.cpp +++ 
b/clang/test/ParserOpenACC/parse-clauses.cpp @@ -35,7 +35,6 @@ void templ() { #pragma acc parallel async for(;;){} - // expected-warning@+2{{OpenACC construct 'exit data' not yet implemented}} // expected-warning@+1{{OpenACC clause 'delete' not yet implemented, clause ignored}} #pragma acc exit data delete(I) ; diff --git a/clang/test/ParserOpenACC/parse-constructs.c b/clang/test/ParserOpenACC/parse-constructs.c index 27b9a6993fd3e..d3b1ccb48c034 100644 --- a/clang/test/ParserOpenACC/parse-constructs.c +++ b/clang/test/ParserOpenACC/parse-constructs.c @@ -54,16 +54,13 @@ void func() { // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc kernels clause list for(;;){} - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'data' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc data clause list for(;;){} - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc enter data clause list for(;;){} - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'exit data' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc exit data clause list for(;;){} // expected-error@+1{{invalid OpenACC directive 'enter invalid'}} @@ -78,8 +75,7 @@ void func() { // expected-error@+1{{expected identifier}} #pragma acc exit } for(;;){} - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'host_data' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc host_data clause list for(;;){} // expected-error@+1{{invalid OpenACC clause 'clause'}} diff --git a/clang/test/Sema/Inputs/lifetime-analysis.h 
b/clang/test/Sema/Inputs/lifetime-analysis.h index 5c151385b1fe5..f888e6ab94bb6 100644 --- a/clang/test/Sema/Inputs/lifetime-analysis.h +++ b/clang/test/Sema/Inputs/lifetime-analysis.h @@ -128,6 +128,11 @@ struct reference_wrapper { template reference_wrapper ref(T& t) noexcept; +template +struct [[gsl::Pointer]] iterator { + T& operator*() const; +}; + struct false_type { static constexpr bool value = false; constexpr operator bool() const noexcept { return value; } diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index fc876926ba2e6..45b4dc838f44e 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -604,8 +604,9 @@ struct [[gsl::Pointer]] Span { // Pointer from Owner std::string_view test5() { - std::string_view a = StatusOr().valueLB(); // expected-warning {{object backing the pointer will be dest}} - return StatusOr().valueLB(); // expected-warning {{returning address of local temporary}} + // The Owner doesn't own the object which its inner pointer points to. + std::string_view a = StatusOr().valueLB(); // OK + return StatusOr().valueLB(); // OK // No dangling diagnostics on non-lifetimebound methods. std::string_view b = StatusOr().valueNoLB(); @@ -652,7 +653,7 @@ Span test10(StatusOr> aa) { // Pointer> from Owner> Span test11(StatusOr> aa) { - return aa.valueLB(); // expected-warning {{address of stack memory}} + return aa.valueLB(); // OK return aa.valueNoLB(); // OK. 
} @@ -693,3 +694,86 @@ void test() { auto y = std::set{}.begin(); // expected-warning {{object backing the pointer}} } } // namespace GH118064 + +namespace LifetimeboundInterleave { + +const std::string& Ref(const std::string& abc [[clang::lifetimebound]]); + +std::string_view TakeSv(std::string_view abc [[clang::lifetimebound]]); +std::string_view TakeStrRef(const std::string& abc [[clang::lifetimebound]]); +std::string_view TakeStr(std::string abc [[clang::lifetimebound]]); + +std::string_view test1() { + std::string_view t1 = Ref(std::string()); // expected-warning {{object backing}} + t1 = Ref(std::string()); // expected-warning {{object backing}} + return Ref(std::string()); // expected-warning {{returning address}} + + std::string_view t2 = TakeSv(std::string()); // expected-warning {{object backing}} + t2 = TakeSv(std::string()); // expected-warning {{object backing}} + return TakeSv(std::string()); // expected-warning {{returning address}} + + std::string_view t3 = TakeStrRef(std::string()); // expected-warning {{temporary}} + t3 = TakeStrRef(std::string()); // expected-warning {{object backing}} + return TakeStrRef(std::string()); // expected-warning {{returning address}} + + + std::string_view t4 = TakeStr(std::string()); + t4 = TakeStr(std::string()); + return TakeStr(std::string()); +} + +template +struct Foo { + const T& get() const [[clang::lifetimebound]]; + const T& getNoLB() const; +}; +std::string_view test2(Foo r1, Foo r2) { + std::string_view t1 = Foo().get(); // expected-warning {{object backing}} + t1 = Foo().get(); // expected-warning {{object backing}} + return r1.get(); // expected-warning {{address of stack}} + + std::string_view t2 = Foo().get(); + t2 = Foo().get(); + return r2.get(); + + // no warning on no-LB-annotated method. 
+ std::string_view t3 = Foo().getNoLB(); + t3 = Foo().getNoLB(); + return r1.getNoLB(); +} + +struct Bar {}; +struct [[gsl::Pointer]] Pointer { + Pointer(const Bar & bar [[clang::lifetimebound]]); +}; +Pointer test3(Bar bar) { + Pointer p = Pointer(Bar()); // expected-warning {{temporary}} + p = Pointer(Bar()); // expected-warning {{object backing}} + return bar; // expected-warning {{address of stack}} +} + +template +struct MySpan { + MySpan(const std::vector& v); + using iterator = std::iterator; + iterator begin() const [[clang::lifetimebound]]; +}; +template +typename MySpan::iterator ReturnFirstIt(const MySpan& v [[clang::lifetimebound]]); + +void test4() { + std::vector v{1}; + // MySpan doesn't own any underlying T objects, the pointee object of + // the MySpan iterator is still alive when the whole span is destroyed, thus + // no diagnostic. + const int& t1 = *MySpan(v).begin(); + const int& t2 = *ReturnFirstIt(MySpan(v)); + // Ideally, we would diagnose the following case, but due to implementation + // constraints, we do not. 
+ const int& t4 = *MySpan(std::vector{}).begin(); + + auto it1 = MySpan(v).begin(); // expected-warning {{temporary whose address is use}} + auto it2 = ReturnFirstIt(MySpan(v)); // expected-warning {{temporary whose address is used}} +} + +} // namespace LifetimeboundInterleave diff --git a/clang/test/SemaCXX/ext-int.cpp b/clang/test/SemaCXX/ext-int.cpp index 000b871ccd343..d974221e774a7 100644 --- a/clang/test/SemaCXX/ext-int.cpp +++ b/clang/test/SemaCXX/ext-int.cpp @@ -101,8 +101,10 @@ typedef _BitInt(37) __attribute__((vector_size(16))) VecTy4; // expected-error@+1{{'_BitInt' vector element width must be a power of 2}} typedef _BitInt(37) __attribute__((ext_vector_type(32))) OtherVecTy4; -// Allow _Complex: +// expected-error@+1{{'_Complex _BitInt' is invalid}} _Complex _BitInt(3) Cmplx; +// expected-error@+1{{'_Complex _BitInt' is invalid}} +typedef _Complex _BitInt(3) Cmp; // Reject cases of _Atomic: // expected-error@+1{{_Atomic cannot be applied to integer type '_BitInt(4)'}} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp index c6c93a27e4b96..7dd6c83dbba2a 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp @@ -52,3 +52,43 @@ void constant_id_string(unsigned idx) { unsafe_char = ""[1]; //expected-warning{{unsafe buffer access}} unsafe_char = ""[idx]; //expected-warning{{unsafe buffer access}} } + +typedef float Float4x4[4][4]; + +// expected-warning@+1 {{'matrix' is an unsafe buffer that does not perform bounds checks}} +float two_dimension_array(Float4x4& matrix, unsigned idx) { + // expected-warning@+1{{unsafe buffer access}} + float a = matrix[0][4]; + + a = matrix[0][3]; + + // expected-note@+1{{used in buffer access here}} + a = matrix[4][0]; + + a = matrix[idx][0]; // expected-note{{used in buffer access here}} + + a = matrix[0][idx]; //expected-warning{{unsafe buffer access}} + + a = matrix[idx][idx]; 
//expected-warning{{unsafe buffer access}} // expected-note{{used in buffer access here}} + + return matrix[1][1]; +} + +typedef float Float2x3x4[2][3][4]; +float multi_dimension_array(Float2x3x4& matrix) { + float *f = matrix[0][2]; + return matrix[1][2][3]; +} + +char array_strings[][11] = { + "Apple", "Banana", "Cherry", "Date", "Elderberry" +}; + +char array_string[] = "123456"; + +char access_strings() { + char c = array_strings[0][4]; + c = array_strings[3][10]; + c = array_string[5]; + return c; +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-field-attr.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-field-attr.cpp index 0ba605475925b..1636c948da075 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-field-attr.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-field-attr.cpp @@ -96,7 +96,6 @@ void test_attribute_multiple_fields (D d) { int v = d.buf[0]; //expected-warning{{field 'buf' prone to unsafe buffer manipulation}} - //expected-warning@+1{{unsafe buffer access}} v = d.buf[5]; //expected-warning{{field 'buf' prone to unsafe buffer manipulation}} } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-parm-unsupported.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-parm-unsupported.cpp index 71350098613d1..0c80da63f8291 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-parm-unsupported.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-parm-unsupported.cpp @@ -118,7 +118,7 @@ void isArrayDecayToPointerUPC(int a[][10], int (*b)[10]) { // expected-warning@-2{{'b' is an unsafe pointer used for buffer access}} int tmp; - tmp = a[5][5] + b[5][5]; // expected-warning2{{unsafe buffer access}} expected-note2{{used in buffer access here}} + tmp = a[5][5] + b[5][5]; // expected-note2{{used in buffer access here}} } // parameter having default values: diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp index 642db0e9d3c63..41d38ada48788 
100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp @@ -109,7 +109,6 @@ void testQualifiedParameters(const int * p, const int * const q, const int a[10] q[1], 1[q], q[-1], // expected-note3{{used in buffer access here}} a[1], // expected-note{{used in buffer access here}} `a` is of pointer type b[1][2] // expected-note{{used in buffer access here}} `b[1]` is of array type - // expected-warning@-1{{unsafe buffer access}} ); } @@ -128,29 +127,41 @@ T_t funRetT(); T_t * funRetTStar(); void testStructMembers(struct T * sp, struct T s, T_t * sp2, T_t s2) { - foo(sp->a[1], // expected-warning{{unsafe buffer access}} + foo(sp->a[1], sp->b[1], // expected-warning{{unsafe buffer access}} - sp->c.a[1], // expected-warning{{unsafe buffer access}} + sp->c.a[1], sp->c.b[1], // expected-warning{{unsafe buffer access}} - s.a[1], // expected-warning{{unsafe buffer access}} + s.a[1], s.b[1], // expected-warning{{unsafe buffer access}} - s.c.a[1], // expected-warning{{unsafe buffer access}} + s.c.a[1], s.c.b[1], // expected-warning{{unsafe buffer access}} - sp2->a[1], // expected-warning{{unsafe buffer access}} + sp2->a[1], sp2->b[1], // expected-warning{{unsafe buffer access}} - sp2->c.a[1], // expected-warning{{unsafe buffer access}} + sp2->c.a[1], sp2->c.b[1], // expected-warning{{unsafe buffer access}} - s2.a[1], // expected-warning{{unsafe buffer access}} + s2.a[1], s2.b[1], // expected-warning{{unsafe buffer access}} - s2.c.a[1], // expected-warning{{unsafe buffer access}} + s2.c.a[1], s2.c.b[1], // expected-warning{{unsafe buffer access}} - funRetT().a[1], // expected-warning{{unsafe buffer access}} + funRetT().a[1], funRetT().b[1], // expected-warning{{unsafe buffer access}} - funRetTStar()->a[1], // expected-warning{{unsafe buffer access}} + funRetTStar()->a[1], funRetTStar()->b[1] // expected-warning{{unsafe buffer access}} ); } +union Foo { + bool b; + int arr[10]; +}; + +int testUnionMembers(Foo f) { + int a 
= f.arr[0]; + a = f.arr[5]; + a = f.arr[10]; // expected-warning{{unsafe buffer access}} + return a; +} + int garray[10]; // expected-warning{{'garray' is an unsafe buffer that does not perform bounds checks}} int * gp = garray; // expected-warning{{'gp' is an unsafe pointer used for buffer access}} int gvar = gp[1]; // FIXME: file scope unsafe buffer access is not warned @@ -213,7 +224,6 @@ void testTypedefs(T_ptr_t p) { // expected-warning@-1{{'p' is an unsafe pointer used for buffer access}} foo(p[1], // expected-note{{used in buffer access here}} p[1].a[1], // expected-note{{used in buffer access here}} - // expected-warning@-1{{unsafe buffer access}} p[1].b[1] // expected-note{{used in buffer access here}} // expected-warning@-1{{unsafe buffer access}} ); @@ -223,10 +233,9 @@ template T f(T t, T * pt, T a[N], T (&b)[N]) { // expected-warning@-1{{'t' is an unsafe pointer used for buffer access}} // expected-warning@-2{{'pt' is an unsafe pointer used for buffer access}} // expected-warning@-3{{'a' is an unsafe pointer used for buffer access}} - // expected-warning@-4{{'b' is an unsafe buffer that does not perform bounds checks}} foo(pt[1], // expected-note{{used in buffer access here}} a[1], // expected-note{{used in buffer access here}} - b[1]); // expected-note{{used in buffer access here}} + b[1]); return &t[1]; // expected-note{{used in buffer access here}} } @@ -376,7 +385,7 @@ int testArrayAccesses(int n, int idx) { typedef int A[3]; const A tArr = {4, 5, 6}; foo(tArr[0], tArr[1]); - return cArr[0][1]; // expected-warning{{unsafe buffer access}} + return cArr[0][1]; } void testArrayPtrArithmetic(int x[]) { // expected-warning{{'x' is an unsafe pointer used for buffer access}} diff --git a/clang/test/SemaOpenACC/combined-construct-collapse-clause.cpp b/clang/test/SemaOpenACC/combined-construct-collapse-clause.cpp index c7db9669a9879..31078ea7a0de9 100644 --- a/clang/test/SemaOpenACC/combined-construct-collapse-clause.cpp +++ 
b/clang/test/SemaOpenACC/combined-construct-collapse-clause.cpp @@ -214,14 +214,15 @@ void no_other_directives() { #pragma acc serial loop collapse(2) for(unsigned i = 0; i < 5; ++i) { for(unsigned j = 0; j < 5; ++j) { -#pragma acc data // expected-warning{{OpenACC construct 'data' not yet implemented}} +#pragma acc data + ; } } // expected-note@+1{{active 'collapse' clause defined here}} #pragma acc kernels loop collapse(2) for(unsigned i = 0; i < 5; ++i) { // expected-error@+1{{OpenACC 'data' construct cannot appear in intervening code of a 'kernels loop' with a 'collapse' clause}} -#pragma acc data // expected-warning{{OpenACC construct 'data' not yet implemented}} +#pragma acc data for(unsigned j = 0; j < 5; ++j) { } } diff --git a/clang/test/SemaOpenACC/combined-construct-default-ast.cpp b/clang/test/SemaOpenACC/combined-construct-default-ast.cpp index 2ff24b32afe7b..8f09e74907318 100644 --- a/clang/test/SemaOpenACC/combined-construct-default-ast.cpp +++ b/clang/test/SemaOpenACC/combined-construct-default-ast.cpp @@ -1,4 +1,3 @@ - // RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s // Test this with PCH. 
diff --git a/clang/test/SemaOpenACC/combined-construct-default-clause.c b/clang/test/SemaOpenACC/combined-construct-default-clause.c index a9c90240cb122..43c2883f13184 100644 --- a/clang/test/SemaOpenACC/combined-construct-default-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-default-clause.c @@ -28,8 +28,6 @@ void SingleOnly() { #pragma acc kernels loop default(none) for(int i = 0; i < 5; ++i); - // expected-warning@+2{{OpenACC construct 'data' not yet implemented}} - // expected-warning@+1{{OpenACC clause 'default' not yet implemented}} #pragma acc data default(none) while(0); diff --git a/clang/test/SemaOpenACC/combined-construct-if-clause.c b/clang/test/SemaOpenACC/combined-construct-if-clause.c index 563f1cd25377b..09bd4dd190b6b 100644 --- a/clang/test/SemaOpenACC/combined-construct-if-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-if-clause.c @@ -43,8 +43,6 @@ void BoolExpr(int *I, float *F) { #pragma acc kernels loop if (*I < *F) for (unsigned i = 0; i < 5; ++i); - // expected-warning@+2{{OpenACC construct 'data' not yet implemented}} - // expected-warning@+1{{OpenACC clause 'if' not yet implemented}} #pragma acc data if (*I < *F) for (unsigned i = 0; i < 5; ++i); #pragma acc parallel loop if (*I < *F) diff --git a/clang/test/SemaOpenACC/compute-construct-default-clause.c b/clang/test/SemaOpenACC/compute-construct-default-clause.c index 70e29f3e8ac05..dfa5cd3f1c0d3 100644 --- a/clang/test/SemaOpenACC/compute-construct-default-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-default-clause.c @@ -28,8 +28,6 @@ void SingleOnly() { #pragma acc kernels default(none) for(int i = 0; i < 5; ++i); - // expected-warning@+2{{OpenACC construct 'data' not yet implemented}} - // expected-warning@+1{{OpenACC clause 'default' not yet implemented}} #pragma acc data default(none) while(0); diff --git a/clang/test/SemaOpenACC/compute-construct-device_type-clause.c b/clang/test/SemaOpenACC/compute-construct-device_type-clause.c index 
0ae972d2a99ff..2f4a037529b50 100644 --- a/clang/test/SemaOpenACC/compute-construct-device_type-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-device_type-clause.c @@ -34,11 +34,9 @@ void uses() { #pragma acc kernels dtype(MACRO) while(1); - // expected-error@+2{{OpenACC 'device_type' clause is not valid on 'enter data' directive}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented}} + // expected-error@+1{{OpenACC 'device_type' clause is not valid on 'enter data' directive}} #pragma acc enter data device_type(I) - // expected-error@+2{{OpenACC 'dtype' clause is not valid on 'enter data' directive}} - // expected-warning@+1{{OpenACC construct 'enter data' not yet implemented}} + // expected-error@+1{{OpenACC 'dtype' clause is not valid on 'enter data' directive}} #pragma acc enter data dtype(I) diff --git a/clang/test/SemaOpenACC/compute-construct-if-clause.c b/clang/test/SemaOpenACC/compute-construct-if-clause.c index 7cdc35275acce..20d42a17cba14 100644 --- a/clang/test/SemaOpenACC/compute-construct-if-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-if-clause.c @@ -43,8 +43,6 @@ void BoolExpr(int *I, float *F) { #pragma acc kernels if (*I < *F) while(0); - // expected-warning@+2{{OpenACC construct 'data' not yet implemented}} - // expected-warning@+1{{OpenACC clause 'if' not yet implemented}} #pragma acc data if (*I < *F) while(0); #pragma acc parallel loop if (*I < *F) diff --git a/clang/test/SemaOpenACC/data-construct-ast.cpp b/clang/test/SemaOpenACC/data-construct-ast.cpp new file mode 100644 index 0000000000000..60ed295fcd42e --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-ast.cpp @@ -0,0 +1,91 @@ +// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. 
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s + +#ifndef PCH_HELPER +#define PCH_HELPER + +void NormalFunc() { + // CHECK-LABEL: NormalFunc + // CHECK-NEXT: CompoundStmt + + int Var; + // CHECK-NEXT: DeclStmt + // CHECK-NEXT: VarDecl + + // TODO OpenACC: these constructs require the clauses to be legal, but we + // don't have the clauses implemented yet. As we implement them, they needed + // to be added to the 'check' lines. + +#pragma acc data default(none) + while (Var); + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: default(none) + // CHECK-NEXT: WhileStmt + // CHECK: NullStmt +#pragma acc enter data copyin(Var) + // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data +#pragma acc exit data copyout(Var) + // CHECK-NEXT: OpenACCExitDataConstruct{{.*}} exit data +#pragma acc host_data use_device(Var) + while (Var); + // CHECK-NEXT: OpenACCHostDataConstruct{{.*}} host_data + // CHECK-NEXT: WhileStmt + // CHECK: NullStmt +} + +template +void TemplFunc() { + // CHECK-LABEL: FunctionTemplateDecl {{.*}}TemplFunc + // CHECK-NEXT: TemplateTypeParmDecl + // CHECK-NEXT: FunctionDecl{{.*}}TemplFunc + // CHECK-NEXT: CompoundStmt + + T Var; + // CHECK-NEXT: DeclStmt + // CHECK-NEXT: VarDecl + +#pragma acc data default(none) + while (Var); + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: default(none) + // CHECK-NEXT: WhileStmt + // CHECK: NullStmt +#pragma acc enter data copyin(Var) + // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data +#pragma acc exit data copyout(Var) + // CHECK-NEXT: OpenACCExitDataConstruct{{.*}} exit data +#pragma acc host_data use_device(Var) + while (Var); + // CHECK-NEXT: OpenACCHostDataConstruct{{.*}} host_data + // CHECK-NEXT: WhileStmt + // CHECK: NullStmt + + // Instantiation: + // CHECK-NEXT: FunctionDecl{{.*}} TemplFunc 'void ()' implicit_instantiation + // CHECK-NEXT: TemplateArgument type 'int' + // CHECK-NEXT: 
BuiltinType{{.*}} 'int' + // CHECK-NEXT: CompoundStmt + + // CHECK-NEXT: DeclStmt + // CHECK-NEXT: VarDecl + + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: default(none) + // CHECK-NEXT: WhileStmt + // CHECK: NullStmt + + // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data + + // CHECK-NEXT: OpenACCExitDataConstruct{{.*}} exit data + + // CHECK-NEXT: OpenACCHostDataConstruct{{.*}} host_data + // CHECK-NEXT: WhileStmt + // CHECK: NullStmt +} +void use() { + TemplFunc(); +} +#endif diff --git a/clang/test/SemaOpenACC/data-construct-async-ast.cpp b/clang/test/SemaOpenACC/data-construct-async-ast.cpp new file mode 100644 index 0000000000000..d16cc6f480797 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-async-ast.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. +// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s +#ifndef PCH_HELPER +#define PCH_HELPER + +int some_int(); + +template +void TemplUses() { + // CHECK: FunctionTemplateDecl{{.*}}TemplUses + // CHECK-NEXT: TemplateTypeParmDecl{{.*}}T + // CHECK-NEXT: FunctionDecl{{.*}}TemplUses + // CHECK-NEXT: CompoundStmt + +#pragma acc data async(some_int()) + ; + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: async clause + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' + // CHECK-NEXT: NullStmt +#pragma acc enter data async(T{}) + // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: async clause + // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}} 'T' 'T' list + // CHECK-NEXT: InitListExpr{{.*}}'void' +#pragma acc exit data async + // CHECK-NEXT: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: async clause + + // Instantiations + // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void ()' implicit_instantiation + // CHECK-NEXT: TemplateArgument 
type 'int' + // CHECK-NEXT: BuiltinType{{.*}} 'int' + // CHECK-NEXT: CompoundStmt + + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: async clause + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: async clause + // CHECK-NEXT: CXXFunctionalCastExpr + // CHECK-NEXT: InitListExpr{{.*}}'int' + + // CHECK-NEXT: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: async clause +} +void Inst() { + TemplUses(); +} + + +#endif // PCH_HELPER diff --git a/clang/test/SemaOpenACC/data-construct-async-clause.c b/clang/test/SemaOpenACC/data-construct-async-clause.c new file mode 100644 index 0000000000000..053cc976939b5 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-async-clause.c @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +void Test() { + int I; + struct NotConvertible{} NC; + // No special rules for this clause on the data constructs, so not much to + // test that isn't covered by combined/compute. 
+ // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc data copyin(I) async(I) + ; + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc enter data copyin(I) async(I) + // expected-warning@+1{{OpenACC clause 'copyout' not yet implemented}} +#pragma acc exit data copyout(I) async(I) + // expected-warning@+2{{OpenACC clause 'use_device' not yet implemented}} + // expected-error@+1{{OpenACC 'async' clause is not valid on 'host_data' directive}} +#pragma acc host_data use_device(I) async(I) + ; + + // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented}} + // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc data copyin(NC) async(NC) + ; + // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented}} + // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc enter data copyin(NC) async(NC) + // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented}} + // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc exit data copyout(NC) async(NC) + // expected-warning@+2{{OpenACC clause 'use_device' not yet implemented}} + // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc host_data use_device(NC) async(NC) + ; + + // expected-warning@+3{{OpenACC clause 'copyin' not yet implemented}} + // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'data' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc data copyin(I) async(I) async(I) + ; + // expected-warning@+3{{OpenACC clause 'copyin' not yet implemented}} + // expected-error@+2{{expected ')'}} + // expected-note@+1{{to match this '('}} +#pragma acc enter data copyin(I) async(I, I) +} diff --git 
a/clang/test/SemaOpenACC/data-construct-default-ast.cpp b/clang/test/SemaOpenACC/data-construct-default-ast.cpp new file mode 100644 index 0000000000000..ef9b1348c6709 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-default-ast.cpp @@ -0,0 +1,68 @@ +// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. +// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s + +#ifndef PCH_HELPER +#define PCH_HELPER +void NormalFunc() { + // CHECK-LABEL: NormalFunc + // CHECK-NEXT: CompoundStmt + // CHECK-NEXT: OpenACCDataConstruct {{.*}}data + // CHECK-NEXT: default(none) +#pragma acc data default(none) + // CHECK: OpenACCDataConstruct {{.*}}data + // CHECK-NEXT: default(present) +#pragma acc data default(present) + ; +} +template +void TemplFunc() { +#pragma acc data default(none) + for (unsigned i = 0; i < 5; ++i) { + typename T::type I; + } + +#pragma acc data default(present) + for (unsigned i = 0; i < 5; ++i) { + typename T::type I; + } + + // CHECK-LABEL: FunctionTemplateDecl {{.*}}TemplFunc + // CHECK-NEXT: TemplateTypeParmDecl + + // Template Pattern: + // CHECK-NEXT: FunctionDecl + // CHECK-NEXT: CompoundStmt + // CHECK-NEXT: OpenACCDataConstruct {{.*}}data + // CHECK-NEXT: default(none) + // CHECK: VarDecl{{.*}} I 'typename T::type' + + // CHECK-NEXT: OpenACCDataConstruct {{.*}}data + // CHECK-NEXT: default(present) + // CHECK: VarDecl{{.*}} I 'typename T::type' + + // Check instantiation. 
+ // CHECK-LABEL: FunctionDecl{{.*}} used TemplFunc 'void ()' implicit_instantiation + // CHECK-NEXT: TemplateArgument type 'S' + // CHECK-NEXT: RecordType + // CHECK-NEXT: CXXRecord + // CHECK-NEXT: CompoundStmt + // CHECK-NEXT: OpenACCDataConstruct {{.*}}data + // CHECK-NEXT: default(none) + // CHECK: VarDecl{{.*}} I 'typename S::type':'int' + // CHECK-NEXT: OpenACCDataConstruct {{.*}}data + // CHECK-NEXT: default(present) + // CHECK: VarDecl{{.*}} I 'typename S::type':'int' + +} +struct S { + using type = int; +}; + +void use() { + TemplFunc(); +} + +#endif diff --git a/clang/test/SemaOpenACC/data-construct-default-clause.c b/clang/test/SemaOpenACC/data-construct-default-clause.c new file mode 100644 index 0000000000000..e09004d7404c0 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-default-clause.c @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +void use() { + // expected-error@+1{{invalid value for 'default' clause; expected 'present' or 'none'}} +#pragma acc data default(garbage) + ; +#pragma acc data default(present) + ; +#pragma acc data default(none) + ; + // expected-error@+2{{OpenACC 'default' clause cannot appear more than once on a 'data' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc data default(none) default(present) + ; + // expected-error@+1{{OpenACC 'default' clause is not valid on 'enter data' directive}} +#pragma acc enter data default(present) + ; + // expected-error@+1{{OpenACC 'default' clause is not valid on 'exit data' directive}} +#pragma acc exit data default(none) + ; + // expected-error@+1{{OpenACC 'default' clause is not valid on 'host_data' directive}} +#pragma acc host_data default(present) + ; +} diff --git a/clang/test/SemaOpenACC/data-construct-device_type-ast.cpp b/clang/test/SemaOpenACC/data-construct-device_type-ast.cpp new file mode 100644 index 0000000000000..23f00490f0674 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-device_type-ast.cpp @@ -0,0 +1,39 @@ +// RUN: 
%clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. +// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s +#ifndef PCH_HELPER +#define PCH_HELPER + +template +void TemplUses() { + // CHECK: FunctionTemplateDecl{{.*}}TemplUses + // CHECK-NEXT: TemplateTypeParmDecl{{.*}}T + // CHECK-NEXT: FunctionDecl{{.*}}TemplUses + // CHECK-NEXT: CompoundStmt + +#pragma acc data device_type(T) dtype(T) + ; + // CHECK-NEXT: OpenACCDataConstruct{{.*}} data + // CHECK-NEXT: device_type(T) + // CHECK-NEXT: dtype(T) + // CHECK-NEXT: NullStmt + + // Instantiations + // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void ()' implicit_instantiation + // CHECK-NEXT: TemplateArgument type 'int' + // CHECK-NEXT: BuiltinType{{.*}} 'int' + // CHECK-NEXT: CompoundStmt + + // Argument to 'device-type' is just an identifier, so we don't transform it. + // CHECK-NEXT: OpenACCDataConstruct{{.*}} data + // CHECK-NEXT: device_type(T) + // CHECK-NEXT: dtype(T) + // CHECK-NEXT: NullStmt +} +void Inst() { + TemplUses(); +} + +#endif // PCH_HELPER diff --git a/clang/test/SemaOpenACC/data-construct-device_type-clause.c b/clang/test/SemaOpenACC/data-construct-device_type-clause.c new file mode 100644 index 0000000000000..f675c2fa6a880 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-device_type-clause.c @@ -0,0 +1,54 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +void uses() { + int Var; +#pragma acc data device_type(foo) async + ; +#pragma acc data device_type(foo) wait + ; +#pragma acc data device_type(foo) dtype(false) + ; +#pragma acc data dtype(foo) device_type(false) + ; + + // expected-error@+2{{OpenACC clause 'if' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) if(1) + ; + // expected-error@+2{{OpenACC clause 'copy' may not follow a 'device_type' clause in a 'data' construct}} + // 
expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) copy(Var) + ; + // expected-error@+2{{OpenACC clause 'copyin' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) copyin(Var) + ; + // expected-error@+2{{OpenACC clause 'copyout' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) copyout(Var) + ; + // expected-error@+2{{OpenACC clause 'create' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) create(Var) + ; + // expected-error@+2{{OpenACC clause 'no_create' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) no_create(Var) + ; + // expected-error@+2{{OpenACC clause 'present' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) present(Var) + ; + // expected-error@+2{{OpenACC clause 'deviceptr' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) deviceptr(Var) + ; + // expected-error@+2{{OpenACC clause 'attach' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) attach(Var) + ; + // expected-error@+2{{OpenACC clause 'default' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(foo) default(none) + ; +} diff --git a/clang/test/SemaOpenACC/data-construct-if-ast.cpp b/clang/test/SemaOpenACC/data-construct-if-ast.cpp new file mode 100644 index 0000000000000..a522b30357df8 --- /dev/null +++ 
b/clang/test/SemaOpenACC/data-construct-if-ast.cpp @@ -0,0 +1,131 @@ +// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. +// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s + +#ifndef PCH_HELPER +#define PCH_HELPER +void NormalFunc(int j, float f) { + // CHECK: FunctionDecl{{.*}}NormalFunc + // CHECK-NEXT: ParmVarDecl + // CHECK-NEXT: ParmVarDecl + // CHECK-NEXT: CompoundStmt +#pragma acc data if( j < f) default(none) + ; + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: if clause + // CHECK-NEXT: BinaryOperator{{.*}} 'bool' '<' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' + // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'j' 'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'float' lvalue ParmVar{{.*}} 'f' 'float' + // CHECK-NEXT: default(none) + // CHECK-NEXT: NullStmt + +} + +int Global; + +template +void TemplFunc() { + // CHECK: FunctionTemplateDecl{{.*}}TemplFunc + // CHECK-NEXT: TemplateTypeParmDecl + + // Match the prototype: + // CHECK-NEXT: FunctionDecl{{.*}}TemplFunc + // CHECK-NEXT: CompoundStmt + +#pragma acc data default(none) if(T::SomeFloat < typename T::IntTy{}) + ; + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: default(none) + // CHECK-NEXT: if clause + // CHECK-NEXT: BinaryOperator{{.*}} '' '<' + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}} 'typename T::IntTy' 'typename T::IntTy' + // CHECK-NEXT: InitListExpr{{.*}} 'void' + // CHECK-NEXT: NullStmt + +#pragma acc enter data copyin(Global) if(typename T::IntTy{}) + ; + // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: if clause + // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}} 'typename T::IntTy' 'typename T::IntTy' + // 
CHECK-NEXT: InitListExpr{{.*}} 'void' + // CHECK-NEXT: NullStmt + +#pragma acc exit data copyout(Global) if(T::SomeFloat) + ; + // CHECK-NEXT: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: if clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: NullStmt + +#pragma acc host_data use_device(Global) if(T::BC) + ; + // CHECK-NEXT: OpenACCHostDataConstruct{{.*}}host_data + // CHECK-NEXT: if clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: NullStmt + + // Match the instantiation: + // CHECK: FunctionDecl{{.*}}TemplFunc{{.*}}implicit_instantiation + // CHECK-NEXT: TemplateArgument type 'InstTy' + // CHECK-NEXT: RecordType{{.*}} 'InstTy' + // CHECK-NEXT: CXXRecord{{.*}} 'InstTy' + // CHECK-NEXT: CompoundStmt + + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: default(none) + // CHECK-NEXT: if clause + // CHECK-NEXT: BinaryOperator{{.*}} 'bool' '<' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float' + // CHECK-NEXT: CXXFunctionalCastExpr{{.*}}'typename InstTy::IntTy':'int' functional cast to typename struct InstTy::IntTy + // CHECK-NEXT: InitListExpr {{.*}}'typename InstTy::IntTy':'int' + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: if clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: CXXFunctionalCastExpr{{.*}}'typename InstTy::IntTy':'int' functional cast to typename struct InstTy::IntTy + // CHECK-NEXT: InitListExpr {{.*}}'typename InstTy::IntTy':'int' + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: if clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // 
CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCHostDataConstruct{{.*}}host_data + // CHECK-NEXT: if clause + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'bool' + // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'bool' + // CHECK-NEXT: MemberExpr{{.*}} .operator bool + // CHECK-NEXT: DeclRefExpr{{.*}} 'const BoolConversion' lvalue Var{{.*}} 'BC' 'const BoolConversion' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: NullStmt + +} + +struct BoolConversion{ operator bool() const;}; +struct InstTy { + using IntTy = int; + static constexpr float SomeFloat = 5.0; + static constexpr BoolConversion BC; +}; + +void Instantiate() { + TemplFunc(); +} +#endif diff --git a/clang/test/SemaOpenACC/data-construct-if-clause.c b/clang/test/SemaOpenACC/data-construct-if-clause.c new file mode 100644 index 0000000000000..906b252d219b4 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-if-clause.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +void Foo() { + int Var; +#pragma acc data default(present) if(1) + ; + // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'data' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc data default(present) if(1) if (2) + ; + + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc enter data copyin(Var) if(1) + + // expected-warning@+3{{OpenACC clause 'copyin' not yet implemented}} + // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'enter data' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc enter data copyin(Var) if(1) if (2) + + // expected-warning@+1{{OpenACC clause 'copyout' not yet implemented}} +#pragma acc exit data copyout(Var) if(1) + // expected-warning@+3{{OpenACC clause 'copyout' not yet 
implemented}} + // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'exit data' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc exit data copyout(Var) if(1) if (2) + + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(Var) if(1) + ; + // expected-warning@+3{{OpenACC clause 'use_device' not yet implemented}} + // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'host_data' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc host_data use_device(Var) if(1) if (2) + ; +} diff --git a/clang/test/SemaOpenACC/data-construct-wait-ast.cpp b/clang/test/SemaOpenACC/data-construct-wait-ast.cpp new file mode 100644 index 0000000000000..40409099d8058 --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-wait-ast.cpp @@ -0,0 +1,230 @@ +// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. +// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s + +#ifndef PCH_HELPER +#define PCH_HELPER + +int some_int(); +long some_long(); + +void NormalUses() { + // CHECK: FunctionDecl{{.*}}NormalUses + // CHECK-NEXT: CompoundStmt + + int I; + // CHECK-NEXT: DeclStmt + // CHECK-NEXT: VarDecl + +#pragma acc data copyin(I) wait + ; + // CHECK-NEXT: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: NullStmt +#pragma acc enter data copyin(I) wait() + // CHECK: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> +#pragma acc exit data copyout(I) wait(some_int(), some_long()) + // CHECK: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()' + // 
CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()' +#pragma acc data copyin(I) wait(queues:some_int(), some_long()) + ; + // CHECK: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: wait clause has queues tag + // CHECK-NEXT: <<>> + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()' + // CHECK-NEXT: NullStmt +#pragma acc enter data copyin(I) wait(devnum: some_int() :some_int(), some_long()) + // CHECK: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: wait clause has devnum + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()' +#pragma acc exit data copyout(I) wait(devnum: some_int() : queues :some_int(), some_long()) wait(devnum: some_int() : queues :some_int(), some_long()) + // CHECK: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: wait clause has devnum has queues tag + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue 
Function{{.*}} 'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()' + // CHECK-NEXT: wait clause has devnum has queues tag + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' + // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()' +} + +template +void TemplUses(U u) { + // CHECK: FunctionTemplateDecl + // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 0 U + // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void (U)' + // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U' + // CHECK-NEXT: CompoundStmt + + U I; + // CHECK-NEXT: DeclStmt + // CHECK-NEXT: VarDecl + +#pragma acc data copyin(I) wait + ; + // CHECK: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: NullStmt + +#pragma acc enter data copyin(I) wait() + // CHECK: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + +#pragma acc exit data copyout(I) wait(U::value, u) + // CHECK: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U' + // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U' + +#pragma acc data copyin(I) wait(queues: U::value, u) + ; + // CHECK: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: wait clause has queues tag + // CHECK-NEXT: <<>> + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' 
lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U' + // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U' + // CHECK-NEXT: NullStmt + +#pragma acc enter data copyin(I) wait(devnum:u:queues: U::value, u) + // CHECK: OpenACCEnterDataConstruct{{.*}}data + // CHECK-NEXT: wait clause has devnum has queues tag + // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U' + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U' + // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U' + +#pragma acc exit data copyout(I) wait(devnum:u: U::value, u) + // CHECK: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: wait clause has devnum + // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U' + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U' + // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U' + + // Check the instantiated versions of the above. 
+ // CHECK: FunctionDecl{{.*}} used TemplUses 'void (HasInt)' implicit_instantiation + // CHECK-NEXT: TemplateArgument type 'HasInt' + // CHECK-NEXT: RecordType{{.*}} 'HasInt' + // CHECK-NEXT: CXXRecord{{.*}} 'HasInt' + // CHECK-NEXT: ParmVarDecl{{.*}} used u 'HasInt' + // CHECK-NEXT: CompoundStmt + + // CHECK-NEXT: DeclStmt + // CHECK-NEXT: VarDecl + + // CHECK: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: NullStmt + + // CHECK: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + + // CHECK: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char' + // CHECK-NEXT: MemberExpr{{.*}} '' .operator char + // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar + + // CHECK: OpenACCDataConstruct{{.*}}data + // CHECK-NEXT: wait clause has queues tag + // CHECK-NEXT: <<>> + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char' + // CHECK-NEXT: MemberExpr{{.*}} '' .operator char + // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar + // CHECK-NEXT: NullStmt + + // CHECK: OpenACCEnterDataConstruct{{.*}}enter data + // CHECK-NEXT: wait clause has devnum has queues tag + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char' + // CHECK-NEXT: MemberExpr{{.*}} '' .operator char + // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' + // CHECK-NEXT: DeclRefExpr{{.*}} 
'const int' lvalue Var{{.*}} 'value' 'const int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char' + // CHECK-NEXT: MemberExpr{{.*}} '' .operator char + // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar + + // CHECK: OpenACCExitDataConstruct{{.*}}exit data + // CHECK-NEXT: wait clause has devnum + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char' + // CHECK-NEXT: MemberExpr{{.*}} '' .operator char + // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt' + // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char' + // CHECK-NEXT: MemberExpr{{.*}} '' .operator char + // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar +} + +struct HasInt { + using IntTy = int; + using ShortTy = short; + static constexpr int value = 1; + + operator char(); +}; + +void Inst() { + TemplUses({}); +} +#endif diff --git a/clang/test/SemaOpenACC/data-construct-wait-clause.c b/clang/test/SemaOpenACC/data-construct-wait-clause.c new file mode 100644 index 0000000000000..50a29df8b03bf --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct-wait-clause.c @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +struct NotConvertible{} NC; +short getS(); +int getI(); + +void uses() { + int arr[5]; + + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc data copyin(arr[0]) wait + ; + + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc enter data copyin(arr[0]) wait() + + // expected-warning@+1{{OpenACC clause 'copyout' not yet implemented}} +#pragma acc exit data copyout(arr[0]) wait(getS(), getI()) + + // expected-warning@+2{{OpenACC clause 'use_device' not yet 
implemented}} + // expected-error@+1{{OpenACC 'wait' clause is not valid on 'host_data' directive}} +#pragma acc host_data use_device(arr[0]) wait(getS(), getI()) + ; + + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc data copyin(arr[0]) wait(devnum:getS(): getI()) + ; + + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc enter data copyin(arr[0]) wait(devnum:getS(): queues: getI()) wait(devnum:getI(): queues: getS(), getI(), 5) + + // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented}} + // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc exit data copyout(arr[0]) wait(devnum:NC : 5) + + // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented}} + // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc data copyin(arr[0]) wait(devnum:5 : NC) + ; + + // expected-warning@+4{{OpenACC clause 'copyin' not yet implemented}} + // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}} + // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}} + // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc enter data copyin(arr[0]) wait(devnum:arr : queues: arr, NC, 5) + + // expected-error@+1{{OpenACC 'wait' clause is not valid on 'loop' directive}} +#pragma acc loop wait + for(int i = 5; i < 10;++i); +} diff --git a/clang/test/SemaOpenACC/data-construct.cpp b/clang/test/SemaOpenACC/data-construct.cpp new file mode 100644 index 0000000000000..2df68cc9bba4b --- /dev/null +++ b/clang/test/SemaOpenACC/data-construct.cpp @@ -0,0 +1,219 @@ +// RUN: %clang_cc1 %s -fopenacc -verify -Wno-empty-body -Wno-unused-value + +void HasStmt() { + { + // expected-error@+2{{expected statement}} 
+#pragma acc data + } + { + // expected-error@+2{{expected statement}} +#pragma acc host_data + } + // Don't have statements, so this is fine. + { +#pragma acc enter data + } + { +#pragma acc exit data + } +} + +void AtLeastOneOf() { + int Var; +// Data + // expected-warning@+1{{OpenACC clause 'copy' not yet implemented}} +#pragma acc data copy(Var) + ; + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc data copyin(Var) + ; + // expected-warning@+1{{OpenACC clause 'copyout' not yet implemented}} +#pragma acc data copyout(Var) + ; + // expected-warning@+1{{OpenACC clause 'create' not yet implemented}} +#pragma acc data create(Var) + ; + // expected-warning@+1{{OpenACC clause 'no_create' not yet implemented}} +#pragma acc data no_create(Var) + ; + // expected-warning@+1{{OpenACC clause 'present' not yet implemented}} +#pragma acc data present(Var) + ; + // expected-warning@+1{{OpenACC clause 'deviceptr' not yet implemented}} +#pragma acc data deviceptr(Var) + ; + // expected-warning@+1{{OpenACC clause 'attach' not yet implemented}} +#pragma acc data attach(Var) + ; +#pragma acc data default(none) + ; + + // OpenACC TODO: The following 'data' directives should diagnose, since they + // don't have at least one of the above clauses. + +#pragma acc data if(Var) + ; + +#pragma acc data async + ; + +#pragma acc data wait + ; + +#pragma acc data device_type(*) + ; +#pragma acc data + ; + + // Enter Data + // expected-warning@+1{{OpenACC clause 'copyin' not yet implemented}} +#pragma acc enter data copyin(Var) + // expected-warning@+1{{OpenACC clause 'create' not yet implemented}} +#pragma acc enter data create(Var) + // expected-warning@+1{{OpenACC clause 'attach' not yet implemented}} +#pragma acc enter data attach(Var) + + // OpenACC TODO: The following 'enter data' directives should diagnose, since + // they don't have at least one of the above clauses. 
+ +#pragma acc enter data if(Var) +#pragma acc enter data async +#pragma acc enter data wait +#pragma acc enter data + + // Exit Data + // expected-warning@+1{{OpenACC clause 'copyout' not yet implemented}} +#pragma acc exit data copyout(Var) + // expected-warning@+1{{OpenACC clause 'delete' not yet implemented}} +#pragma acc exit data delete(Var) + // expected-warning@+1{{OpenACC clause 'detach' not yet implemented}} +#pragma acc exit data detach(Var) + + // OpenACC TODO: The following 'exit data' directives should diagnose, since + // they don't have at least one of the above clauses. + +#pragma acc exit data if(Var) +#pragma acc exit data async +#pragma acc exit data wait + // expected-warning@+1{{OpenACC clause 'finalize' not yet implemented}} +#pragma acc exit data finalize +#pragma acc exit data + + // Host Data + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(Var) + ; + // OpenACC TODO: The following 'host_data' directives should diagnose, since + // they don't have at least one of the above clauses. + +#pragma acc host_data if(Var) + ; + // expected-warning@+1{{OpenACC clause 'if_present' not yet implemented}} +#pragma acc host_data if_present + ; +#pragma acc host_data + ; +} + +void DataRules() { + int Var; + // OpenACC TODO: Only 'async' and 'wait' are permitted after a device_type, so + // the rest of these should diagnose. 
+ + // expected-error@+2{{OpenACC clause 'copy' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) copy(Var) + ; + // expected-error@+2{{OpenACC clause 'copyin' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) copyin(Var) + ; + // expected-error@+2{{OpenACC clause 'copyout' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) copyout(Var) + ; + // expected-error@+2{{OpenACC clause 'create' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) create(Var) + ; + // expected-error@+2{{OpenACC clause 'no_create' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) no_create(Var) + ; + // expected-error@+2{{OpenACC clause 'present' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) present(Var) + ; + // expected-error@+2{{OpenACC clause 'deviceptr' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) deviceptr(Var) + ; + // expected-error@+2{{OpenACC clause 'attach' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) attach(Var) + ; + // expected-error@+2{{OpenACC clause 'default' may not follow a 'device_type' clause in a 'data' construct}} + // expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) default(none) + ; + // expected-error@+2{{OpenACC clause 'if' may not follow a 'device_type' clause in a 'data' construct}} + // 
expected-note@+1{{previous clause is here}} +#pragma acc data device_type(*) if(Var) + ; +#pragma acc data device_type(*) async + ; +#pragma acc data device_type(*) wait + ; +} + +struct HasMembers { + int Member; + + void HostDataError() { + // TODO OpenACC: The following 3 should error, as use_device's var only allows + // a variable or array, not an array index, or sub expression. + + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(this) + ; + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(this->Member) + ; + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(Member) + ; + } +}; + +void HostDataRules() { + int Var, Var2; + // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'host_data' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc host_data if(Var) if (Var2) + ; + + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(Var) + ; + + int Array[5]; + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(Array) + ; + + // TODO OpenACC: The following 3 should error, as use_device's var only allows + // a variable or array, not an array index, or sub expression. 
+ + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(Array[1:1]) + ; + + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(Array[1]) + ; + HasMembers HM; + // expected-warning@+1{{OpenACC clause 'use_device' not yet implemented}} +#pragma acc host_data use_device(HM.Member) + ; + +} diff --git a/clang/test/SemaOpenACC/loop-construct-collapse-clause.cpp b/clang/test/SemaOpenACC/loop-construct-collapse-clause.cpp index dc954e36d765d..b401dd891629a 100644 --- a/clang/test/SemaOpenACC/loop-construct-collapse-clause.cpp +++ b/clang/test/SemaOpenACC/loop-construct-collapse-clause.cpp @@ -323,14 +323,15 @@ void no_other_directives() { #pragma acc loop collapse(2) for(unsigned i = 0; i < 5; ++i) { for(unsigned j = 0; j < 5; ++j) { -#pragma acc data // expected-warning{{OpenACC construct 'data' not yet implemented}} +#pragma acc data + ; } } // expected-note@+1{{active 'collapse' clause defined here}} #pragma acc loop collapse(2) for(unsigned i = 0; i < 5; ++i) { // expected-error@+1{{OpenACC 'data' construct cannot appear in intervening code of a 'loop' with a 'collapse' clause}} -#pragma acc data // expected-warning{{OpenACC construct 'data' not yet implemented}} +#pragma acc data for(unsigned j = 0; j < 5; ++j) { } } diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index def4524449355..d0fc69af7c847 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2185,6 +2185,10 @@ class EnqueueVisitor : public ConstStmtVisitor, void VisitOpenACCComputeConstruct(const OpenACCComputeConstruct *D); void VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *D); void VisitOpenACCCombinedConstruct(const OpenACCCombinedConstruct *D); + void VisitOpenACCDataConstruct(const OpenACCDataConstruct *D); + void VisitOpenACCEnterDataConstruct(const OpenACCEnterDataConstruct *D); + void 
VisitOpenACCExitDataConstruct(const OpenACCExitDataConstruct *D); + void VisitOpenACCHostDataConstruct(const OpenACCHostDataConstruct *D); void VisitOMPExecutableDirective(const OMPExecutableDirective *D); void VisitOMPLoopBasedDirective(const OMPLoopBasedDirective *D); void VisitOMPLoopDirective(const OMPLoopDirective *D); @@ -3587,6 +3591,29 @@ void EnqueueVisitor::VisitOpenACCCombinedConstruct( for (auto *Clause : C->clauses()) EnqueueChildren(Clause); } +void EnqueueVisitor::VisitOpenACCDataConstruct(const OpenACCDataConstruct *C) { + EnqueueChildren(C); + for (auto *Clause : C->clauses()) + EnqueueChildren(Clause); +} +void EnqueueVisitor::VisitOpenACCEnterDataConstruct( + const OpenACCEnterDataConstruct *C) { + EnqueueChildren(C); + for (auto *Clause : C->clauses()) + EnqueueChildren(Clause); +} +void EnqueueVisitor::VisitOpenACCExitDataConstruct( + const OpenACCExitDataConstruct *C) { + EnqueueChildren(C); + for (auto *Clause : C->clauses()) + EnqueueChildren(Clause); +} +void EnqueueVisitor::VisitOpenACCHostDataConstruct( + const OpenACCHostDataConstruct *C) { + EnqueueChildren(C); + for (auto *Clause : C->clauses()) + EnqueueChildren(Clause); +} void EnqueueVisitor::VisitAnnotateAttr(const AnnotateAttr *A) { EnqueueChildren(A); @@ -5270,7 +5297,7 @@ CXString clang_getCursorSpelling(CXCursor C) { if (const OverloadExpr *E = Storage.dyn_cast()) return cxstring::createDup(E->getName().getAsString()); OverloadedTemplateStorage *Ovl = - Storage.get(); + cast(Storage); if (Ovl->size() == 0) return cxstring::createEmpty(); return cxstring::createDup((*Ovl->begin())->getNameAsString()); @@ -6342,6 +6369,14 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) { return cxstring::createRef("OpenACCLoopConstruct"); case CXCursor_OpenACCCombinedConstruct: return cxstring::createRef("OpenACCCombinedConstruct"); + case CXCursor_OpenACCDataConstruct: + return cxstring::createRef("OpenACCDataConstruct"); + case CXCursor_OpenACCEnterDataConstruct: + return 
cxstring::createRef("OpenACCEnterDataConstruct"); + case CXCursor_OpenACCExitDataConstruct: + return cxstring::createRef("OpenACCExitDataConstruct"); + case CXCursor_OpenACCHostDataConstruct: + return cxstring::createRef("OpenACCHostDataConstruct"); } llvm_unreachable("Unhandled CXCursorKind"); @@ -7309,7 +7344,7 @@ unsigned clang_getNumOverloadedDecls(CXCursor C) { Storage.dyn_cast()) return S->size(); - const Decl *D = Storage.get(); + const Decl *D = cast(Storage); if (const UsingDecl *Using = dyn_cast(D)) return Using->shadow_size(); @@ -7332,7 +7367,7 @@ CXCursor clang_getOverloadedDecl(CXCursor cursor, unsigned index) { Storage.dyn_cast()) return MakeCXCursor(S->begin()[index], TU); - const Decl *D = Storage.get(); + const Decl *D = cast(Storage); if (const UsingDecl *Using = dyn_cast(D)) { // FIXME: This is, unfortunately, linear time. UsingDecl::shadow_iterator Pos = Using->shadow_begin(); diff --git a/clang/tools/libclang/CIndexCXX.cpp b/clang/tools/libclang/CIndexCXX.cpp index ea6f97d39644e..a1be70dde9f67 100644 --- a/clang/tools/libclang/CIndexCXX.cpp +++ b/clang/tools/libclang/CIndexCXX.cpp @@ -101,11 +101,11 @@ CXCursor clang_getSpecializedCursorTemplate(CXCursor C) { llvm::PointerUnion Result = ClassSpec->getSpecializedTemplateOrPartial(); - if (Result.is()) - Template = Result.get(); + if (isa(Result)) + Template = cast(Result); else - Template = Result.get(); - + Template = cast(Result); + } else Template = CXXRecord->getInstantiatedFromMemberClass(); } else if (const FunctionDecl *Function = dyn_cast(D)) { diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index c8cf51d806132..26935c45ce5f8 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -888,6 +888,18 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::OpenACCCombinedConstructClass: K = CXCursor_OpenACCCombinedConstruct; break; + case Stmt::OpenACCDataConstructClass: + K = 
CXCursor_OpenACCDataConstruct; + break; + case Stmt::OpenACCEnterDataConstructClass: + K = CXCursor_OpenACCEnterDataConstruct; + break; + case Stmt::OpenACCExitDataConstructClass: + K = CXCursor_OpenACCExitDataConstruct; + break; + case Stmt::OpenACCHostDataConstructClass: + K = CXCursor_OpenACCHostDataConstruct; + break; case Stmt::OMPTargetParallelGenericLoopDirectiveClass: K = CXCursor_OMPTargetParallelGenericLoopDirective; break; diff --git a/clang/unittests/AST/ASTContextParentMapTest.cpp b/clang/unittests/AST/ASTContextParentMapTest.cpp index 515dfb99e1126..9af0a46817a25 100644 --- a/clang/unittests/AST/ASTContextParentMapTest.cpp +++ b/clang/unittests/AST/ASTContextParentMapTest.cpp @@ -148,5 +148,54 @@ TEST(GetParents, FriendTypeLoc) { ElementsAre(DynTypedNode::create(FrA))); } +TEST(GetParents, UserDefinedTupleLikeTypes) { + MatchVerifier Verifier; + EXPECT_TRUE(Verifier.match( + R"( +namespace std { + +using size_t = __typeof(sizeof(int)); + +template +struct tuple_size; + +template +struct tuple_size : tuple_size{}; + +template +requires requires { tuple_size::value; } +struct tuple_size : tuple_size{}; + + +template +struct tuple_element; + + +} // namespace std + +struct Decomposable {}; + +template<> struct std::tuple_size { + static constexpr size_t value = 2; +}; + +template struct std::tuple_element { + using type = int; +}; + +template struct std::tuple_element { + using type = const int; +}; + +template +const int& get(const Decomposable& d); + +void F(const Decomposable& d) { + const auto& [x, y] = d; +} +)", + varDecl(hasName("x"), hasAncestor(decompositionDecl())), Lang_CXX20)); +} + } // end namespace ast_matchers } // end namespace clang diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 39e7001393e5e..0f731f4532535 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -25,6 +25,7 
@@ #include "gtest/gtest.h" #include #include +#include #include namespace clang { @@ -143,6 +144,15 @@ const Formula &getFormula(const ValueDecl &D, const Environment &Env) { return cast(Env.getValue(D))->formula(); } +const BindingDecl *findBindingDecl(ASTContext &ASTCtx, std::string_view Name) { + using ast_matchers::bindingDecl; + using ast_matchers::hasName; + auto TargetNodes = + ast_matchers::match(bindingDecl(hasName(Name)).bind("v"), ASTCtx); + assert(TargetNodes.size() == 1 && "Name must be unique"); + return ast_matchers::selectFirst("v", TargetNodes); +} + TEST(TransferTest, CNotSupported) { TestInputs Inputs("void target() {}"); Inputs.Language = TestLanguage::Lang_C89; @@ -5515,10 +5525,10 @@ TEST(TransferTest, StructuredBindingAssignFromTupleLikeType) { ASSERT_THAT(Results.keys(), UnorderedElementsAre("p1", "p2")); const Environment &Env1 = getEnvironmentAtAnnotation(Results, "p1"); - const ValueDecl *BoundFooDecl = findValueDecl(ASTCtx, "BoundFoo"); + const ValueDecl *BoundFooDecl = findBindingDecl(ASTCtx, "BoundFoo"); ASSERT_THAT(BoundFooDecl, NotNull()); - const ValueDecl *BoundBarDecl = findValueDecl(ASTCtx, "BoundBar"); + const ValueDecl *BoundBarDecl = findBindingDecl(ASTCtx, "BoundBar"); ASSERT_THAT(BoundBarDecl, NotNull()); const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz"); @@ -5596,10 +5606,10 @@ TEST(TransferTest, StructuredBindingAssignRefFromTupleLikeType) { ASSERT_THAT(Results.keys(), UnorderedElementsAre("p1", "p2")); const Environment &Env1 = getEnvironmentAtAnnotation(Results, "p1"); - const ValueDecl *BoundFooDecl = findValueDecl(ASTCtx, "BoundFoo"); + const ValueDecl *BoundFooDecl = findBindingDecl(ASTCtx, "BoundFoo"); ASSERT_THAT(BoundFooDecl, NotNull()); - const ValueDecl *BoundBarDecl = findValueDecl(ASTCtx, "BoundBar"); + const ValueDecl *BoundBarDecl = findBindingDecl(ASTCtx, "BoundBar"); ASSERT_THAT(BoundBarDecl, NotNull()); const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz"); diff --git 
a/clang/unittests/Serialization/LoadSpecLazilyTest.cpp b/clang/unittests/Serialization/LoadSpecLazilyTest.cpp index 0e452652a940d..7cc074c51fcd0 100644 --- a/clang/unittests/Serialization/LoadSpecLazilyTest.cpp +++ b/clang/unittests/Serialization/LoadSpecLazilyTest.cpp @@ -82,6 +82,8 @@ class LoadSpecLazilyTest : public ::testing::Test { Instance.setDiagnostics(Diags.get()); Instance.setInvocation(Invocation); Instance.getFrontendOpts().OutputFile = CacheBMIPath; + // Avoid memory leaks. + Instance.getFrontendOpts().DisableFree = false; GenerateModuleInterfaceAction Action; EXPECT_TRUE(Instance.ExecuteAction(Action)); EXPECT_FALSE(Diags->hasErrorOccurred()); diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index a9faba0d84403..72b3468dac486 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -362,7 +362,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, if (auto *V = DiagsInPedantic.dyn_cast()) V->push_back(R); else - DiagsInPedantic.get()->insert(R); + cast(DiagsInPedantic)->insert(R); } if (!GroupsInPedantic) @@ -389,7 +389,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, if (auto *V = GroupsInPedantic.dyn_cast()) V->push_back(Group); else - GroupsInPedantic.get()->insert(Group); + cast(GroupsInPedantic)->insert(Group); } } diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index b32b42423f6a9..3a868c11e7288 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -572,8 +572,10 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) +set(COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR NOT(FUCHSIA OR APPLE)) + if (COMPILER_RT_HAS_AARCH64_SME) - if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG) + if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND 
COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR) list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-assert.c aarch64/sme-libc-routines.c) message(STATUS "AArch64 SME ABI routines enabled") set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin") diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index 4afc74933a33b..a5897274521e9 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -634,6 +634,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xD284: // 84 D2 : test dl,dl return 2; + case 0xE483: // 83 E4 XX : and esp, XX case 0xEC83: // 83 EC XX : sub esp, XX case 0xC1F6: // F6 C1 XX : test cl, XX return 3; @@ -643,8 +644,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { return 0; } - switch (0x00FFFFFF & *(u32*)address) { - case 0xF8E483: // 83 E4 F8 : and esp, 0xFFFFFFF8 + switch (0x00FFFFFF & *(u32 *)address) { case 0x24A48D: // 8D A4 24 XX XX XX XX : lea esp, [esp + XX XX XX XX] return 7; } @@ -773,7 +773,6 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xdb8548: // 48 85 db : test rbx, rbx case 0xdb854d: // 4d 85 db : test r11, r11 case 0xdc8b4c: // 4c 8b dc : mov r11, rsp - case 0xe0e483: // 83 e4 e0 : and esp, 0xFFFFFFE0 case 0xe48548: // 48 85 e4 : test rsp, rsp case 0xe4854d: // 4d 85 e4 : test r12, r12 case 0xe58948: // 48 89 e5 : mov rbp, rsp diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp index 6e01209ac3a7e..04d9a6766f65a 100644 --- a/compiler-rt/lib/interception/tests/interception_win_test.cpp +++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp @@ -852,6 +852,7 @@ const struct InstructionSizeData { { 2, {0x8B, 0xC1}, 0, "8B 
C1 : mov eax, ecx"}, { 2, {0x8B, 0xEC}, 0, "8B EC : mov ebp, esp"}, { 2, {0x8B, 0xFF}, 0, "8B FF : mov edi, edi"}, + { 3, {0x83, 0xE4, 0x72}, 0, "83 E4 XX : and esp, XX"}, { 3, {0x83, 0xEC, 0x72}, 0, "83 EC XX : sub esp, XX"}, { 3, {0xc2, 0x71, 0x72}, 0, "C2 XX XX : ret XX (needed for registering weak functions)"}, { 5, {0x68, 0x71, 0x72, 0x73, 0x74}, 0, "68 XX XX XX XX : push imm32"}, diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index f076c7eae9a22..f000deb3039a8 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -46,6 +46,7 @@ void OSSpinLockLock(volatile OSSpinLock *__lock); #include #include #include +#include #include #include #include diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 41a6255207976..3a689c335c81c 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -495,8 +495,7 @@ struct NodeVisitor { READ_FEATURE(OmpIfClause::Modifier) READ_FEATURE(OmpDirectiveNameModifier) READ_FEATURE(OmpLinearClause) - READ_FEATURE(OmpLinearClause::WithModifier) - READ_FEATURE(OmpLinearClause::WithoutModifier) + READ_FEATURE(OmpLinearClause::Modifier) READ_FEATURE(OmpLinearModifier) READ_FEATURE(OmpLinearModifier::Value) READ_FEATURE(OmpLoopDirective) diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index f11162dc0d95e..48764580d526d 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -1205,6 +1205,7 @@ def hlfir_ShapeOfOp : hlfir_Op<"shape_of", [Pure]> { }]; let builders = [OpBuilder<(ins "mlir::Value":$expr)>]; + let hasFolder = 1; } def hlfir_GetExtentOp : hlfir_Op<"get_extent", [Pure]> { diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h 
index 13825eb7ba41e..7c0f04091362b 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -559,10 +559,11 @@ class ParseTreeDumper { NODE(parser, OmpLastprivateModifier) NODE_ENUM(OmpLastprivateModifier, Value) NODE(parser, OmpLinearClause) - NODE(OmpLinearClause, WithModifier) - NODE(OmpLinearClause, WithoutModifier) + NODE(OmpLinearClause, Modifier) NODE(parser, OmpLinearModifier) NODE_ENUM(OmpLinearModifier, Value) + NODE(parser, OmpStepComplexModifier) + NODE(parser, OmpStepSimpleModifier) NODE(parser, OmpLoopDirective) NODE(parser, OmpMapClause) NODE(OmpMapClause, Modifier) @@ -593,7 +594,10 @@ class ParseTreeDumper { NODE(parser, OmpReductionClause) NODE(OmpReductionClause, Modifier) NODE(parser, OmpInReductionClause) + NODE(OmpInReductionClause, Modifier) NODE(parser, OmpReductionCombiner) + NODE(parser, OmpTaskReductionClause) + NODE(OmpTaskReductionClause, Modifier) NODE(OmpReductionCombiner, FunctionCombiner) NODE(parser, OmpReductionInitializerClause) NODE(parser, OmpReductionIdentifier) diff --git a/flang/include/flang/Parser/parse-tree-visitor.h b/flang/include/flang/Parser/parse-tree-visitor.h index e1ea4d459f4a6..af1d34ae804f3 100644 --- a/flang/include/flang/Parser/parse-tree-visitor.h +++ b/flang/include/flang/Parser/parse-tree-visitor.h @@ -897,40 +897,6 @@ struct ParseTreeVisitorLookupScope { mutator.Post(x); } } - template - static void Walk(const OmpLinearClause::WithModifier &x, V &visitor) { - if (visitor.Pre(x)) { - Walk(x.modifier, visitor); - Walk(x.names, visitor); - Walk(x.step, visitor); - visitor.Post(x); - } - } - template - static void Walk(OmpLinearClause::WithModifier &x, M &mutator) { - if (mutator.Pre(x)) { - Walk(x.modifier, mutator); - Walk(x.names, mutator); - Walk(x.step, mutator); - mutator.Post(x); - } - } - template - static void Walk(const OmpLinearClause::WithoutModifier &x, V &visitor) { - if (visitor.Pre(x)) { - Walk(x.names, visitor); - Walk(x.step, visitor); - 
visitor.Post(x); - } - } - template - static void Walk(OmpLinearClause::WithoutModifier &x, M &mutator) { - if (mutator.Pre(x)) { - Walk(x.names, mutator); - Walk(x.step, mutator); - mutator.Post(x); - } - } }; } // namespace detail diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 2b4cb21017fa0..8086c31031011 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3699,6 +3699,22 @@ struct OmpReductionModifier { WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value); }; +// Ref: [5.2:117-120] +// +// step-complex-modifier -> +// STEP(integer-expression) // since 5.2 +struct OmpStepComplexModifier { + WRAPPER_CLASS_BOILERPLATE(OmpStepComplexModifier, ScalarIntExpr); +}; + +// Ref: [4.5:207-210], [5.0:290-293], [5.1:323-325], [5.2:117-120] +// +// step-simple-modifier -> +// integer-expresion // since 4.5 +struct OmpStepSimpleModifier { + WRAPPER_CLASS_BOILERPLATE(OmpStepSimpleModifier, ScalarIntExpr); +}; + // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321] // // task-dependence-type -> // "dependence-type" in 5.1 and before @@ -3934,7 +3950,7 @@ struct OmpFailClause { struct OmpFromClause { TUPLE_CLASS_BOILERPLATE(OmpFromClause); MODIFIER_BOILERPLATE(OmpExpectation, OmpIterator, OmpMapper); - std::tuple t; + std::tuple t; }; // Ref: [4.5:87-91], [5.0:140-146], [5.1:166-171], [5.2:269] @@ -3960,11 +3976,14 @@ struct OmpIfClause { std::tuple t; }; -// OMP 5.0 2.19.5.6 in_reduction-clause -> IN_REDUCTION (reduction-identifier: -// variable-name-list) +// Ref: [5.0:170-176], [5.1:197-205], [5.2:138-139] +// +// in-reduction-clause -> +// IN_REDUCTION(reduction-identifier: list) // since 5.0 struct OmpInReductionClause { TUPLE_CLASS_BOILERPLATE(OmpInReductionClause); - std::tuple t; + MODIFIER_BOILERPLATE(OmpReductionIdentifier); + std::tuple t; }; // Ref: [4.5:199-201], [5.0:288-290], [5.1:321-322], [5.2:115-117] @@ -3978,28 +3997,20 @@ struct 
OmpLastprivateClause { std::tuple t; }; -// 2.15.3.7 linear-clause -> LINEAR (linear-list[ : linear-step]) -// linear-list -> list | linear-modifier(list) +// Ref: [4.5:207-210], [5.0:290-293], [5.1:323-325], [5.2:117-120] +// +// linear-clause -> +// LINEAR(list [: step-simple-modifier]) | // since 4.5 +// LINEAR(linear-modifier(list) +// [: step-simple-modifier]) | // since 4.5, until 5.2[*] +// LINEAR(list [: linear-modifier, +// step-complex-modifier]) // since 5.2 +// [*] Still allowed in 5.2 when on DECLARE SIMD, but deprecated. struct OmpLinearClause { - UNION_CLASS_BOILERPLATE(OmpLinearClause); - struct WithModifier { - BOILERPLATE(WithModifier); - WithModifier(OmpLinearModifier &&m, std::list &&n, - std::optional &&s) - : modifier(std::move(m)), names(std::move(n)), step(std::move(s)) {} - OmpLinearModifier modifier; - std::list names; - std::optional step; - }; - struct WithoutModifier { - BOILERPLATE(WithoutModifier); - WithoutModifier( - std::list &&n, std::optional &&s) - : names(std::move(n)), step(std::move(s)) {} - std::list names; - std::optional step; - }; - std::variant u; + TUPLE_CLASS_BOILERPLATE(OmpLinearClause); + MODIFIER_BOILERPLATE( + OmpLinearModifier, OmpStepSimpleModifier, OmpStepComplexModifier); + std::tuple t; }; // Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158] @@ -4014,7 +4025,7 @@ struct OmpLinearClause { struct OmpMapClause { TUPLE_CLASS_BOILERPLATE(OmpMapClause); MODIFIER_BOILERPLATE(OmpMapTypeModifier, OmpMapper, OmpIterator, OmpMapType); - std::tuple t; + std::tuple t; }; // Ref: [4.5:87-91], [5.0:140-146], [5.1:166-171], [5.2:270] @@ -4079,6 +4090,16 @@ struct OmpScheduleClause { std::tuple> t; }; +// Ref: [5.0:232-234], [5.1:264-266], [5.2:137] +// +// task-reduction-clause -> +// TASK_REDUCTION(reduction-identifier: list) // since 5.0 +struct OmpTaskReductionClause { + TUPLE_CLASS_BOILERPLATE(OmpTaskReductionClause); + MODIFIER_BOILERPLATE(OmpReductionIdentifier); + std::tuple t; +}; + // Ref: 
[4.5:107-109], [5.0:176-180], [5.1:205-210], [5.2:167-168] // // to-clause (in DECLARE TARGET) -> @@ -4092,7 +4113,7 @@ struct OmpScheduleClause { struct OmpToClause { TUPLE_CLASS_BOILERPLATE(OmpToClause); MODIFIER_BOILERPLATE(OmpExpectation, OmpIterator, OmpMapper); - std::tuple t; + std::tuple t; }; // Ref: [5.0:254-255], [5.1:287-288], [5.2:321-322] diff --git a/flang/include/flang/Semantics/openmp-modifiers.h b/flang/include/flang/Semantics/openmp-modifiers.h index 4025ce112d9ca..5d5c5e97faf41 100644 --- a/flang/include/flang/Semantics/openmp-modifiers.h +++ b/flang/include/flang/Semantics/openmp-modifiers.h @@ -87,6 +87,8 @@ DECLARE_DESCRIPTOR(parser::OmpOrderingModifier); DECLARE_DESCRIPTOR(parser::OmpPrescriptiveness); DECLARE_DESCRIPTOR(parser::OmpReductionIdentifier); DECLARE_DESCRIPTOR(parser::OmpReductionModifier); +DECLARE_DESCRIPTOR(parser::OmpStepComplexModifier); +DECLARE_DESCRIPTOR(parser::OmpStepSimpleModifier); DECLARE_DESCRIPTOR(parser::OmpTaskDependenceType); DECLARE_DESCRIPTOR(parser::OmpVariableCategory); diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 10c31963ec493..b424e209d56da 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -859,10 +859,14 @@ Init make(const parser::OmpClause::Init &inp, InReduction make(const parser::OmpClause::InReduction &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpInReductionClause - auto &t0 = std::get(inp.v.t); + auto &mods = semantics::OmpGetModifiers(inp.v); + auto *m0 = + semantics::OmpGetUniqueModifier(mods); auto &t1 = std::get(inp.v.t); + assert(m0 && "OmpReductionIdentifier is required"); + return InReduction{ - {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)}, + {/*ReductionIdentifiers=*/{makeReductionOperator(*m0, semaCtx)}, /*List=*/makeObjects(t1, semaCtx)}}; } @@ -895,8 +899,6 @@ Lastprivate make(const parser::OmpClause::Lastprivate &inp, Linear make(const parser::OmpClause::Linear 
&inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpLinearClause - using wrapped = parser::OmpLinearClause; - CLAUSET_ENUM_CONVERT( // convert, parser::OmpLinearModifier::Value, Linear::LinearModifier, // clang-format off @@ -906,26 +908,23 @@ Linear make(const parser::OmpClause::Linear &inp, // clang-format on ); - using Tuple = decltype(Linear::t); + auto &mods = semantics::OmpGetModifiers(inp.v); + auto *m0 = + semantics::OmpGetUniqueModifier(mods); + auto *m1 = + semantics::OmpGetUniqueModifier(mods); + assert((!m0 || !m1) && "Simple and complex modifiers both present"); - return Linear{Fortran::common::visit( - common::visitors{ - [&](const wrapped::WithModifier &s) -> Tuple { - return { - /*StepSimpleModifier=*/std::nullopt, - /*StepComplexModifier=*/maybeApply(makeExprFn(semaCtx), s.step), - /*LinearModifier=*/convert(s.modifier.v), - /*List=*/makeList(s.names, makeObjectFn(semaCtx))}; - }, - [&](const wrapped::WithoutModifier &s) -> Tuple { - return { - /*StepSimpleModifier=*/maybeApply(makeExprFn(semaCtx), s.step), - /*StepComplexModifier=*/std::nullopt, - /*LinearModifier=*/std::nullopt, - /*List=*/makeList(s.names, makeObjectFn(semaCtx))}; - }, - }, - inp.v.u)}; + auto *m2 = semantics::OmpGetUniqueModifier(mods); + auto &t1 = std::get(inp.v.t); + + auto &&maybeStep = m0 ? maybeApplyToV(makeExprFn(semaCtx), m0) + : m1 ? 
maybeApplyToV(makeExprFn(semaCtx), m1) + : std::optional{}; + + return Linear{{/*StepComplexModifier=*/std::move(maybeStep), + /*LinearModifier=*/maybeApplyToV(convert, m2), + /*List=*/makeObjects(t1, semaCtx)}}; } Link make(const parser::OmpClause::Link &inp, @@ -1155,17 +1154,17 @@ Reduction make(const parser::OmpClause::Reduction &inp, ); auto &mods = semantics::OmpGetModifiers(inp.v); - auto *t0 = + auto *m0 = semantics::OmpGetUniqueModifier(mods); - auto *t1 = + auto *m1 = semantics::OmpGetUniqueModifier(mods); - auto &t2 = std::get(inp.v.t); - assert(t1 && "OmpReductionIdentifier is required"); + auto &t1 = std::get(inp.v.t); + assert(m1 && "OmpReductionIdentifier is required"); return Reduction{ - {/*ReductionModifier=*/maybeApplyToV(convert, t0), - /*ReductionIdentifiers=*/{makeReductionOperator(*t1, semaCtx)}, - /*List=*/makeObjects(t2, semaCtx)}}; + {/*ReductionModifier=*/maybeApplyToV(convert, m0), + /*ReductionIdentifiers=*/{makeReductionOperator(*m1, semaCtx)}, + /*List=*/makeObjects(t1, semaCtx)}}; } // Relaxed: empty @@ -1259,13 +1258,13 @@ TaskReduction make(const parser::OmpClause::TaskReduction &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpReductionClause auto &mods = semantics::OmpGetModifiers(inp.v); - auto *t0 = + auto *m0 = semantics::OmpGetUniqueModifier(mods); auto &t1 = std::get(inp.v.t); - assert(t0 && "OmpReductionIdentifier is required"); + assert(m0 && "OmpReductionIdentifier is required"); return TaskReduction{ - {/*ReductionIdentifiers=*/{makeReductionOperator(*t0, semaCtx)}, + {/*ReductionIdentifiers=*/{makeReductionOperator(*m0, semaCtx)}, /*List=*/makeObjects(t1, semaCtx)}}; } diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index ad53527f43441..82aac7cafa1d0 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -1704,6 +1704,15 @@ hlfir::ShapeOfOp::canonicalize(ShapeOfOp shapeOf, return 
llvm::LogicalResult::success(); } +mlir::OpFoldResult hlfir::ShapeOfOp::fold(FoldAdaptor adaptor) { + if (matchPattern(getExpr(), mlir::m_Op())) { + auto elementalOp = + mlir::cast(getExpr().getDefiningOp()); + return elementalOp.getShape(); + } + return {}; +} + //===----------------------------------------------------------------------===// // GetExtent //===----------------------------------------------------------------------===// diff --git a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp index d2c814cc958dd..c990bebcabde4 100644 --- a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp +++ b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp @@ -49,13 +49,6 @@ class MapsForPrivatizedSymbolsPass : public flangomp::impl::MapsForPrivatizedSymbolsPassBase< MapsForPrivatizedSymbolsPass> { - bool privatizerNeedsMap(omp::PrivateClauseOp &privatizer) { - Region &allocRegion = privatizer.getAllocRegion(); - Value blockArg0 = allocRegion.getArgument(0); - if (blockArg0.use_empty()) - return false; - return true; - } omp::MapInfoOp createMapInfo(Location loc, Value var, fir::FirOpBuilder &builder) { uint64_t mapTypeTo = static_cast< @@ -134,7 +127,7 @@ class MapsForPrivatizedSymbolsPass omp::PrivateClauseOp privatizer = SymbolTable::lookupNearestSymbolFrom( targetOp, privatizerName); - if (!privatizerNeedsMap(privatizer)) { + if (!privatizer.needsMap()) { privVarMapIdx.push_back(-1); continue; } diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index f8fda92d5ac2b..7d10de8c60977 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -99,6 +99,25 @@ constexpr ModifierList modifierList(Separator sep) { return ModifierList(sep); } +// Parse the input as any modifier from ClauseTy, but only succeed if +// the result was the SpecificTy. It requires that SpecificTy is one +// of the alternatives in ClauseTy::Modifier. 
+// The reason to have this is that ClauseTy::Modifier has "source", +// while specific modifiers don't. This class allows to parse a specific +// modifier together with obtaining its location. +template +struct SpecificModifierParser { + using resultType = typename ClauseTy::Modifier; + std::optional Parse(ParseState &state) const { + if (auto result{attempt(Parser{}).Parse(state)}) { + if (std::holds_alternative(result->u)) { + return result; + } + } + return std::nullopt; + } +}; + // OpenMP Clauses // [5.0] 2.1.6 iterator-specifier -> type-declaration-stmt = subscript-triple | @@ -232,6 +251,11 @@ TYPE_PARSER(construct( "TASK" >> pure(OmpReductionModifier::Value::Task) || "DEFAULT" >> pure(OmpReductionModifier::Value::Default))) +TYPE_PARSER(construct( // + "STEP" >> parenthesized(scalarIntExpr))) + +TYPE_PARSER(construct(scalarIntExpr)) + TYPE_PARSER(construct( "DEPOBJ" >> pure(OmpTaskDependenceType::Value::Depobj) || "IN"_id >> pure(OmpTaskDependenceType::Value::In) || @@ -282,9 +306,17 @@ TYPE_PARSER(sourced( TYPE_PARSER(sourced(construct(OmpDirectiveNameParser{}))) +TYPE_PARSER(sourced(construct( + Parser{}))) + TYPE_PARSER(sourced(construct( Parser{}))) +TYPE_PARSER(sourced( + construct(Parser{}) || + construct(Parser{}) || + construct(Parser{}))) + TYPE_PARSER(sourced(construct( sourced(construct(Parser{}) || construct(Parser{}) || @@ -306,6 +338,9 @@ TYPE_PARSER(sourced(construct(sourced( construct(Parser{}) || construct(Parser{}))))) +TYPE_PARSER(sourced(construct( + Parser{}))) + TYPE_PARSER(sourced(construct( sourced(construct(Parser{}) || construct(Parser{}) || @@ -407,7 +442,12 @@ TYPE_PARSER(construct( // OMP 5.0 2.19.5.6 IN_REDUCTION (reduction-identifier: variable-name-list) TYPE_PARSER(construct( - Parser{} / ":", Parser{})) + maybe(nonemptyList(Parser{}) / ":"), + Parser{})) + +TYPE_PARSER(construct( + maybe(nonemptyList(Parser{}) / ":"), + Parser{})) // OMP 5.0 2.11.4 allocate-clause -> ALLOCATE ([allocator:] variable-name-list) // OMP 5.2 
2.13.4 allocate-clause -> ALLOCATE ([allocate-modifier @@ -460,13 +500,33 @@ TYPE_PARSER(construct( applyFunction(makeMobClause, modifierList(maybe(","_tok)), Parser{}))) -TYPE_CONTEXT_PARSER("Omp LINEAR clause"_en_US, - construct( - construct(construct( - Parser{}, parenthesized(nonemptyList(name)), - maybe(":" >> scalarIntConstantExpr))) || - construct(construct( - nonemptyList(name), maybe(":" >> scalarIntConstantExpr))))) +OmpLinearClause makeLinearFromOldSyntax(OmpLinearClause::Modifier &&lm, + OmpObjectList &&objs, std::optional &&ssm) { + std::list mods; + mods.emplace_back(std::move(lm)); + if (ssm) { + mods.emplace_back(std::move(*ssm)); + } + return OmpLinearClause{std::move(objs), + mods.empty() ? decltype(mods){} : std::move(mods), + /*PostModified=*/false}; +} + +TYPE_PARSER( + // Parse the "modifier(x)" first, because syntacticaly it will match + // an array element (i.e. a list item). + // LINEAR(linear-modifier(list) [: step-simple-modifier]) + construct( // + applyFunction(makeLinearFromOldSyntax, + SpecificModifierParser{}, + parenthesized(Parser{}), + maybe(":"_tok >> SpecificModifierParser{}))) || + // LINEAR(list [: modifiers]) + construct( // + Parser{}, + maybe(":"_tok >> nonemptyList(Parser{})), + /*PostModified=*/pure(true))) // OpenMPv5.2 12.5.2 detach-clause -> DETACH (event-handle) TYPE_PARSER(construct(Parser{})) @@ -609,15 +669,15 @@ TYPE_PARSER( parenthesized(Parser{}))) || "PROC_BIND" >> construct(construct( parenthesized(Parser{}))) || - "REDUCTION" >> construct(construct( - parenthesized(Parser{}))) || + "REDUCTION"_id >> construct(construct( + parenthesized(Parser{}))) || "IN_REDUCTION" >> construct(construct( parenthesized(Parser{}))) || "DETACH" >> construct(construct( parenthesized(Parser{}))) || "TASK_REDUCTION" >> construct(construct( - parenthesized(Parser{}))) || + parenthesized(Parser{}))) || "RELAXED" >> construct(construct()) || "RELEASE" >> construct(construct()) || "REVERSE_OFFLOAD" >> diff --git 
a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index a10be3f1c797d..4b8e2624e36ca 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2133,23 +2133,78 @@ class UnparseVisitor { Walk(std::get>>(x.t), ": "); Walk(std::get(x.t)); } - void Unparse(const OmpLinearClause::WithoutModifier &x) { - Walk(x.names, ", "); - Walk(":", x.step); + void Unparse(const OmpStepSimpleModifier &x) { Walk(x.v); } + void Unparse(const OmpStepComplexModifier &x) { + Word("STEP("); + Walk(x.v); + Put(")"); } - void Unparse(const OmpLinearClause::WithModifier &x) { - Walk(x.modifier), Put("("), Walk(x.names, ","), Put(")"); - Walk(":", x.step); + void Unparse(const OmpLinearClause &x) { + using Modifier = OmpLinearClause::Modifier; + auto &modifiers{std::get>>(x.t)}; + if (std::get(x.t)) { // PostModified + Walk(std::get(x.t)); + Walk(": ", modifiers); + } else { + // Unparse using pre-5.2 syntax. + bool HasStepModifier{false}, HasLinearModifier{false}; + + if (modifiers) { + bool NeedComma{false}; + for (const Modifier &m : *modifiers) { + // Print all linear modifiers in case we need to unparse an + // incorrect tree. + if (auto *lmod{std::get_if(&m.u)}) { + if (NeedComma) { + Put(","); + } + Walk(*lmod); + HasLinearModifier = true; + NeedComma = true; + } else { + // If not linear-modifier, then it has to be step modifier. 
+ HasStepModifier = true; + } + } + } + + if (HasLinearModifier) { + Put("("); + } + Walk(std::get(x.t)); + if (HasLinearModifier) { + Put(")"); + } + + if (HasStepModifier) { + Put(": "); + bool NeedComma{false}; + for (const Modifier &m : *modifiers) { + if (!std::holds_alternative(m.u)) { + if (NeedComma) { + Put(","); + } + common::visit([&](auto &&s) { Walk(s); }, m.u); + NeedComma = true; + } + } + } + } } void Unparse(const OmpReductionClause &x) { using Modifier = OmpReductionClause::Modifier; - Walk(std::get>>(x.t), ":"); + Walk(std::get>>(x.t), ": "); Walk(std::get(x.t)); } void Unparse(const OmpDetachClause &x) { Walk(x.v); } void Unparse(const OmpInReductionClause &x) { - Walk(std::get(x.t)); - Put(":"); + using Modifier = OmpInReductionClause::Modifier; + Walk(std::get>>(x.t), ": "); + Walk(std::get(x.t)); + } + void Unparse(const OmpTaskReductionClause &x) { + using Modifier = OmpTaskReductionClause::Modifier; + Walk(std::get>>(x.t), ": "); Walk(std::get(x.t)); } void Unparse(const OmpAllocateClause &x) { diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 1e78cf359a213..d63f7a5aea3ab 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -414,14 +414,14 @@ void OmpStructureChecker::CheckMultListItems() { // Linear clause for (auto [_, clause] : FindClauses(llvm::omp::Clause::OMPC_linear)) { - const auto &linearClause{std::get(clause->u)}; + auto &linearClause{std::get(clause->u)}; std::list nameList; - common::visit( - [&](const auto &u) { - std::copy( - u.names.begin(), u.names.end(), std::back_inserter(nameList)); - }, - linearClause.v.u); + SymbolSourceMap symbols; + GetSymbolsInObjectList( + std::get(linearClause.v.t), symbols); + llvm::transform(symbols, std::back_inserter(nameList), [&](auto &&pair) { + return parser::Name{pair.second, const_cast(pair.first)}; + }); CheckMultipleOccurrence(listVars, nameList, clause->source, 
"LINEAR"); } } @@ -958,28 +958,13 @@ void OmpStructureChecker::CheckDistLinear( const auto &beginLoopDir{std::get(x.t)}; const auto &clauses{std::get(beginLoopDir.t)}; - semantics::UnorderedSymbolSet indexVars; + SymbolSourceMap indexVars; // Collect symbols of all the variables from linear clauses - for (const auto &clause : clauses.v) { - if (const auto *linearClause{ - std::get_if(&clause.u)}) { - - std::list values; - // Get the variant type - if (std::holds_alternative( - linearClause->v.u)) { - const auto &withM{ - std::get(linearClause->v.u)}; - values = withM.names; - } else { - const auto &withOutM{std::get( - linearClause->v.u)}; - values = withOutM.names; - } - for (auto const &v : values) { - indexVars.insert(*(v.symbol)); - } + for (auto &clause : clauses.v) { + if (auto *linearClause{std::get_if(&clause.u)}) { + auto &objects{std::get(linearClause->v.t)}; + GetSymbolsInObjectList(objects, indexVars); } } @@ -999,8 +984,8 @@ void OmpStructureChecker::CheckDistLinear( if (loop->IsDoNormal()) { const parser::Name &itrVal{GetLoopIndex(loop)}; if (itrVal.symbol) { - // Remove the symbol from the collcted set - indexVars.erase(*(itrVal.symbol)); + // Remove the symbol from the collected set + indexVars.erase(&itrVal.symbol->GetUltimate()); } collapseVal--; if (collapseVal == 0) { @@ -1016,12 +1001,10 @@ void OmpStructureChecker::CheckDistLinear( } // Show error for the remaining variables - for (auto var : indexVars) { - const Symbol &root{GetAssociationRoot(var)}; - context_.Say(parser::FindSourceLocation(x), - "Variable '%s' not allowed in `LINEAR` clause, only loop iterator " - "can be specified in `LINEAR` clause of a construct combined with " - "`DISTRIBUTE`"_err_en_US, + for (auto &[symbol, source] : indexVars) { + const Symbol &root{GetAssociationRoot(*symbol)}; + context_.Say(source, + "Variable '%s' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE"_err_en_US, root.name()); } 
} @@ -2841,7 +2824,6 @@ CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) CHECK_SIMPLE_CLAUSE(Hint, OMPC_hint) CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) -CHECK_SIMPLE_CLAUSE(InReduction, OMPC_in_reduction) CHECK_SIMPLE_CLAUSE(Match, OMPC_match) CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) @@ -2863,7 +2845,6 @@ CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) CHECK_SIMPLE_CLAUSE(Sizes, OMPC_sizes) CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) -CHECK_SIMPLE_CLAUSE(TaskReduction, OMPC_task_reduction) CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) @@ -2978,14 +2959,17 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Reduction &x) { if (OmpVerifyModifiers(x.v, llvm::omp::OMPC_reduction, GetContext().clauseSource, context_)) { - if (CheckReductionOperators(x)) { - CheckReductionTypeList(x); - } auto &modifiers{OmpGetModifiers(x.v)}; + const auto *ident{ + OmpGetUniqueModifier(modifiers)}; + assert(ident && "reduction-identifier is a required modifier"); + if (CheckReductionOperator(*ident, OmpGetModifierSource(modifiers, ident), + llvm::omp::OMPC_reduction)) { + CheckReductionObjectTypes(objects, *ident); + } using ReductionModifier = parser::OmpReductionModifier; - if (auto *maybeModifier{ - OmpGetUniqueModifier(modifiers)}) { - CheckReductionModifier(*maybeModifier); + if (auto *modifier{OmpGetUniqueModifier(modifiers)}) { + CheckReductionModifier(*modifier); } } CheckReductionObjects(objects, llvm::omp::Clause::OMPC_reduction); @@ -2997,70 +2981,88 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Reduction &x) { } } -bool OmpStructureChecker::CheckReductionOperators( - const parser::OmpClause::Reduction &x) { - bool ok = false; - auto &modifiers{OmpGetModifiers(x.v)}; - if (const auto *ident{ - OmpGetUniqueModifier(modifiers)}) { - - auto 
visitOperator{[&](const parser::DefinedOperator &dOpr) { - if (const auto *intrinsicOp{ - std::get_if( - &dOpr.u)}) { - ok = CheckIntrinsicOperator(*intrinsicOp); - } else { - context_.Say(GetContext().clauseSource, - "Invalid reduction operator in REDUCTION clause."_err_en_US, - ContextDirectiveAsFortran()); - } - }}; +void OmpStructureChecker::Enter(const parser::OmpClause::InReduction &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_in_reduction); + auto &objects{std::get(x.v.t)}; - auto visitDesignator{[&](const parser::ProcedureDesignator &procD) { - const parser::Name *name{std::get_if(&procD.u)}; - if (name && name->symbol) { - const SourceName &realName{name->symbol->GetUltimate().name()}; - if (realName == "max" || realName == "min" || realName == "iand" || - realName == "ior" || realName == "ieor") { - ok = true; - } - } - if (!ok) { + if (OmpVerifyModifiers(x.v, llvm::omp::OMPC_in_reduction, + GetContext().clauseSource, context_)) { + auto &modifiers{OmpGetModifiers(x.v)}; + const auto *ident{ + OmpGetUniqueModifier(modifiers)}; + assert(ident && "reduction-identifier is a required modifier"); + if (CheckReductionOperator(*ident, OmpGetModifierSource(modifiers, ident), + llvm::omp::OMPC_in_reduction)) { + CheckReductionObjectTypes(objects, *ident); + } + } + CheckReductionObjects(objects, llvm::omp::Clause::OMPC_in_reduction); +} + +void OmpStructureChecker::Enter(const parser::OmpClause::TaskReduction &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_task_reduction); + auto &objects{std::get(x.v.t)}; + + if (OmpVerifyModifiers(x.v, llvm::omp::OMPC_task_reduction, + GetContext().clauseSource, context_)) { + auto &modifiers{OmpGetModifiers(x.v)}; + const auto *ident{ + OmpGetUniqueModifier(modifiers)}; + assert(ident && "reduction-identifier is a required modifier"); + if (CheckReductionOperator(*ident, OmpGetModifierSource(modifiers, ident), + llvm::omp::OMPC_task_reduction)) { + CheckReductionObjectTypes(objects, *ident); + } + } + 
CheckReductionObjects(objects, llvm::omp::Clause::OMPC_task_reduction); +} + +bool OmpStructureChecker::CheckReductionOperator( + const parser::OmpReductionIdentifier &ident, parser::CharBlock source, + llvm::omp::Clause clauseId) { + auto visitOperator{[&](const parser::DefinedOperator &dOpr) { + if (const auto *intrinsicOp{ + std::get_if(&dOpr.u)}) { + switch (*intrinsicOp) { + case parser::DefinedOperator::IntrinsicOperator::Add: + case parser::DefinedOperator::IntrinsicOperator::Multiply: + case parser::DefinedOperator::IntrinsicOperator::AND: + case parser::DefinedOperator::IntrinsicOperator::OR: + case parser::DefinedOperator::IntrinsicOperator::EQV: + case parser::DefinedOperator::IntrinsicOperator::NEQV: + return true; + case parser::DefinedOperator::IntrinsicOperator::Subtract: context_.Say(GetContext().clauseSource, - "Invalid reduction identifier in REDUCTION " - "clause."_err_en_US, + "The minus reduction operator is deprecated since OpenMP 5.2 and is not supported in the REDUCTION clause."_err_en_US, ContextDirectiveAsFortran()); + return false; + default: + break; } - }}; - common::visit(common::visitors{visitOperator, visitDesignator}, ident->u); - } - - return ok; -} + } + context_.Say(source, "Invalid reduction operator in %s clause."_err_en_US, + parser::ToUpperCaseLetters(getClauseName(clauseId).str())); + return false; + }}; -bool OmpStructureChecker::CheckIntrinsicOperator( - const parser::DefinedOperator::IntrinsicOperator &op) { + auto visitDesignator{[&](const parser::ProcedureDesignator &procD) { + const parser::Name *name{std::get_if(&procD.u)}; + bool valid{false}; + if (name && name->symbol) { + const SourceName &realName{name->symbol->GetUltimate().name()}; + valid = + llvm::is_contained({"max", "min", "iand", "ior", "ieor"}, realName); + } + if (!valid) { + context_.Say(source, + "Invalid reduction identifier in %s clause."_err_en_US, + parser::ToUpperCaseLetters(getClauseName(clauseId).str())); + } + return valid; + }}; - switch (op) 
{ - case parser::DefinedOperator::IntrinsicOperator::Add: - case parser::DefinedOperator::IntrinsicOperator::Multiply: - case parser::DefinedOperator::IntrinsicOperator::AND: - case parser::DefinedOperator::IntrinsicOperator::OR: - case parser::DefinedOperator::IntrinsicOperator::EQV: - case parser::DefinedOperator::IntrinsicOperator::NEQV: - return true; - case parser::DefinedOperator::IntrinsicOperator::Subtract: - context_.Say(GetContext().clauseSource, - "The minus reduction operator is deprecated since OpenMP 5.2 and is " - "not supported in the REDUCTION clause."_err_en_US, - ContextDirectiveAsFortran()); - break; - default: - context_.Say(GetContext().clauseSource, - "Invalid reduction operator in REDUCTION clause."_err_en_US, - ContextDirectiveAsFortran()); - } - return false; + return common::visit( + common::visitors{visitOperator, visitDesignator}, ident.u); } /// Check restrictions on objects that are common to all reduction clauses. @@ -3074,7 +3076,7 @@ void OmpStructureChecker::CheckReductionObjects( for (const parser::OmpObject &object : objects.v) { CheckIfContiguous(object); } - CheckReductionArraySection(objects); + CheckReductionArraySection(objects, clauseId); // An object must be definable. CheckDefinableObjects(symbols, clauseId); // Procedure pointers are not allowed. @@ -3127,100 +3129,82 @@ void OmpStructureChecker::CheckReductionObjects( } static bool IsReductionAllowedForType( - const parser::OmpClause::Reduction &x, const DeclTypeSpec &type) { - auto &modifiers{OmpGetModifiers(x.v)}; - const auto *definedOp{ - OmpGetUniqueModifier(modifiers)}; - if (!definedOp) { - return false; - } - // TODO: user defined reduction operators. Just allow everything for now. 
- bool ok{true}; - - auto IsLogical{[](const DeclTypeSpec &type) -> bool { + const parser::OmpReductionIdentifier &ident, const DeclTypeSpec &type) { + auto isLogical{[](const DeclTypeSpec &type) -> bool { return type.category() == DeclTypeSpec::Logical; }}; - auto IsCharacter{[](const DeclTypeSpec &type) -> bool { + auto isCharacter{[](const DeclTypeSpec &type) -> bool { return type.category() == DeclTypeSpec::Character; }}; - common::visit( - common::visitors{ - [&](const parser::DefinedOperator &dOpr) { - if (const auto *intrinsicOp{ - std::get_if( - &dOpr.u)}) { - // OMP5.2: The type [...] of a list item that appears in a - // reduction clause must be valid for the combiner expression - // See F2023: Table 10.2 - // .LT., .LE., .GT., .GE. are handled as procedure designators - // below. - switch (*intrinsicOp) { - case parser::DefinedOperator::IntrinsicOperator::Multiply: - [[fallthrough]]; - case parser::DefinedOperator::IntrinsicOperator::Add: - [[fallthrough]]; - case parser::DefinedOperator::IntrinsicOperator::Subtract: - ok = type.IsNumeric(TypeCategory::Integer) || - type.IsNumeric(TypeCategory::Real) || - type.IsNumeric(TypeCategory::Complex); - break; - - case parser::DefinedOperator::IntrinsicOperator::AND: - [[fallthrough]]; - case parser::DefinedOperator::IntrinsicOperator::OR: - [[fallthrough]]; - case parser::DefinedOperator::IntrinsicOperator::EQV: - [[fallthrough]]; - case parser::DefinedOperator::IntrinsicOperator::NEQV: - ok = IsLogical(type); - break; + auto checkOperator{[&](const parser::DefinedOperator &dOpr) { + if (const auto *intrinsicOp{ + std::get_if(&dOpr.u)}) { + // OMP5.2: The type [...] of a list item that appears in a + // reduction clause must be valid for the combiner expression + // See F2023: Table 10.2 + // .LT., .LE., .GT., .GE. are handled as procedure designators + // below. 
+ switch (*intrinsicOp) { + case parser::DefinedOperator::IntrinsicOperator::Multiply: + case parser::DefinedOperator::IntrinsicOperator::Add: + case parser::DefinedOperator::IntrinsicOperator::Subtract: + return type.IsNumeric(TypeCategory::Integer) || + type.IsNumeric(TypeCategory::Real) || + type.IsNumeric(TypeCategory::Complex); + + case parser::DefinedOperator::IntrinsicOperator::AND: + case parser::DefinedOperator::IntrinsicOperator::OR: + case parser::DefinedOperator::IntrinsicOperator::EQV: + case parser::DefinedOperator::IntrinsicOperator::NEQV: + return isLogical(type); + + // Reduction identifier is not in OMP5.2 Table 5.2 + default: + DIE("This should have been caught in CheckIntrinsicOperator"); + return false; + } + } + return true; + }}; - // Reduction identifier is not in OMP5.2 Table 5.2 - default: - DIE("This should have been caught in CheckIntrinsicOperator"); - ok = false; - break; - } - } - }, - [&](const parser::ProcedureDesignator &procD) { - const parser::Name *name{std::get_if(&procD.u)}; - if (name && name->symbol) { - const SourceName &realName{name->symbol->GetUltimate().name()}; - // OMP5.2: The type [...] 
of a list item that appears in a - // reduction clause must be valid for the combiner expression - if (realName == "iand" || realName == "ior" || - realName == "ieor") { - // IAND: arguments must be integers: F2023 16.9.100 - // IEOR: arguments must be integers: F2023 16.9.106 - // IOR: arguments must be integers: F2023 16.9.111 - ok = type.IsNumeric(TypeCategory::Integer); - } else if (realName == "max" || realName == "min") { - // MAX: arguments must be integer, real, or character: - // F2023 16.9.135 - // MIN: arguments must be integer, real, or character: - // F2023 16.9.141 - ok = type.IsNumeric(TypeCategory::Integer) || - type.IsNumeric(TypeCategory::Real) || IsCharacter(type); - } - } - }, - }, - definedOp->u); + auto checkDesignator{[&](const parser::ProcedureDesignator &procD) { + const parser::Name *name{std::get_if(&procD.u)}; + if (name && name->symbol) { + const SourceName &realName{name->symbol->GetUltimate().name()}; + // OMP5.2: The type [...] of a list item that appears in a + // reduction clause must be valid for the combiner expression + if (realName == "iand" || realName == "ior" || realName == "ieor") { + // IAND: arguments must be integers: F2023 16.9.100 + // IEOR: arguments must be integers: F2023 16.9.106 + // IOR: arguments must be integers: F2023 16.9.111 + return type.IsNumeric(TypeCategory::Integer); + } else if (realName == "max" || realName == "min") { + // MAX: arguments must be integer, real, or character: + // F2023 16.9.135 + // MIN: arguments must be integer, real, or character: + // F2023 16.9.141 + return type.IsNumeric(TypeCategory::Integer) || + type.IsNumeric(TypeCategory::Real) || isCharacter(type); + } + } + // TODO: user defined reduction operators. Just allow everything for now. 
+ return true; + }}; - return ok; + return common::visit( + common::visitors{checkOperator, checkDesignator}, ident.u); } -void OmpStructureChecker::CheckReductionTypeList( - const parser::OmpClause::Reduction &x) { - const auto &ompObjectList{std::get(x.v.t)}; +void OmpStructureChecker::CheckReductionObjectTypes( + const parser::OmpObjectList &objects, + const parser::OmpReductionIdentifier &ident) { SymbolSourceMap symbols; - GetSymbolsInObjectList(ompObjectList, symbols); + GetSymbolsInObjectList(objects, symbols); for (auto &[symbol, source] : symbols) { if (auto *type{symbol->GetType()}) { - if (!IsReductionAllowedForType(x, *type)) { + if (!IsReductionAllowedForType(ident, *type)) { context_.Say(source, "The type of '%s' is incompatible with the reduction operator."_err_en_US, symbol->name()); @@ -3283,13 +3267,12 @@ void OmpStructureChecker::CheckReductionModifier( } void OmpStructureChecker::CheckReductionArraySection( - const parser::OmpObjectList &ompObjectList) { + const parser::OmpObjectList &ompObjectList, llvm::omp::Clause clauseId) { for (const auto &ompObject : ompObjectList.v) { if (const auto *dataRef{parser::Unwrap(ompObject)}) { if (const auto *arrayElement{ parser::Unwrap(ompObject)}) { - CheckArraySection(*arrayElement, GetLastName(*dataRef), - llvm::omp::Clause::OMPC_reduction); + CheckArraySection(*arrayElement, GetLastName(*dataRef), clauseId); } } } @@ -3669,17 +3652,63 @@ void OmpStructureChecker::Enter(const parser::OmpClause::If &x) { void OmpStructureChecker::Enter(const parser::OmpClause::Linear &x) { CheckAllowedClause(llvm::omp::Clause::OMPC_linear); + unsigned version{context_.langOptions().OpenMPVersion}; + llvm::omp::Directive dir{GetContext().directive}; + parser::CharBlock clauseSource{GetContext().clauseSource}; + const parser::OmpLinearModifier *linearMod{nullptr}; - parser::CharBlock source{GetContext().clauseSource}; - // 2.7 Loop Construct Restriction - if ((llvm::omp::allDoSet | llvm::omp::allSimdSet) - 
.test(GetContext().directive)) { - if (std::holds_alternative(x.v.u)) { + SymbolSourceMap symbols; + auto &objects{std::get(x.v.t)}; + GetSymbolsInObjectList(objects, symbols); + + auto CheckIntegerNoRef{[&](const Symbol *symbol, parser::CharBlock source) { + if (!symbol->GetType()->IsNumeric(TypeCategory::Integer)) { + auto &desc{OmpGetDescriptor()}; context_.Say(source, - "A modifier may not be specified in a LINEAR clause " - "on the %s directive"_err_en_US, - ContextDirectiveAsFortran()); - return; + "The list item '%s' specified without the REF '%s' must be of INTEGER type"_err_en_US, + symbol->name(), desc.name.str()); + } + }}; + + if (OmpVerifyModifiers(x.v, llvm::omp::OMPC_linear, clauseSource, context_)) { + auto &modifiers{OmpGetModifiers(x.v)}; + linearMod = OmpGetUniqueModifier(modifiers); + if (linearMod) { + // 2.7 Loop Construct Restriction + if ((llvm::omp::allDoSet | llvm::omp::allSimdSet).test(dir)) { + context_.Say(clauseSource, + "A modifier may not be specified in a LINEAR clause on the %s directive"_err_en_US, + ContextDirectiveAsFortran()); + return; + } + + auto &desc{OmpGetDescriptor()}; + for (auto &[symbol, source] : symbols) { + if (linearMod->v != parser::OmpLinearModifier::Value::Ref) { + CheckIntegerNoRef(symbol, source); + } else { + if (!IsAllocatable(*symbol) && !IsAssumedShape(*symbol) && + !IsPolymorphic(*symbol)) { + context_.Say(source, + "The list item `%s` specified with the REF '%s' must be polymorphic variable, assumed-shape array, or a variable with the `ALLOCATABLE` attribute"_err_en_US, + symbol->name(), desc.name.str()); + } + } + if (linearMod->v == parser::OmpLinearModifier::Value::Ref || + linearMod->v == parser::OmpLinearModifier::Value::Uval) { + if (!IsDummy(*symbol) || IsValue(*symbol)) { + context_.Say(source, + "If the `%s` is REF or UVAL, the list item '%s' must be a dummy argument without the VALUE attribute"_err_en_US, + desc.name.str(), symbol->name()); + } + } + } // for (symbol, source) + + if (version 
>= 52 && !std::get(x.v.t)) { + context_.Say(OmpGetModifierSource(modifiers, linearMod), + "The 'modifier()' syntax is deprecated in %s, use ' : modifier' instead"_warn_en_US, + ThisVersion(version)); + } } } @@ -3692,73 +3721,28 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Linear &x) { } } - auto checkForValidLinearClause_01 = [&](const parser::Name &name, - bool is_ref) { - std::string listItemName{name.ToString()}; - if (!is_ref && !name.symbol->GetType()->IsNumeric(TypeCategory::Integer)) { - context_.Say(source, - "The list item `%s` specified with other than linear-modifier `REF` must be of type INTEGER"_err_en_US, - listItemName); + // OpenMP 5.2: Linear clause Restrictions + for (auto &[symbol, source] : symbols) { + if (!linearMod) { + // Already checked this with the modifier present. + CheckIntegerNoRef(symbol, source); } - if (GetContext().directive == llvm::omp::Directive::OMPD_declare_simd && - !IsDummy(*name.symbol)) { + if (dir == llvm::omp::Directive::OMPD_declare_simd && !IsDummy(*symbol)) { context_.Say(source, "The list item `%s` must be a dummy argument"_err_en_US, - listItemName); + symbol->name()); } - if (IsPointer(*name.symbol) || - name.symbol->test(Symbol::Flag::CrayPointer)) { + if (IsPointer(*symbol) || symbol->test(Symbol::Flag::CrayPointer)) { context_.Say(source, "The list item `%s` in a LINEAR clause must not be Cray Pointer or a variable with POINTER attribute"_err_en_US, - listItemName); + symbol->name()); } - if (FindCommonBlockContaining(*name.symbol)) { + if (FindCommonBlockContaining(*symbol)) { context_.Say(source, "'%s' is a common block name and must not appear in an LINEAR clause"_err_en_US, - listItemName); - } - }; - - auto checkForValidLinearClause_02 = [&](const parser::Name &name, - const parser::OmpLinearModifier::Value - &modifierValue) { - std::string listItemName{name.ToString()}; - checkForValidLinearClause_01( - name, (modifierValue == parser::OmpLinearModifier::Value::Ref)); - if (modifierValue != 
parser::OmpLinearModifier::Value::Val && - IsDummy(*name.symbol) && IsValue(*name.symbol)) { - context_.Say(source, - "The list item `%s` specified with the linear-modifier `REF` or `UVAL` must be a dummy argument without `VALUE` attribute"_err_en_US, - listItemName); - } - if (modifierValue == parser::OmpLinearModifier::Value::Ref && - !(IsAllocatable(*name.symbol) || IsAssumedShape(*name.symbol) || - IsPolymorphic(*name.symbol))) { - context_.Say(source, - "The list item `%s` specified with the linear-modifier `REF` must be polymorphic variable, assumed-shape array, or a variable with the `ALLOCATABLE` attribute"_err_en_US, - listItemName); + symbol->name()); } - }; - - // OpenMP 5.2: Linear clause Restrictions - common::visit( - common::visitors{ - [&](const parser::OmpLinearClause::WithoutModifier &withoutModifier) { - for (const auto &name : withoutModifier.names) { - if (name.symbol) { - checkForValidLinearClause_01(name, false); - } - } - }, - [&](const parser::OmpLinearClause::WithModifier &withModifier) { - for (const auto &name : withModifier.names) { - if (name.symbol) { - checkForValidLinearClause_02(name, withModifier.modifier.v); - } - } - }, - }, - x.v.u); + } } void OmpStructureChecker::CheckAllowedMapTypes( diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index e28e5f6d7b0d5..f3592b2160fac 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -230,10 +230,10 @@ class OmpStructureChecker std::int64_t GetOrdCollapseLevel(const parser::OpenMPLoopConstruct &x); void CheckReductionObjects( const parser::OmpObjectList &objects, llvm::omp::Clause clauseId); - bool CheckReductionOperators(const parser::OmpClause::Reduction &); - bool CheckIntrinsicOperator( - const parser::DefinedOperator::IntrinsicOperator &); - void CheckReductionTypeList(const parser::OmpClause::Reduction &); + bool CheckReductionOperator(const parser::OmpReductionIdentifier &ident, + 
parser::CharBlock source, llvm::omp::Clause clauseId); + void CheckReductionObjectTypes(const parser::OmpObjectList &objects, + const parser::OmpReductionIdentifier &ident); void CheckReductionModifier(const parser::OmpReductionModifier &); void CheckMasterNesting(const parser::OpenMPBlockConstruct &x); void ChecksOnOrderedAsBlock(); @@ -241,7 +241,8 @@ class OmpStructureChecker void CheckScan(const parser::OpenMPSimpleStandaloneConstruct &x); void ChecksOnOrderedAsStandalone(); void CheckOrderedDependClause(std::optional orderedValue); - void CheckReductionArraySection(const parser::OmpObjectList &ompObjectList); + void CheckReductionArraySection( + const parser::OmpObjectList &ompObjectList, llvm::omp::Clause clauseId); void CheckArraySection(const parser::ArrayElement &arrayElement, const parser::Name &name, const llvm::omp::Clause clause); void CheckSharedBindingInOuterContext( diff --git a/flang/lib/Semantics/openmp-modifiers.cpp b/flang/lib/Semantics/openmp-modifiers.cpp index f8f81e6c6ffa1..9f2896229bb7f 100644 --- a/flang/lib/Semantics/openmp-modifiers.cpp +++ b/flang/lib/Semantics/openmp-modifiers.cpp @@ -407,6 +407,39 @@ const OmpModifierDescriptor &OmpGetDescriptor() { return desc; } +template <> +const OmpModifierDescriptor & +OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"step-complex-modifier", + /*props=*/ + { + {52, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {52, {Clause::OMPC_linear}}, + }, + }; + return desc; +} + +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"step-simple-modifier", + /*props=*/ + { + {45, {OmpProperty::Unique, OmpProperty::Exclusive}}, + }, + /*clauses=*/ + { + {45, {Clause::OMPC_linear}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor &OmpGetDescriptor() { static const OmpModifierDescriptor desc{ diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp 
index 80a086acebba2..39478b58a9070 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -502,19 +502,8 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { return false; } bool Pre(const parser::OmpLinearClause &x) { - common::visit(common::visitors{ - [&](const parser::OmpLinearClause::WithoutModifier - &linearWithoutModifier) { - ResolveOmpNameList(linearWithoutModifier.names, - Symbol::Flag::OmpLinear); - }, - [&](const parser::OmpLinearClause::WithModifier - &linearWithModifier) { - ResolveOmpNameList( - linearWithModifier.names, Symbol::Flag::OmpLinear); - }, - }, - x.u); + auto &objects{std::get(x.t)}; + ResolveOmpObjectList(objects, Symbol::Flag::OmpLinear); return false; } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index b576f59e8c7e5..aef2898919f3f 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -8953,6 +8953,18 @@ void ResolveNamesVisitor::FinishSpecificationPart( misparsedStmtFuncFound_ = false; funcResultStack().CompleteFunctionResultType(); CheckImports(); + bool inDeviceSubprogram = false; + if (auto *subp{currScope().symbol() + ? currScope().symbol()->detailsIf() + : nullptr}) { + if (auto attrs{subp->cudaSubprogramAttrs()}) { + if (*attrs != common::CUDASubprogramAttrs::Device || + *attrs != common::CUDASubprogramAttrs::Global || + *attrs != common::CUDASubprogramAttrs::Grid_Global) { + inDeviceSubprogram = true; + } + } + } for (auto &pair : currScope()) { auto &symbol{*pair.second}; if (inInterfaceBlock()) { @@ -8961,6 +8973,14 @@ void ResolveNamesVisitor::FinishSpecificationPart( if (NeedsExplicitType(symbol)) { ApplyImplicitRules(symbol); } + if (inDeviceSubprogram && IsDummy(symbol) && + symbol.has()) { + auto *dummy{symbol.detailsIf()}; + if (!dummy->cudaDataAttr()) { + // Implicitly set device attribute if none is set in device context. 
+ dummy->set_cudaDataAttr(common::CUDADataAttr::Device); + } + } if (IsDummy(symbol) && isImplicitNoneType() && symbol.test(Symbol::Flag::Implicit) && !context().HasError(symbol)) { Say(symbol.name(), diff --git a/flang/test/HLFIR/shapeof.fir b/flang/test/HLFIR/shapeof.fir index b91efc276b62e..43e22dd320c18 100644 --- a/flang/test/HLFIR/shapeof.fir +++ b/flang/test/HLFIR/shapeof.fir @@ -27,3 +27,21 @@ func.func @shapeof2(%arg0: !hlfir.expr) -> !fir.shape<2> { // CHECK-ALL: %[[EXPR:.*]]: !hlfir.expr // CHECK-ALL-NEXT: %[[SHAPE:.*]] = hlfir.shape_of %[[EXPR]] : (!hlfir.expr) -> !fir.shape<2> // CHECK-ALL-NEXT: return %[[SHAPE]] + +// Checks hlfir.elemental -> hlfir.shape_of folding +func.func @shapeof_fold1(%extent: index) -> !fir.shape<1> { + %shape1 = fir.shape %extent : (index) -> !fir.shape<1> + %elem = hlfir.elemental %shape1 : (!fir.shape<1>) -> !hlfir.expr { + hlfir.yield_element %extent : index + } + %shape2 = hlfir.shape_of %elem : (!hlfir.expr) -> !fir.shape<1> + return %shape2 : !fir.shape<1> +} +// CHECK-ALL-LABEL: func.func @shapeof_fold1( +// CHECK-ALL-SAME: %[[VAL_0:.*]]: index) -> !fir.shape<1> { +// CHECK-CANON-NEXT: %[[VAL_1:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1> +// CHECK-CANON-NEXT: %[[VAL_2:.*]] = hlfir.elemental %[[VAL_1]] : (!fir.shape<1>) -> !hlfir.expr { +// CHECK-CANON-NEXT: hlfir.yield_element %[[VAL_0]] : index +// CHECK-CANON-NEXT: } +// CHECK-CANON-NEXT: return %[[VAL_1]] : !fir.shape<1> +// CHECK-CANON-NEXT: } diff --git a/flang/test/Parser/OpenMP/in-reduction-clause.f90 b/flang/test/Parser/OpenMP/in-reduction-clause.f90 index ab26ca2d9300f..8a0bede62f03f 100644 --- a/flang/test/Parser/OpenMP/in-reduction-clause.f90 +++ b/flang/test/Parser/OpenMP/in-reduction-clause.f90 @@ -5,16 +5,16 @@ subroutine omp_in_reduction_taskgroup() integer :: z, i - !CHECK: !$OMP TASKGROUP TASK_REDUCTION(+:z) + !CHECK: !$OMP TASKGROUP TASK_REDUCTION(+: z) !$omp taskgroup task_reduction(+:z) - !CHECK-NEXT: !$OMP TASK IN_REDUCTION(+:z) + 
!CHECK-NEXT: !$OMP TASK IN_REDUCTION(+: z) !$omp task in_reduction(+:z) !CHECK-NEXT: z=z+5_4 z = z + 5 !CHECK-NEXT: !$OMP END TASK !$omp end task - !CHECK-NEXT: !$OMP TASKLOOP IN_REDUCTION(+:z) + !CHECK-NEXT: !$OMP TASKLOOP IN_REDUCTION(+: z) !$omp taskloop in_reduction(+:z) !CHECK-NEXT: DO i=1_4,10_4 do i=1,10 @@ -31,7 +31,7 @@ end subroutine omp_in_reduction_taskgroup !PARSE-TREE: OpenMPConstruct -> OpenMPBlockConstruct !PARSE-TREE-NEXT: OmpBeginBlockDirective !PARSE-TREE-NEXT: OmpBlockDirective -> llvm::omp::Directive = taskgroup -!PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> TaskReduction -> OmpReductionClause +!PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> TaskReduction -> OmpTaskReductionClause !PARSE-TREE: OpenMPConstruct -> OpenMPBlockConstruct !PARSE-TREE-NEXT: OmpBeginBlockDirective @@ -49,9 +49,9 @@ end subroutine omp_in_reduction_taskgroup subroutine omp_in_reduction_parallel() integer :: z - !CHECK: !$OMP PARALLEL REDUCTION(+:z) + !CHECK: !$OMP PARALLEL REDUCTION(+: z) !$omp parallel reduction(+:z) - !CHECK-NEXT: !$OMP TASKLOOP SIMD IN_REDUCTION(+:z) + !CHECK-NEXT: !$OMP TASKLOOP SIMD IN_REDUCTION(+: z) !$omp taskloop simd in_reduction(+:z) !CHECK-NEXT: DO i=1_4,10_4 do i=1,10 diff --git a/flang/test/Parser/OpenMP/linear-clause.f90 b/flang/test/Parser/OpenMP/linear-clause.f90 new file mode 100644 index 0000000000000..5f031b0694149 --- /dev/null +++ b/flang/test/Parser/OpenMP/linear-clause.f90 @@ -0,0 +1,117 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=52 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=52 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00(x) + integer :: x + !$omp do linear(x) + do x = 1, 10 + enddo + !$omp end do +end + +!UNPARSE: SUBROUTINE f00 (x) +!UNPARSE: INTEGER x +!UNPARSE: !$OMP DO LINEAR(x) +!UNPARSE: DO x=1_4,10_4 +!UNPARSE: END DO +!UNPARSE: !$OMP END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: 
OmpBeginLoopDirective +!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | bool = 'true' +!PARSE-TREE: DoConstruct + +subroutine f01(x) + integer :: x + !$omp do linear(x : 2) + do x = 1, 10 + enddo + !$omp end do +end + +!UNPARSE: SUBROUTINE f01 (x) +!UNPARSE: INTEGER x +!UNPARSE: !$OMP DO LINEAR(x: 2_4) +!UNPARSE: DO x=1_4,10_4 +!UNPARSE: END DO +!UNPARSE: !$OMP END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginLoopDirective +!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | Modifier -> OmpStepSimpleModifier -> Scalar -> Integer -> Expr = '2_4' +!PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '2' +!PARSE-TREE: | | bool = 'true' +!PARSE-TREE: DoConstruct + +subroutine f02(x) + integer :: x + !$omp do linear(x : step(3)) + do x = 1, 10 + enddo + !$omp end do +end + +!UNPARSE: SUBROUTINE f02 (x) +!UNPARSE: INTEGER x +!UNPARSE: !$OMP DO LINEAR(x: STEP(3_4)) +!UNPARSE: DO x=1_4,10_4 +!UNPARSE: END DO +!UNPARSE: !$OMP END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginLoopDirective +!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | Modifier -> OmpStepComplexModifier -> Scalar -> Integer -> Expr = '3_4' +!PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '3' +!PARSE-TREE: | | bool = 'true' +!PARSE-TREE: DoConstruct + +subroutine f03(x) + integer :: x + !$omp declare simd linear(x : uval) +end + +!UNPARSE: SUBROUTINE f03 (x) +!UNPARSE: INTEGER x +!UNPARSE: !$OMP DECLARE SIMD LINEAR(x: UVAL) 
+!UNPARSE: END SUBROUTINE + +!PARSE-TREE: SpecificationPart +![...] +!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareSimdConstruct +!PARSE-TREE: | | Verbatim +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause +!PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | Modifier -> OmpLinearModifier -> Value = Uval +!PARSE-TREE: | | | bool = 'true' +!PARSE-TREE: ExecutionPart -> Block + +subroutine f04(x) + integer :: x + !$omp declare simd linear(x : uval, step(3)) +end + +!UNPARSE: SUBROUTINE f04 (x) +!UNPARSE: INTEGER x +!UNPARSE: !$OMP DECLARE SIMD LINEAR(x: UVAL, STEP(3_4)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: SpecificationPart +![...] +!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareSimdConstruct +!PARSE-TREE: | | Verbatim +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Linear -> OmpLinearClause +!PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | Modifier -> OmpLinearModifier -> Value = Uval +!PARSE-TREE: | | | Modifier -> OmpStepComplexModifier -> Scalar -> Integer -> Expr = '3_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '3' +!PARSE-TREE: | | | bool = 'true' +!PARSE-TREE: ExecutionPart -> Block diff --git a/flang/test/Parser/OpenMP/reduction-modifier.f90 b/flang/test/Parser/OpenMP/reduction-modifier.f90 index 64cd452e839e7..56303af66395e 100644 --- a/flang/test/Parser/OpenMP/reduction-modifier.f90 +++ b/flang/test/Parser/OpenMP/reduction-modifier.f90 @@ -4,7 +4,7 @@ subroutine foo() integer :: i, j j = 0 -! CHECK: !$OMP DO REDUCTION(TASK, *:j) +! CHECK: !$OMP DO REDUCTION(TASK, *: j) ! PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct ! PARSE-TREE: | | | OmpBeginLoopDirective ! 
PARSE-TREE: | | | | OmpLoopDirective -> llvm::omp::Directive = do diff --git a/flang/test/Parser/OpenMP/task-reduction-clause.f90 b/flang/test/Parser/OpenMP/task-reduction-clause.f90 new file mode 100644 index 0000000000000..248ff7918dbe5 --- /dev/null +++ b/flang/test/Parser/OpenMP/task-reduction-clause.f90 @@ -0,0 +1,23 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=50 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=50 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00 + integer :: x +!$omp taskgroup task_reduction(+: x) + x = x + 1 +!$omp end taskgroup +end + +!UNPARSE: SUBROUTINE f00 +!UNPARSE: INTEGER x +!UNPARSE: !$OMP TASKGROUP TASK_REDUCTION(+: x) +!UNPARSE: x=x+1_4 +!UNPARSE: !$OMP END TASKGROUP +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginBlockDirective +!PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = taskgroup +!PARSE-TREE: | OmpClauseList -> OmpClause -> TaskReduction -> OmpTaskReductionClause +!PARSE-TREE: | | Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: Block diff --git a/flang/test/Preprocessing/directive-contin-with-pp.F90 b/flang/test/Preprocessing/directive-contin-with-pp.F90 index 544c6619f6b53..6e84c2bde52f9 100644 --- a/flang/test/Preprocessing/directive-contin-with-pp.F90 +++ b/flang/test/Preprocessing/directive-contin-with-pp.F90 @@ -70,13 +70,13 @@ subroutine s3 !CHECK: !DIR$ IGNORE_TKR x5 !CHECK: !DIR$ IGNORE_TKR x6 !CHECK: STOP 1_4 -!CHECK: !$OMP PARALLEL DO REDUCTION(+:x) +!CHECK: !$OMP PARALLEL DO REDUCTION(+: x) !CHECK: DO j1=1_4,n !CHECK: END DO -!CHECK: !$OMP PARALLEL DO REDUCTION(+:x) +!CHECK: !$OMP PARALLEL DO REDUCTION(+: x) !CHECK: DO j2=1_4,n !CHECK: END DO -!CHECK: !$OMP PARALLEL DO REDUCTION(+:x) +!CHECK: !$OMP PARALLEL DO REDUCTION(+: x) !CHECK: DO j3=1_4,n !CHECK: END DO 
!CHECK: END SUBROUTINE diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 66e11e4b540f0..e8114154a809b 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -221,11 +221,19 @@ !ERROR: Clause LINEAR is not allowed if clause ORDERED appears on the DO directive !ERROR: The parameter of the ORDERED clause must be a constant positive integer expression + !ERROR: 'b' appears in more than one data-sharing clause on the same OpenMP directive !$omp do ordered(1-1) private(b) linear(b) linear(a) do i = 1, N a = 3.14 enddo + !ERROR: Clause LINEAR is not allowed if clause ORDERED appears on the DO directive + !ERROR: The parameter of the ORDERED clause must be a constant positive integer expression + !$omp do ordered(1-1) linear(a) + do i = 1, N + a = 3.14 + enddo + !ERROR: The parameter of the ORDERED clause must be greater than or equal to the parameter of the COLLAPSE clause !$omp do collapse(num-14) ordered(1) do i = 1, N diff --git a/flang/test/Semantics/OpenMP/in-reduction.f90 b/flang/test/Semantics/OpenMP/in-reduction.f90 new file mode 100644 index 0000000000000..1b82134b7104b --- /dev/null +++ b/flang/test/Semantics/OpenMP/in-reduction.f90 @@ -0,0 +1,70 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=50 + +subroutine f00 + real :: x +!ERROR: The type of 'x' is incompatible with the reduction operator. +!$omp target in_reduction(.or.: x) +!$omp end target +end + +subroutine f01 + real :: x +!ERROR: Invalid reduction operator in IN_REDUCTION clause. 
+!$omp target in_reduction(.not.: x) +!$omp end target +end + +subroutine f02(p) + integer, pointer, intent(in) :: p +!ERROR: Pointer 'p' with the INTENT(IN) attribute may not appear in a IN_REDUCTION clause +!$omp target in_reduction(+: p) +!$omp end target +end + +subroutine f03 + common /c/ a, b +!ERROR: Common block names are not allowed in IN_REDUCTION clause +!$omp target in_reduction(+: /c/) +!$omp end target +end + +subroutine f04 + integer :: x(10) +!ERROR: Reference to 'x' must be a contiguous object +!$omp target in_reduction(+: x(1:10:2)) +!$omp end target +end + +subroutine f05 + integer :: x(10) +!ERROR: 'x' in IN_REDUCTION clause is a zero size array section +!$omp target in_reduction(+: x(1:0)) +!$omp end target +end + +subroutine f06 + type t + integer :: a(10) + end type + type(t) :: x +!ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier +!$omp target in_reduction(+: x%a(2)) +!$omp end target +end + +subroutine f07 + type t + integer :: a(10) + end type + type(t) :: x +!ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier +!$omp target in_reduction(+: x%a(1:10)) +!$omp end target +end + +subroutine f08 + integer :: x +!ERROR: Type parameter inquiry is not permitted in IN_REDUCTION clause +!$omp target in_reduction(+: x%kind) +!$omp end target +end diff --git a/flang/test/Semantics/OpenMP/linear-clause01.f90 b/flang/test/Semantics/OpenMP/linear-clause01.f90 index 654aa07f5bd40..f95e834c9026c 100644 --- a/flang/test/Semantics/OpenMP/linear-clause01.f90 +++ b/flang/test/Semantics/OpenMP/linear-clause01.f90 @@ -16,25 +16,31 @@ end subroutine linear_clause_01 ! 
Case 2 subroutine linear_clause_02(arg_01, arg_02) - !ERROR: The list item `arg_01` specified with other than linear-modifier `REF` must be of type INTEGER + !ERROR: The list item 'arg_01' specified without the REF 'linear-modifier' must be of INTEGER type !$omp declare simd linear(val(arg_01)) real, intent(in) :: arg_01(:) - !ERROR: The list item `arg_02` specified with the linear-modifier `REF` or `UVAL` must be a dummy argument without `VALUE` attribute + !ERROR: The list item 'arg_02' specified without the REF 'linear-modifier' must be of INTEGER type + !ERROR: If the `linear-modifier` is REF or UVAL, the list item 'arg_02' must be a dummy argument without the VALUE attribute !$omp declare simd linear(uval(arg_02)) + !ERROR: The type of 'arg_02' has already been implicitly declared integer, value, intent(in) :: arg_02 + !ERROR: The list item 'var' specified without the REF 'linear-modifier' must be of INTEGER type + !ERROR: If the `linear-modifier` is REF or UVAL, the list item 'var' must be a dummy argument without the VALUE attribute !ERROR: The list item `var` must be a dummy argument !ERROR: The list item `var` in a LINEAR clause must not be Cray Pointer or a variable with POINTER attribute !$omp declare simd linear(uval(var)) + !ERROR: The type of 'var' has already been implicitly declared integer, pointer :: var end subroutine linear_clause_02 ! 
Case 3 subroutine linear_clause_03(arg) integer, intent(in) :: arg - !ERROR: The list item `arg` specified with the linear-modifier `REF` must be polymorphic variable, assumed-shape array, or a variable with the `ALLOCATABLE` attribute + !ERROR: The list item `arg` specified with the REF 'linear-modifier' must be polymorphic variable, assumed-shape array, or a variable with the `ALLOCATABLE` attribute !ERROR: List item 'arg' present at multiple LINEAR clauses + !ERROR: 'arg' appears in more than one data-sharing clause on the same OpenMP directive !$omp declare simd linear(ref(arg)) linear(arg) integer :: i diff --git a/flang/test/Semantics/OpenMP/linear-clause02.f90 b/flang/test/Semantics/OpenMP/linear-clause02.f90 new file mode 100644 index 0000000000000..695d61715820f --- /dev/null +++ b/flang/test/Semantics/OpenMP/linear-clause02.f90 @@ -0,0 +1,13 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +subroutine f00(x) + integer :: x + !WARNING: The 'modifier()' syntax is deprecated in OpenMP v5.2, use ' : modifier' instead + !$omp declare simd linear(uval(x)) +end + +subroutine f01(x) + integer :: x + !ERROR: An exclusive 'step-simple-modifier' modifier cannot be specified together with a modifier of a different type + !$omp declare simd linear(uval(x) : 2) +end diff --git a/flang/test/Semantics/OpenMP/linear-iter.f90 b/flang/test/Semantics/OpenMP/linear-iter.f90 index 8102c1a03cd37..1f40228be92ad 100644 --- a/flang/test/Semantics/OpenMP/linear-iter.f90 +++ b/flang/test/Semantics/OpenMP/linear-iter.f90 @@ -20,7 +20,7 @@ SUBROUTINE LINEAR_BAD(N) !$omp target !$omp teams - !ERROR: Variable 'j' not allowed in `LINEAR` clause, only loop iterator can be specified in `LINEAR` clause of a construct combined with `DISTRIBUTE` + !ERROR: Variable 'j' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE !$omp distribute parallel do simd linear(j) do i = 1, N a = 3.14 @@ -31,8 
+31,8 @@ SUBROUTINE LINEAR_BAD(N) !$omp target !$omp teams - !ERROR: Variable 'j' not allowed in `LINEAR` clause, only loop iterator can be specified in `LINEAR` clause of a construct combined with `DISTRIBUTE` - !ERROR: Variable 'b' not allowed in `LINEAR` clause, only loop iterator can be specified in `LINEAR` clause of a construct combined with `DISTRIBUTE` + !ERROR: Variable 'j' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE + !ERROR: Variable 'b' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE !$omp distribute parallel do simd linear(j) linear(b) do i = 1, N a = 3.14 @@ -43,8 +43,8 @@ SUBROUTINE LINEAR_BAD(N) !$omp target !$omp teams - !ERROR: Variable 'j' not allowed in `LINEAR` clause, only loop iterator can be specified in `LINEAR` clause of a construct combined with `DISTRIBUTE` - !ERROR: Variable 'b' not allowed in `LINEAR` clause, only loop iterator can be specified in `LINEAR` clause of a construct combined with `DISTRIBUTE` + !ERROR: Variable 'j' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE + !ERROR: Variable 'b' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE !$omp distribute parallel do simd linear(j, b) do i = 1, N a = 3.14 @@ -54,7 +54,7 @@ SUBROUTINE LINEAR_BAD(N) !$omp end target !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region. 
- !ERROR: Variable 'j' not allowed in `LINEAR` clause, only loop iterator can be specified in `LINEAR` clause of a construct combined with `DISTRIBUTE` + !ERROR: Variable 'j' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE !$omp distribute simd linear(i,j) do i = 1, N do j = 1, N @@ -64,7 +64,7 @@ SUBROUTINE LINEAR_BAD(N) !$omp end distribute simd !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region. - !ERROR: Variable 'j' not allowed in `LINEAR` clause, only loop iterator can be specified in `LINEAR` clause of a construct combined with `DISTRIBUTE` + !ERROR: Variable 'j' not allowed in LINEAR clause, only loop iterator can be specified in LINEAR clause of a construct combined with DISTRIBUTE !$omp distribute simd linear(i,j) collapse(1) do i = 1, N do j = 1, N diff --git a/flang/test/Semantics/OpenMP/symbol08.f90 b/flang/test/Semantics/OpenMP/symbol08.f90 index 69ccd17391b54..80ae1c6d2242b 100644 --- a/flang/test/Semantics/OpenMP/symbol08.f90 +++ b/flang/test/Semantics/OpenMP/symbol08.f90 @@ -130,13 +130,14 @@ subroutine dotprod (b, c, n, block_size, num_teams, block_threads) !REF: /dotprod/sum sum = 0.0e0 !$omp target map(to:b,c) map(tofrom:sum) -!$omp teams num_teams(num_teams) thread_limit(block_threads) reduction(+:sum) +!$omp teams num_teams(num_teams) thread_limit(block_threads) reduction(+: sum& +!$OMP&) !$omp distribute !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0 (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) !REF: /dotprod/n !REF: /dotprod/block_size do i0=1,n,block_size -!$omp parallel do reduction(+:sum) +!$omp parallel do reduction(+: sum) !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0 HostAssoc INTEGER(4) !DEF: /dotprod/min ELEMENTAL, INTRINSIC, PURE (Function) 
ProcEntity diff --git a/flang/test/Semantics/OpenMP/task-reduction.f90 b/flang/test/Semantics/OpenMP/task-reduction.f90 new file mode 100644 index 0000000000000..5a18ee48e7728 --- /dev/null +++ b/flang/test/Semantics/OpenMP/task-reduction.f90 @@ -0,0 +1,70 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=50 + +subroutine f00 + real :: x +!ERROR: The type of 'x' is incompatible with the reduction operator. +!$omp taskgroup task_reduction(.or.: x) +!$omp end taskgroup +end + +subroutine f01 + real :: x +!ERROR: Invalid reduction operator in TASK_REDUCTION clause. +!$omp taskgroup task_reduction(.not.: x) +!$omp end taskgroup +end + +subroutine f02(p) + integer, pointer, intent(in) :: p +!ERROR: Pointer 'p' with the INTENT(IN) attribute may not appear in a TASK_REDUCTION clause +!$omp taskgroup task_reduction(+: p) +!$omp end taskgroup +end + +subroutine f03 + common /c/ a, b +!ERROR: Common block names are not allowed in TASK_REDUCTION clause +!$omp taskgroup task_reduction(+: /c/) +!$omp end taskgroup +end + +subroutine f04 + integer :: x(10) +!ERROR: Reference to 'x' must be a contiguous object +!$omp taskgroup task_reduction(+: x(1:10:2)) +!$omp end taskgroup +end + +subroutine f05 + integer :: x(10) +!ERROR: 'x' in TASK_REDUCTION clause is a zero size array section +!$omp taskgroup task_reduction(+: x(1:0)) +!$omp end taskgroup +end + +subroutine f06 + type t + integer :: a(10) + end type + type(t) :: x +!ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier +!$omp taskgroup task_reduction(+: x%a(2)) +!$omp end taskgroup +end + +subroutine f07 + type t + integer :: a(10) + end type + type(t) :: x +!ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier +!$omp taskgroup task_reduction(+: x%a(1:10)) +!$omp end taskgroup +end + +subroutine f08 + integer :: x +!ERROR: Type parameter inquiry is not permitted in TASK_REDUCTION clause +!$omp 
taskgroup task_reduction(+: x%kind) +!$omp end taskgroup +end diff --git a/flang/test/Semantics/OpenMP/taskgroup01.f90 b/flang/test/Semantics/OpenMP/taskgroup01.f90 index e05051387411a..ded5d47525af4 100644 --- a/flang/test/Semantics/OpenMP/taskgroup01.f90 +++ b/flang/test/Semantics/OpenMP/taskgroup01.f90 @@ -41,6 +41,8 @@ !$omp task !$omp taskgroup task_reduction(+ : reduction_var) print *, "The " + !ERROR: The type of 'reduction_var' is incompatible with the reduction operator. + !ERROR: The type of 'reduction_var' is incompatible with the reduction operator. !$omp taskgroup task_reduction(.or. : reduction_var) task_reduction(.and. : reduction_var) print *, "almighty sun" !$omp end taskgroup diff --git a/flang/test/Semantics/modfile55.cuf b/flang/test/Semantics/modfile55.cuf index cf01bdd5f58f6..6c0d152a382a8 100644 --- a/flang/test/Semantics/modfile55.cuf +++ b/flang/test/Semantics/modfile55.cuf @@ -29,6 +29,7 @@ end !contains !attributes(global) subroutine globsub(x,y,z) !real(4),value::x +!attributes(device) x !real(4)::y !attributes(device) y !real(4)::z diff --git a/flang/unittests/Runtime/AccessTest.cpp b/flang/unittests/Runtime/AccessTest.cpp index 66f19f78c7cfb..c2a2d7d398220 100644 --- a/flang/unittests/Runtime/AccessTest.cpp +++ b/flang/unittests/Runtime/AccessTest.cpp @@ -32,6 +32,12 @@ struct AccessType { } // namespace +static bool userSkipsPermissionChecks() { + // The tests in this file assume normal permission checks apply to the user + // running the tests. This isn't true when the test is run by root. 
+ return geteuid() == 0; +} + static std::string addPIDSuffix(const char *name) { std::stringstream ss; ss << name; @@ -166,6 +172,10 @@ TEST(AccessTests, TestRead) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_EQ(res, 0); } @@ -181,6 +191,10 @@ TEST(AccessTests, TestNotRead) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -195,6 +209,10 @@ TEST(AccessTests, TestWrite) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_EQ(res, 0); } @@ -210,6 +228,10 @@ TEST(AccessTests, TestNotWrite) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -225,6 +247,10 @@ TEST(AccessTests, TestReadWrite) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_EQ(res, 0); } @@ -242,6 +268,10 @@ TEST(AccessTests, TestNotReadWrite0) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -259,6 +289,10 @@ TEST(AccessTests, TestNotReadWrite1) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -276,6 +310,10 @@ TEST(AccessTests, TestNotReadWrite2) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -290,6 +328,10 @@ TEST(AccessTests, TestExecute) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_EQ(res, 0); } @@ -305,6 +347,10 @@ TEST(AccessTests, TestNotExecute) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -321,6 +367,10 @@ TEST(AccessTests, TestRWX) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_EQ(res, 0); } @@ -340,6 +390,10 @@ TEST(AccessTests, TestNotRWX0) { 
ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -359,6 +413,10 @@ TEST(AccessTests, TestNotRWX1) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -378,6 +436,10 @@ TEST(AccessTests, TestNotRWX2) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -397,6 +459,10 @@ TEST(AccessTests, TestNotRWX3) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } @@ -416,6 +482,10 @@ TEST(AccessTests, TestNotRWX4) { ASSERT_EQ(unlink(path.c_str()), 0); + if (userSkipsPermissionChecks()) { + return; + } + ASSERT_NE(res, 0); } diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 6d48489284fae..88cc75e83b043 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -1,6 +1,12 @@ cmake_minimum_required(VERSION 3.20.0) set(LLVM_SUBPROJECT_TITLE "libc") +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + message(FATAL_ERROR "Builds rooted in the libc directory are not supported. " + "Builds should be rooted in the runtimes directory instead. " + "Please see the documentation at https://libc.llvm.org/usage_modes.html for more info.") +endif() + # Include LLVM's cmake policies. if(NOT DEFINED LLVM_COMMON_CMAKE_UTILS) set(LLVM_COMMON_CMAKE_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/../cmake) diff --git a/libc/docs/headers/arpa/inet.rst b/libc/docs/headers/arpa/inet.rst new file mode 100644 index 0000000000000..c82ca5427fbbb --- /dev/null +++ b/libc/docs/headers/arpa/inet.rst @@ -0,0 +1,50 @@ +.. include:: ../../check.rst + +=========== +arpa/inet.h +=========== + +Functions +========= + +.. 
list-table:: + :widths: auto + :align: center + :header-rows: 1 + + * - Function + - Implemented + - C23 Standard Section + - POSIX.1-2024 Standard Section + * - htonl + - |check| + - + - + * - htons + - |check| + - + - + * - inet_addr + - + - + - + * - inet_ntoa + - + - + - + * - inet_ntop + - + - + - + * - inet_pton + - + - + - + * - ntohl + - |check| + - + - + * - ntohs + - |check| + - + - diff --git a/libc/docs/headers/assert.rst b/libc/docs/headers/assert.rst index 06ea27966de1a..682170755ba43 100644 --- a/libc/docs/headers/assert.rst +++ b/libc/docs/headers/assert.rst @@ -15,7 +15,7 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - __STDC_VERSION_ASSERT_H__ - |check| - 7.2.1 @@ -23,5 +23,5 @@ Macros * - assert - - 7.2.1 - - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/ctype.rst b/libc/docs/headers/ctype.rst index e506830809f79..9b5b1574fd274 100644 --- a/libc/docs/headers/ctype.rst +++ b/libc/docs/headers/ctype.rst @@ -15,60 +15,116 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - isalnum - |check| - 7.4.1.1 + - `POSIX.1-2024 `__ + * - isalnum_l + - |check| - + - `POSIX.1-2024 `__ * - isalpha - |check| - 7.4.1.2 + - `POSIX.1-2024 `__ + * - isalpha_l + - |check| - + - `POSIX.1-2024 `__ * - isblank - |check| - 7.4.1.3 + - `POSIX.1-2024 `__ + * - isblank_l + - |check| - + - `POSIX.1-2024 `__ * - iscntrl - |check| - 7.4.1.4 + - `POSIX.1-2024 `__ + * - iscntrl_l + - |check| - + - `POSIX.1-2024 `__ * - isdigit - |check| - 7.4.1.5 + - `POSIX.1-2024 `__ + * - isdigit_l + - |check| - + - `POSIX.1-2024 `__ * - isgraph - |check| - 7.4.1.6 + - `POSIX.1-2024 `__ + * - isgraph_l + - |check| - + - `POSIX.1-2024 `__ * - islower - |check| - 7.4.1.7 + - `POSIX.1-2024 `__ + * - islower_l + - |check| - + - `POSIX.1-2024 `__ * - isprint - |check| - 7.4.1.8 + - `POSIX.1-2024 `__ + * - isprint_l + - |check| - + - `POSIX.1-2024 `__ * - ispunct - 
|check| - 7.4.1.9 + - `POSIX.1-2024 `__ + * - ispunct_l + - |check| - + - `POSIX.1-2024 `__ * - isspace - |check| - 7.4.1.10 + - `POSIX.1-2024 `__ + * - isspace_l + - |check| - + - `POSIX.1-2024 `__ * - isupper - |check| - 7.4.1.11 + - `POSIX.1-2024 `__ + * - isupper_l + - |check| - + - `POSIX.1-2024 `__ * - isxdigit - |check| - 7.4.1.12 + - `POSIX.1-2024 `__ + * - isxdigit_l + - |check| - + - `POSIX.1-2024 `__ * - tolower - |check| - 7.4.2.1 + - `POSIX.1-2024 `__ + * - tolower_l + - |check| - + - `POSIX.1-2024 `__ * - toupper - |check| - 7.4.2.2 + - `POSIX.1-2024 `__ + * - toupper_l + - |check| - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/errno.rst b/libc/docs/headers/errno.rst index f25aae4f23b2c..b2b2e62728e1a 100644 --- a/libc/docs/headers/errno.rst +++ b/libc/docs/headers/errno.rst @@ -15,21 +15,21 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - EDOM - - 7.5 - - + - `POSIX.1-2024 `__ * - EILSEQ - - 7.5 - - + - `POSIX.1-2024 `__ * - ERANGE - - 7.5 - - + - `POSIX.1-2024 `__ * - errno - - 7.5 - - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/fenv.rst b/libc/docs/headers/fenv.rst index 374b46ef57be0..d0e3c5dda6d00 100644 --- a/libc/docs/headers/fenv.rst +++ b/libc/docs/headers/fenv.rst @@ -15,11 +15,11 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - FE_ALL_EXCEPT - |check| - 7.6.12 - - + - `POSIX.1-2024 `__ * - FE_DEC_DOWNWARD - - 7.6.14 @@ -43,7 +43,7 @@ Macros * - FE_DFL_ENV - |check| - 7.6.17 - - + - `POSIX.1-2024 `__ * - FE_DFL_MODE - - 7.6.11 @@ -51,27 +51,27 @@ Macros * - FE_DIVBYZERO - |check| - 7.6.9 - - - * - FE_DOWNARD - - + - `POSIX.1-2024 `__ + * - FE_DOWNWARD + - |check| - 7.6.13 - - + - `POSIX.1-2024 `__ * - FE_INEXACT - |check| - 7.6.9 - - + - `POSIX.1-2024 `__ * - FE_INVALID - |check| - 7.6.9 - - + - `POSIX.1-2024 `__ * - FE_OVERFLOW - |check| - 7.6.9 - - + - `POSIX.1-2024 `__ * - FE_TONEAREST - |check| - 
7.6.13 - - + - `POSIX.1-2024 `__ * - FE_TONEARESTFROMZERO - - 7.6.13 @@ -79,15 +79,15 @@ Macros * - FE_TOWARDZERO - |check| - 7.6.13 - - + - `POSIX.1-2024 `__ * - FE_UNDERFLOW - |check| - 7.6.9 - - + - `POSIX.1-2024 `__ * - FE_UPWARD - |check| - 7.6.13 - - + - `POSIX.1-2024 `__ * - __STDC_VERSION_FENV_H__ - - 7.6.5 @@ -104,7 +104,7 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - fe_dec_getround - - 7.6.5.3 @@ -116,15 +116,15 @@ Functions * - feclearexcept - |check| - 7.6.4.1 - - + - `POSIX.1-2024 `__ * - fegetenv - |check| - 7.6.6.1 - - + - `POSIX.1-2024 `__ * - fegetexceptflag - |check| - 7.6.4.2 - - + - `POSIX.1-2024 `__ * - fegetmode - - 7.6.5.1 @@ -132,19 +132,19 @@ Functions * - fegetround - |check| - 7.6.5.2 - - + - `POSIX.1-2024 `__ * - feholdexcept - |check| - 7.6.6.2 - - + - `POSIX.1-2024 `__ * - feraiseexcept - |check| - 7.6.4.3 - - + - `POSIX.1-2024 `__ * - fesetenv - |check| - 7.6.6.3 - - + - `POSIX.1-2024 `__ * - fesetexcept - |check| - 7.6.4.4 @@ -152,7 +152,7 @@ Functions * - fesetexceptflag - |check| - 7.6.4.5 - - + - `POSIX.1-2024 `__ * - fesetmode - - 7.6.5.4 @@ -160,11 +160,11 @@ Functions * - fesetround - |check| - 7.6.5.5 - - + - `POSIX.1-2024 `__ * - fetestexcept - |check| - 7.6.4.7 - - + - `POSIX.1-2024 `__ * - fetestexceptflag - |check| - 7.6.4.6 @@ -172,4 +172,4 @@ Functions * - feupdateenv - |check| - 7.6.6.4 - - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/float.rst b/libc/docs/headers/float.rst index b603867fcef9c..8ef0f3a05020c 100644 --- a/libc/docs/headers/float.rst +++ b/libc/docs/headers/float.rst @@ -15,23 +15,23 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - DBL_DECIMAL_DIG - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_DIG - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_EPSILON - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_HAS_SUBNORM - |check| - 5.3.5.3.3 - - + - 
`POSIX.1-2024 `__ * - DBL_IS_IEC_60559 - - 5.3.5.3.3 @@ -43,27 +43,27 @@ Macros * - DBL_MAX - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_MAX_10_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_MAX_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_MIN - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_MIN_10_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_MIN_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DBL_NORM_MAX - - 5.3.5.3.3 @@ -75,31 +75,31 @@ Macros * - DBL_TRUE_MIN - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - DECIMAL_DIG - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_DECIMAL_DIG - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_DIG - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_EPSILON - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_EVAL_METHOD - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_HAS_SUBNORM - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_IS_IEC_60559 - - 5.3.5.3.3 @@ -111,27 +111,27 @@ Macros * - FLT_MAX - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_MAX_10_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_MAX_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_MIN - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_MIN_10_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_MIN_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_NORM_MAX - - 5.3.5.3.3 @@ -139,11 +139,11 @@ Macros * - FLT_RADIX - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_ROUNDS - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - FLT_SNAN - - 5.3.5.3.3 @@ -151,7 +151,7 @@ Macros * - FLT_TRUE_MIN - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - INFINITY - - 5.3.5.3.3 @@ -159,19 +159,19 @@ Macros * - LDBL_DECIMAL_DIG - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_DIG - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_EPSILON - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_HAS_SUBNORM 
- |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_IS_IEC_60559 - - 5.3.5.3.3 @@ -183,27 +183,27 @@ Macros * - LDBL_MAX - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_MAX_10_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_MAX_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_MIN - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_MIN_10_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_MIN_EXP - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - LDBL_NORM_MAX - - 5.3.5.3.3 @@ -215,7 +215,7 @@ Macros * - LDBL_TRUE_MIN - |check| - 5.3.5.3.3 - - + - `POSIX.1-2024 `__ * - NAN - - 5.3.5.3.3 diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index 9bd6396843e78..07ab6dd9b2674 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -4,6 +4,7 @@ Implementation Status .. toctree:: :maxdepth: 1 + arpa/inet assert complex ctype @@ -21,6 +22,7 @@ Implementation Status stdlib string strings + sys/mman threads time uchar diff --git a/libc/docs/headers/inttypes.rst b/libc/docs/headers/inttypes.rst index f43c80f095c6a..9269b40f242a6 100644 --- a/libc/docs/headers/inttypes.rst +++ b/libc/docs/headers/inttypes.rst @@ -15,28 +15,28 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - imaxabs - |check| - 7.8.2.1 - - + - `POSIX.1-2024 `__ * - imaxdiv - |check| - 7.8.2.2 - - + - `POSIX.1-2024 `__ * - strtoimax - |check| - 7.8.2.3 - - + - `POSIX.1-2024 `__ * - strtoumax - |check| - 7.8.2.3 - - + - `POSIX.1-2024 `__ * - wcstoimax - - 7.8.2.4 - - + - `POSIX.1-2024 `__ * - wcstoumax - - 7.8.2.4 - - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/locale.rst b/libc/docs/headers/locale.rst index 2d5525bd3f2f9..c97d1f63b1f0c 100644 --- a/libc/docs/headers/locale.rst +++ b/libc/docs/headers/locale.rst @@ -15,31 +15,31 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - LC_ALL 
- |check| - 7.11 - - + - `POSIX.1-2024 `__ * - LC_COLLATE - |check| - 7.11 - - + - `POSIX.1-2024 `__ * - LC_CTYPE - |check| - 7.11 - - + - `POSIX.1-2024 `__ * - LC_MONETARY - |check| - 7.11 - - + - `POSIX.1-2024 `__ * - LC_NUMERIC - |check| - 7.11 - - + - `POSIX.1-2024 `__ * - LC_TIME - |check| - 7.11 - - + - `POSIX.1-2024 `__ Functions ========= @@ -52,12 +52,32 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs + * - duplocale + - |check| + - + - `POSIX.1-2024 `__ + * - freelocale + - |check| + - + - `POSIX.1-2024 `__ + * - getlocalename_l + - + - + - `POSIX.1-2024 `__ * - localeconv - |check| - 7.11.2.1 + - `POSIX.1-2024 `__ + * - newlocale + - |check| - + - `POSIX.1-2024 `__ * - setlocale - |check| - 7.11.1.1 + - `POSIX.1-2024 `__ + * - uselocale + - |check| - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/signal.rst b/libc/docs/headers/signal.rst index b59ae09342357..4f51f611c9fe7 100644 --- a/libc/docs/headers/signal.rst +++ b/libc/docs/headers/signal.rst @@ -15,143 +15,143 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - SIGABRT - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGALRM - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGBUS - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGCHLD - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGCONT - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGFPE - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGHUP - |check| - - - 
https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGILL - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGINT - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGKILL - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGPIPE - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGPOLL - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGPROF - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGQUIT - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGRTMAX - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGRTMIN - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGSEGV - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGSTOP - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGSYS - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGTERM - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGTRAP - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGTSTP - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGTTIN - |check| - - - 
https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGTTOU - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGURG - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGUSR1 - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGUSR2 - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGVTALRM - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGXCPU - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIGXFSZ - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIG_DFL - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIG_ERR - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIG_HOLD - - - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ * - SIG_IGN - |check| - 7.14.3 - - https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html + - `POSIX.1-2024 `__ Functions ========= @@ -164,44 +164,44 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - kill - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/kill.html + - `POSIX.1-2024 `__ * - raise - |check| - 7.14.2.1 - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/raise.html + - `POSIX.1-2024 `__ * - sigaction - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaction.html + - `POSIX.1-2024 `__ * - sigaddset - |check| - - 
- https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaddset.html + - `POSIX.1-2024 `__ * - sigaltstack - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaltstack.html + - `POSIX.1-2024 `__ * - sigdelset - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigdelset.html + - `POSIX.1-2024 `__ * - sigemptyset - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigemptyset.html + - `POSIX.1-2024 `__ * - sigfillset - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigfillset.html + - `POSIX.1-2024 `__ * - signal - |check| - 7.14.1.1 - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/signal.html + - `POSIX.1-2024 `__ * - sigprocmask - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigprocmask.html + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/stdlib.rst b/libc/docs/headers/stdlib.rst index 139d9b4a92228..4151f2934c940 100644 --- a/libc/docs/headers/stdlib.rst +++ b/libc/docs/headers/stdlib.rst @@ -15,23 +15,23 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - EXIT_FAILURE - |check| - 7.24 - - + - `POSIX.1-2024 `__ * - EXIT_SUCCESS - |check| - 7.24 - - + - `POSIX.1-2024 `__ * - MB_CUR_MAX - |check| - 7.24 - - + - `POSIX.1-2024 `__ * - RAND_MAX - |check| - 7.24 - - + - `POSIX.1-2024 `__ * - __STDC_VERSION_STDLIB_H__ - - 7.24 @@ -48,67 +48,67 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - _Exit - |check| - 7.24.4.5 - - + - `POSIX.1-2024 `__ * - abort - |check| - 7.24.4.1 - - + - `POSIX.1-2024 `__ * - abs - |check| - 7.24.6.1 - - + - `POSIX.1-2024 `__ * - aligned_alloc - |check| - 7.24.3.1 - - + - `POSIX.1-2024 `__ * - at_quick_exit - |check| - 7.24.4.3 - - + - `POSIX.1-2024 `__ * - atexit - |check| - 7.24.4.2 - - + - `POSIX.1-2024 `__ * - atof - |check| - 7.24.1.1 - - + - `POSIX.1-2024 `__ * - 
atoi - |check| - 7.24.1.2 - - + - `POSIX.1-2024 `__ * - atol - |check| - 7.24.1.2 - - + - `POSIX.1-2024 `__ * - atoll - |check| - 7.24.1.2 - - + - `POSIX.1-2024 `__ * - bsearch - |check| - 7.24.5.1 - - + - `POSIX.1-2024 `__ * - calloc - |check| - 7.24.3.2 - - + - `POSIX.1-2024 `__ * - div - |check| - 7.24.6.2 - - + - `POSIX.1-2024 `__ * - exit - |check| - 7.24.4.4 - - + - `POSIX.1-2024 `__ * - free - |check| - 7.24.3.3 - - + - `POSIX.1-2024 `__ * - free_aligned_sized - - 7.24.3.5 @@ -120,39 +120,39 @@ Functions * - getenv - |check| - 7.24.4.6 - - + - `POSIX.1-2024 `__ * - labs - |check| - 7.24.6.1 - - + - `POSIX.1-2024 `__ * - ldiv - |check| - 7.24.6.2 - - + - `POSIX.1-2024 `__ * - llabs - |check| - 7.24.6.1 - - + - `POSIX.1-2024 `__ * - lldiv - |check| - 7.24.6.2 - - + - `POSIX.1-2024 `__ * - malloc - |check| - 7.24.3.6 - - + - `POSIX.1-2024 `__ * - mblen - - 7.24.7.1 - - + - `POSIX.1-2024 `__ * - mbstowcs - - 7.24.8.1 - - + - `POSIX.1-2024 `__ * - mbtowc - - 7.24.7.2 - - + - `POSIX.1-2024 `__ * - memalignment - - 7.24.9.1 @@ -160,23 +160,23 @@ Functions * - qsort - |check| - 7.24.5.2 - - + - `POSIX.1-2024 `__ * - quick_exit - |check| - 7.24.4.7 - - + - `POSIX.1-2024 `__ * - rand - |check| - 7.24.2.1 - - + - `POSIX.1-2024 `__ * - realloc - |check| - 7.24.3.7 - - + - `POSIX.1-2024 `__ * - srand - |check| - 7.24.2.2 - - + - `POSIX.1-2024 `__ * - strfromd - |check| - 7.24.1.3 @@ -204,7 +204,7 @@ Functions * - strtod - |check| - 7.24.1.5 - - + - `POSIX.1-2024 `__ * - strtod128 - - 7.24.1.6 @@ -220,36 +220,36 @@ Functions * - strtof - |check| - 7.24.1.5 - - + - `POSIX.1-2024 `__ * - strtol - |check| - 7.24.1.7 - - + - `POSIX.1-2024 `__ * - strtold - |check| - 7.24.1.5 - - + - `POSIX.1-2024 `__ * - strtoll - |check| - 7.24.1.7 - - + - `POSIX.1-2024 `__ * - strtoul - |check| - 7.24.1.7 - - + - `POSIX.1-2024 `__ * - strtoull - |check| - 7.24.1.7 - - + - `POSIX.1-2024 `__ * - system - |check| - 7.24.4.8 - - + - `POSIX.1-2024 `__ * - wcstombs - - 7.24.8.2 - - + - 
`POSIX.1-2024 `__ * - wctomb - - 7.24.7.3 - - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/string.rst b/libc/docs/headers/string.rst index 55f779c830ea8..2665ed8ca17e6 100644 --- a/libc/docs/headers/string.rst +++ b/libc/docs/headers/string.rst @@ -15,7 +15,7 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - __STDC_VERSION_STRING_H__ - - 7.26.1 @@ -32,27 +32,27 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - memccpy - |check| - 7.26.2.2 - - + - `POSIX.1-2024 `__ * - memchr - |check| - 7.26.5.2 - - + - `POSIX.1-2024 `__ * - memcmp - |check| - 7.26.4.1 - - + - `POSIX.1-2024 `__ * - memcpy - |check| - 7.26.2.1 - - + - `POSIX.1-2024 `__ * - memmove - |check| - 7.26.2.3 - - + - `POSIX.1-2024 `__ * - mempcpy - |check| - TODO: glibc extension @@ -60,7 +60,7 @@ Functions * - memset - |check| - 7.26.6.1 - - + - `POSIX.1-2024 `__ * - memset_explicit - |check| - 7.26.6.2 @@ -68,88 +68,96 @@ Functions * - stpcpy - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/stpcpy.html + - `POSIX.1-2024 `__ * - stpncpy - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/stpncpy.html + - `POSIX.1-2024 `__ * - strcat - |check| - 7.26.3.1 - - + - `POSIX.1-2024 `__ * - strchr - |check| - 7.26.5.3 - - + - `POSIX.1-2024 `__ * - strcmp - |check| - 7.26.4.2 - - + - `POSIX.1-2024 `__ * - strcoll - |check| - 7.26.4.3 + - `POSIX.1-2024 `__ + * - strcoll_l + - |check| - + - `POSIX.1-2024 `__ * - strcpy - |check| - 7.26.2.4 - - + - `POSIX.1-2024 `__ * - strcspn - |check| - 7.26.5.4 - - + - `POSIX.1-2024 `__ * - strdup - |check| - 7.26.2.6 - - + - `POSIX.1-2024 `__ * - strerror - |check| - 7.26.6.3 - - + - `POSIX.1-2024 `__ * - strlen - |check| - 7.26.6.4 - - + - `POSIX.1-2024 `__ * - strncat - |check| - 7.26.3.2 - - + - `POSIX.1-2024 `__ * - strncmp - |check| - 7.26.4.4 - - + - `POSIX.1-2024 `__ * - strncpy - |check| - 
7.26.2.5 - - + - `POSIX.1-2024 `__ * - strndup - |check| - 7.26.2.7 - - + - `POSIX.1-2024 `__ * - strpbrk - |check| - 7.26.5.5 - - + - `POSIX.1-2024 `__ * - strrchr - |check| - 7.26.5.6 - - + - `POSIX.1-2024 `__ * - strspn - |check| - 7.26.5.7 - - + - `POSIX.1-2024 `__ * - strstr - |check| - 7.26.5.8 - - + - `POSIX.1-2024 `__ * - strtok - |check| - 7.26.5.9 - - + - `POSIX.1-2024 `__ * - strtok_r - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtok_r.html + - `POSIX.1-2024 `__ * - strxfrm - |check| - 7.26.4.5 + - `POSIX.1-2024 `__ + * - strxfrm_l + - |check| - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/strings.rst b/libc/docs/headers/strings.rst index b5935d2683d08..effd667cd5219 100644 --- a/libc/docs/headers/strings.rst +++ b/libc/docs/headers/strings.rst @@ -15,52 +15,52 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - bcmp - |check| - - - removed in POSIX.1-2008 + - `removed in POSIX.1-2008 `__ * - bcopy - |check| - - - removed in POSIX.1-2008 + - `removed in POSIX.1-2008 `__ * - bzero - |check| - - - removed in POSIX.1-2008 + - `removed in POSIX.1-2008 `__ * - ffs - - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffs.html + - `POSIX.1-2024 `__ * - ffsl - - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffsl.html + - `POSIX.1-2024 `__ * - ffsll - - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffsll.html + - `POSIX.1-2024 `__ * - index - |check| - - - removed in POSIX.1-2008 + - `removed in POSIX.1-2008 `__ * - rindex - |check| - - - removed in POSIX.1-2008 + - `removed in POSIX.1-2008 `__ * - strcasecmp - |check| - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html + - `POSIX.1-2024 `__ * - strcasecmp_l - - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html + - `POSIX.1-2024 `__ * - strncasecmp - |check| - - - 
https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html + - `POSIX.1-2024 `__ * - strncasecmp_l - - - - https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/sys/mman.rst b/libc/docs/headers/sys/mman.rst new file mode 100644 index 0000000000000..e3404205c07ac --- /dev/null +++ b/libc/docs/headers/sys/mman.rst @@ -0,0 +1,179 @@ +.. include:: ../../check.rst + +========== +sys/mman.h +========== + +Macros +====== + +.. list-table:: + :widths: auto + :align: center + :header-rows: 1 + + * - Macro + - Implemented + - C23 Standard Section + - POSIX.1-2024 Standard Section + * - MAP_ANON + - + - + - + * - MAP_ANONYMOUS + - + - + - + * - MAP_FAILED + - |check| + - + - + * - MAP_FIXED + - + - + - + * - MAP_PRIVATE + - + - + - + * - MAP_SHARED + - + - + - + * - MCL_CURRENT + - + - + - + * - MCL_FUTURE + - + - + - + * - MS_ASYNC + - + - + - + * - MS_INVALIDATE + - + - + - + * - MS_SYNC + - + - + - + * - POSIX_MADV_DONTNEED + - |check| + - + - + * - POSIX_MADV_NORMAL + - |check| + - + - + * - POSIX_MADV_RANDOM + - |check| + - + - + * - POSIX_MADV_SEQUENTIAL + - |check| + - + - + * - POSIX_MADV_WILLNEED + - |check| + - + - + * - POSIX_TYPED_MEM_ALLOCATE + - + - + - + * - POSIX_TYPED_MEM_ALLOCATE_CONTIG + - + - + - + * - POSIX_TYPED_MEM_MAP_ALLOCATABLE + - + - + - + * - PROT_EXEC + - + - + - + * - PROT_NONE + - + - + - + * - PROT_READ + - + - + - + * - PROT_WRITE + - + - + - + +Functions +========= + +.. 
list-table:: + :widths: auto + :align: center + :header-rows: 1 + + * - Function + - Implemented + - C23 Standard Section + - POSIX.1-2024 Standard Section + * - mlock + - |check| + - + - + * - mlockall + - |check| + - + - + * - mmap + - |check| + - + - + * - mprotect + - |check| + - + - + * - msync + - |check| + - + - + * - munlock + - |check| + - + - + * - munlockall + - |check| + - + - + * - munmap + - |check| + - + - + * - posix_madvise + - |check| + - + - + * - posix_mem_offset + - + - + - + * - posix_typed_mem_get_info + - + - + - + * - posix_typed_mem_open + - + - + - + * - shm_open + - |check| + - + - + * - shm_unlink + - |check| + - + - diff --git a/libc/docs/headers/threads.rst b/libc/docs/headers/threads.rst index be313c6013b8d..c2837b8c3591c 100644 --- a/libc/docs/headers/threads.rst +++ b/libc/docs/headers/threads.rst @@ -15,19 +15,23 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - ONCE_FLAG_INIT - - 7.28.1 - - + - `POSIX.1-2024 `__ * - TSS_DTOR_ITERATIONS - - 7.28.1 - - + - `POSIX.1-2024 `__ * - __STDC_NO_THREADS__ - - 7.28.1 - + * - thread_local + - + - + - `POSIX.1-2024 `__ Functions ========= @@ -40,104 +44,104 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - call_once - |check| - 7.28.2.1 - - + - `POSIX.1-2024 `__ * - cnd_broadcast - |check| - 7.28.3.1 - - + - `POSIX.1-2024 `__ * - cnd_destroy - |check| - 7.28.3.2 - - + - `POSIX.1-2024 `__ * - cnd_init - |check| - 7.28.3.3 - - + - `POSIX.1-2024 `__ * - cnd_signal - |check| - 7.28.3.4 - - + - `POSIX.1-2024 `__ * - cnd_timedwait - - 7.28.3.5 - - + - `POSIX.1-2024 `__ * - cnd_wait - |check| - 7.28.3.6 - - + - `POSIX.1-2024 `__ * - mtx_destroy - |check| - 7.28.4.2 - - + - `POSIX.1-2024 `__ * - mtx_init - |check| - 7.28.4.3 - - + - `POSIX.1-2024 `__ * - mtx_lock - |check| - 7.28.4.4 - - + - `POSIX.1-2024 `__ * - mtx_timedlock - - 7.28.4.5 - - + - `POSIX.1-2024 `__ * - 
mtx_trylock - - 7.28.4.6 - - + - `POSIX.1-2024 `__ * - mtx_unlock - |check| - 7.28.4.7 - - + - `POSIX.1-2024 `__ * - thrd_create - |check| - 7.28.5.1 - - + - `POSIX.1-2024 `__ * - thrd_current - |check| - 7.28.5.2 - - + - `POSIX.1-2024 `__ * - thrd_detach - |check| - 7.28.5.3 - - + - `POSIX.1-2024 `__ * - thrd_equal - |check| - 7.28.5.4 - - + - `POSIX.1-2024 `__ * - thrd_exit - |check| - 7.28.5.5 - - + - `POSIX.1-2024 `__ * - thrd_join - |check| - 7.28.5.6 - - + - `POSIX.1-2024 `__ * - thrd_sleep - - 7.28.5.7 - - + - `POSIX.1-2024 `__ * - thrd_yield - - 7.28.5.8 - - + - `POSIX.1-2024 `__ * - tss_create - |check| - 7.28.6.1 - - + - `POSIX.1-2024 `__ * - tss_delete - |check| - 7.28.6.2 - - + - `POSIX.1-2024 `__ * - tss_get - |check| - 7.28.6.3 - - + - `POSIX.1-2024 `__ * - tss_set - |check| - 7.28.6.4 - - + - `POSIX.1-2024 `__ diff --git a/libc/docs/headers/uchar.rst b/libc/docs/headers/uchar.rst index 4645109c8c378..abb684bf9ae0e 100644 --- a/libc/docs/headers/uchar.rst +++ b/libc/docs/headers/uchar.rst @@ -15,7 +15,7 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - __STDC_VERSION_UCHAR_H__ - - 7.30.1 @@ -32,15 +32,15 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - c16rtomb - - 7.30.2.5 - - + - `POSIX.1-2024 `__ * - c32rtomb - - 7.30.2.7 - - + - `POSIX.1-2024 `__ * - c8rtomb - - 7.30.2.3 @@ -48,11 +48,11 @@ Functions * - mbrtoc16 - - 7.30.2.4 - - + - `POSIX.1-2024 `__ * - mbrtoc32 - - 7.30.2.6 - - + - `POSIX.1-2024 `__ * - mbrtoc8 - - 7.30.2.2 diff --git a/libc/docs/headers/wchar.rst b/libc/docs/headers/wchar.rst index ce2be3389a2ec..89a1e7b3fe660 100644 --- a/libc/docs/headers/wchar.rst +++ b/libc/docs/headers/wchar.rst @@ -15,7 +15,7 @@ Macros * - Macro - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - WEOF - |check| - 7.31.1 @@ -36,7 +36,7 @@ Functions * - Function - Implemented - C23 Standard 
Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - btowc - |check| - 7.31.6.2.1 diff --git a/libc/docs/headers/wctype.rst b/libc/docs/headers/wctype.rst index 48096c3e25804..076db04f183e9 100644 --- a/libc/docs/headers/wctype.rst +++ b/libc/docs/headers/wctype.rst @@ -15,7 +15,7 @@ Functions * - Function - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section + - POSIX Docs * - iswalnum - - 7.32.2.1.1 diff --git a/libc/utils/docgen/arpa/inet.yaml b/libc/utils/docgen/arpa/inet.yaml new file mode 100644 index 0000000000000..7f388cbbd0204 --- /dev/null +++ b/libc/utils/docgen/arpa/inet.yaml @@ -0,0 +1,18 @@ +functions: + htonl: + posix-definition: '' + htons: + posix-definition: '' + inet_addr: + posix-definition: '' + inet_ntoa: + posix-definition: '' + inet_ntop: + posix-definition: '' + inet_pton: + posix-definition: '' + ntohl: + posix-definition: '' + ntohs: + posix-definition: '' + diff --git a/libc/utils/docgen/assert.json b/libc/utils/docgen/assert.json deleted file mode 100644 index 28ec12028ef67..0000000000000 --- a/libc/utils/docgen/assert.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_ASSERT_H__": { - "c-definition": "7.2.1" - }, - "assert": { - "c-definition": "7.2.1" - } - } -} diff --git a/libc/utils/docgen/assert.yaml b/libc/utils/docgen/assert.yaml new file mode 100644 index 0000000000000..0afd4e2d3c063 --- /dev/null +++ b/libc/utils/docgen/assert.yaml @@ -0,0 +1,7 @@ +macros: + __STDC_VERSION_ASSERT_H__: + c-definition: 7.2.1 + assert: + c-definition: 7.2.1 + in-latest-posix: '' + diff --git a/libc/utils/docgen/ctype.json b/libc/utils/docgen/ctype.json deleted file mode 100644 index af97e4bbbc0a2..0000000000000 --- a/libc/utils/docgen/ctype.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "functions": { - "isalnum": { - "c-definition": "7.4.1.1" - }, - "isalpha": { - "c-definition": "7.4.1.2" - }, - "isblank": { - "c-definition": "7.4.1.3" - }, - "iscntrl": { - "c-definition": "7.4.1.4" - }, - "isdigit": { - 
"c-definition": "7.4.1.5" - }, - "isgraph": { - "c-definition": "7.4.1.6" - }, - "islower": { - "c-definition": "7.4.1.7" - }, - "isprint": { - "c-definition": "7.4.1.8" - }, - "ispunct": { - "c-definition": "7.4.1.9" - }, - "isspace": { - "c-definition": "7.4.1.10" - }, - "isupper": { - "c-definition": "7.4.1.11" - }, - "isxdigit": { - "c-definition": "7.4.1.12" - }, - "tolower" : { - "c-definition": "7.4.2.1" - }, - "toupper": { - "c-definition": "7.4.2.2" - } - } -} - diff --git a/libc/utils/docgen/ctype.yaml b/libc/utils/docgen/ctype.yaml new file mode 100644 index 0000000000000..027d8f38c71f9 --- /dev/null +++ b/libc/utils/docgen/ctype.yaml @@ -0,0 +1,72 @@ +functions: + isalnum: + c-definition: 7.4.1.1 + in-latest-posix: '' + isalnum_l: + in-latest-posix: '' + isalpha: + c-definition: 7.4.1.2 + in-latest-posix: '' + isalpha_l: + in-latest-posix: '' + isblank: + c-definition: 7.4.1.3 + in-latest-posix: '' + isblank_l: + in-latest-posix: '' + iscntrl: + c-definition: 7.4.1.4 + in-latest-posix: '' + iscntrl_l: + in-latest-posix: '' + isdigit: + c-definition: 7.4.1.5 + in-latest-posix: '' + isdigit_l: + in-latest-posix: '' + isgraph: + c-definition: 7.4.1.6 + in-latest-posix: '' + isgraph_l: + in-latest-posix: '' + islower: + c-definition: 7.4.1.7 + in-latest-posix: '' + islower_l: + in-latest-posix: '' + isprint: + c-definition: 7.4.1.8 + in-latest-posix: '' + isprint_l: + in-latest-posix: '' + ispunct: + c-definition: 7.4.1.9 + in-latest-posix: '' + ispunct_l: + in-latest-posix: '' + isspace: + c-definition: 7.4.1.10 + in-latest-posix: '' + isspace_l: + in-latest-posix: '' + isupper: + c-definition: 7.4.1.11 + in-latest-posix: '' + isupper_l: + in-latest-posix: '' + isxdigit: + c-definition: 7.4.1.12 + in-latest-posix: '' + isxdigit_l: + in-latest-posix: '' + tolower: + c-definition: 7.4.2.1 + in-latest-posix: '' + tolower_l: + in-latest-posix: '' + toupper: + c-definition: 7.4.2.2 + in-latest-posix: '' + toupper_l: + in-latest-posix: '' + diff --git 
a/libc/utils/docgen/docgen.py b/libc/utils/docgen/docgen.py index aa30a6e51ef87..09db284ef9282 100755 --- a/libc/utils/docgen/docgen.py +++ b/libc/utils/docgen/docgen.py @@ -10,8 +10,9 @@ from argparse import ArgumentParser, Namespace from pathlib import Path from typing import Dict +import os import sys -import json +import yaml from header import Header @@ -22,14 +23,14 @@ class DocgenAPIFormatError(Exception): def check_api(header: Header, api: Dict): """ - Checks that docgen json files are properly formatted. If there are any + Checks that docgen yaml files are properly formatted. If there are any fatal formatting errors, raises exceptions with error messages useful for fixing formatting. Warnings are printed to stderr on non-fatal formatting errors. The code that runs after ``check_api(api)`` is called expects that - ``check_api`` executed without raising formatting exceptions so the json + ``check_api`` executed without raising formatting exceptions so the yaml matches the formatting specified here. - The json file may contain: + The yaml file may contain: * an optional macros object * an optional functions object @@ -48,11 +49,15 @@ def check_api(header: Header, api: Dict): this should be a C standard section number. For the ``"posix-definition"`` property, this should be a link to the definition. - :param api: docgen json file contents parsed into a dict + :param api: docgen yaml file contents parsed into a dict """ errors = [] - cdef = "c-definition" - pdef = "posix-definition" + # We require entries to have at least one of these. 
+ possible_keys = [ + "c-definition", + "in-latest-posix", + "removed-in-posix-2008", + ] # Validate macros if "macros" in api: @@ -65,8 +70,8 @@ def check_api(header: Header, api: Dict): macros = api["macros"] for name, obj in macros.items(): - if not (cdef in obj or pdef in obj): - err = f'error: Macro {name} does not contain at least one required property: "{cdef}" or "{pdef}"' + if not any(k in obj for k in possible_keys): + err = f"error: Macro {name} does not contain at least one required property: {possible_keys}" errors.append(err) # Validate functions @@ -79,8 +84,8 @@ def check_api(header: Header, api: Dict): fns = api["functions"] for name, obj in fns.items(): - if not (cdef in obj or pdef in obj): - err = f'error: function {name} does not contain at least one required property: "{cdef}" or "{pdef}"' + if not any(k in obj for k in possible_keys): + err = f"error: function {name} does not contain at least one required property: {possible_keys}" errors.append(err) if errors: @@ -88,8 +93,8 @@ def check_api(header: Header, api: Dict): def load_api(header: Header) -> Dict: - api = header.docgen_json.read_text(encoding="utf-8") - return json.loads(api) + api = header.docgen_yaml.read_text(encoding="utf-8") + return yaml.safe_load(api) def print_tbl_dir(name): @@ -103,7 +108,7 @@ def print_tbl_dir(name): * - {name} - Implemented - C23 Standard Section - - POSIX.1-2024 Standard Section""" + - POSIX Docs""" ) @@ -127,8 +132,14 @@ def print_functions_rst(header: Header, functions: Dict): else: print(" -") - if "posix-definition" in functions[name]: - print(f' - {functions[name]["posix-definition"]}') + if "in-latest-posix" in functions[name]: + print( + f" - `POSIX.1-2024 `__" + ) + elif "removed-in-posix-2008" in functions[name]: + print( + f" - `removed in POSIX.1-2008 `__" + ) else: print(" -") @@ -153,15 +164,20 @@ def print_macros_rst(header: Header, macros: Dict): else: print(" -") - if "posix-definition" in macros[name]: - print(f' - 
{macros[name]["posix-definition"]}') + if "in-latest-posix" in macros[name]: + print( + f" - `POSIX.1-2024 `__" + ) else: print(" -") print() def print_impl_status_rst(header: Header, api: Dict): - print(".. include:: ../check.rst\n") + if os.sep in header.name: + print(".. include:: ../../check.rst\n") + else: + print(".. include:: ../check.rst\n") print("=" * len(header.name)) print(header.name) @@ -176,10 +192,22 @@ def print_impl_status_rst(header: Header, api: Dict): print_functions_rst(header, api["functions"]) +# This code implicitly relies on docgen.py being in the same dir as the yaml +# files and is likely to need to be fixed when re-integrating docgen into +# hdrgen. +def get_choices() -> list: + choices = [] + for path in Path(__file__).parent.rglob("*.yaml"): + fname = path.with_suffix(".h").name + if path.parent != Path(__file__).parent: + fname = path.parent.name + os.sep + fname + choices.append(fname) + return choices + + def parse_args() -> Namespace: parser = ArgumentParser() - choices = [p.with_suffix(".h").name for p in Path(__file__).parent.glob("*.json")] - parser.add_argument("header_name", choices=choices) + parser.add_argument("header_name", choices=get_choices()) return parser.parse_args() diff --git a/libc/utils/docgen/errno.json b/libc/utils/docgen/errno.json deleted file mode 100644 index aface8e42b495..0000000000000 --- a/libc/utils/docgen/errno.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "macros": { - "EDOM": { - "c-definition": "7.5" - }, - "EILSEQ": { - "c-definition": "7.5" - }, - "ERANGE": { - "c-definition": "7.5" - }, - "errno": { - "c-definition": "7.5" - } - } -} diff --git a/libc/utils/docgen/errno.yaml b/libc/utils/docgen/errno.yaml new file mode 100644 index 0000000000000..da41552bc3a18 --- /dev/null +++ b/libc/utils/docgen/errno.yaml @@ -0,0 +1,14 @@ +macros: + EDOM: + c-definition: '7.5' + in-latest-posix: '' + EILSEQ: + c-definition: '7.5' + in-latest-posix: '' + ERANGE: + c-definition: '7.5' + in-latest-posix: '' + errno: 
+ c-definition: '7.5' + in-latest-posix: '' + diff --git a/libc/utils/docgen/fenv.json b/libc/utils/docgen/fenv.json deleted file mode 100644 index 788b196c053bc..0000000000000 --- a/libc/utils/docgen/fenv.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_FENV_H__": { - "c-definition": "7.6.5" - }, - "FE_DIVBYZERO": { - "c-definition": "7.6.9" - }, - "FE_INEXACT": { - "c-definition": "7.6.9" - }, - "FE_INVALID": { - "c-definition": "7.6.9" - }, - "FE_OVERFLOW": { - "c-definition": "7.6.9" - }, - "FE_UNDERFLOW": { - "c-definition": "7.6.9" - }, - "FE_ALL_EXCEPT": { - "c-definition": "7.6.12" - }, - "FE_DFL_MODE": { - "c-definition": "7.6.11" - }, - "FE_DOWNARD": { - "c-definition": "7.6.13" - }, - "FE_TONEAREST": { - "c-definition": "7.6.13" - }, - "FE_TONEARESTFROMZERO": { - "c-definition": "7.6.13" - }, - "FE_TOWARDZERO": { - "c-definition": "7.6.13" - }, - "FE_UPWARD": { - "c-definition": "7.6.13" - }, - "FE_DEC_DOWNWARD": { - "c-definition": "7.6.14" - }, - "FE_DEC_TONEAREST": { - "c-definition": "7.6.14" - }, - "FE_DEC_TONEARESTFROMZERO": { - "c-definition": "7.6.14" - }, - "FE_DEC_TOWARDZERO": { - "c-definition": "7.6.14" - }, - "FE_DEC_UPWARD": { - "c-definition": "7.6.14" - }, - "FE_DFL_ENV": { - "c-definition": "7.6.17" - } - }, - "functions": { - "feclearexcept": { - "c-definition": "7.6.4.1" - }, - "fegetexceptflag": { - "c-definition": "7.6.4.2" - }, - "feraiseexcept": { - "c-definition": "7.6.4.3" - }, - "fesetexcept": { - "c-definition": "7.6.4.4" - }, - "fesetexceptflag": { - "c-definition": "7.6.4.5" - }, - "fetestexceptflag": { - "c-definition": "7.6.4.6" - }, - "fetestexcept": { - "c-definition": "7.6.4.7" - }, - "fegetmode": { - "c-definition": "7.6.5.1" - }, - "fegetround": { - "c-definition": "7.6.5.2" - }, - "fe_dec_getround": { - "c-definition": "7.6.5.3" - }, - "fesetmode": { - "c-definition": "7.6.5.4" - }, - "fesetround": { - "c-definition": "7.6.5.5" - }, - "fe_dec_setround": { - "c-definition": "7.6.5.6" - }, - 
"fegetenv": { - "c-definition": "7.6.6.1" - }, - "feholdexcept": { - "c-definition": "7.6.6.2" - }, - "fesetenv": { - "c-definition": "7.6.6.3" - }, - "feupdateenv": { - "c-definition": "7.6.6.4" - } - } -} diff --git a/libc/utils/docgen/fenv.yaml b/libc/utils/docgen/fenv.yaml new file mode 100644 index 0000000000000..1d73697f36bec --- /dev/null +++ b/libc/utils/docgen/fenv.yaml @@ -0,0 +1,97 @@ +functions: + fe_dec_getround: + c-definition: 7.6.5.3 + fe_dec_setround: + c-definition: 7.6.5.6 + feclearexcept: + c-definition: 7.6.4.1 + in-latest-posix: '' + fegetenv: + c-definition: 7.6.6.1 + in-latest-posix: '' + fegetexceptflag: + c-definition: 7.6.4.2 + in-latest-posix: '' + fegetmode: + c-definition: 7.6.5.1 + fegetround: + c-definition: 7.6.5.2 + in-latest-posix: '' + feholdexcept: + c-definition: 7.6.6.2 + in-latest-posix: '' + feraiseexcept: + c-definition: 7.6.4.3 + in-latest-posix: '' + fesetenv: + c-definition: 7.6.6.3 + in-latest-posix: '' + fesetexcept: + c-definition: 7.6.4.4 + fesetexceptflag: + c-definition: 7.6.4.5 + in-latest-posix: '' + fesetmode: + c-definition: 7.6.5.4 + fesetround: + c-definition: 7.6.5.5 + in-latest-posix: '' + fetestexcept: + c-definition: 7.6.4.7 + in-latest-posix: '' + fetestexceptflag: + c-definition: 7.6.4.6 + feupdateenv: + c-definition: 7.6.6.4 + in-latest-posix: '' +macros: + FE_ALL_EXCEPT: + c-definition: 7.6.12 + in-latest-posix: '' + FE_DEC_DOWNWARD: + c-definition: 7.6.14 + FE_DEC_TONEAREST: + c-definition: 7.6.14 + FE_DEC_TONEARESTFROMZERO: + c-definition: 7.6.14 + FE_DEC_TOWARDZERO: + c-definition: 7.6.14 + FE_DEC_UPWARD: + c-definition: 7.6.14 + FE_DFL_ENV: + c-definition: 7.6.17 + in-latest-posix: '' + FE_DFL_MODE: + c-definition: 7.6.11 + FE_DIVBYZERO: + c-definition: 7.6.9 + in-latest-posix: '' + FE_DOWNWARD: + c-definition: 7.6.13 + in-latest-posix: '' + FE_INEXACT: + c-definition: 7.6.9 + in-latest-posix: '' + FE_INVALID: + c-definition: 7.6.9 + in-latest-posix: '' + FE_OVERFLOW: + c-definition: 7.6.9 + 
in-latest-posix: '' + FE_TONEAREST: + c-definition: 7.6.13 + in-latest-posix: '' + FE_TONEARESTFROMZERO: + c-definition: 7.6.13 + FE_TOWARDZERO: + c-definition: 7.6.13 + in-latest-posix: '' + FE_UNDERFLOW: + c-definition: 7.6.9 + in-latest-posix: '' + FE_UPWARD: + c-definition: 7.6.13 + in-latest-posix: '' + __STDC_VERSION_FENV_H__: + c-definition: 7.6.5 + diff --git a/libc/utils/docgen/float.json b/libc/utils/docgen/float.json deleted file mode 100644 index a906cbf4fa754..0000000000000 --- a/libc/utils/docgen/float.json +++ /dev/null @@ -1,163 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_FLOAT_H__": { - "c-definition": "7.7" - }, - "FLT_EVAL_METHOD" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_ROUNDS" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_EVAL_METHOD" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_HAS_SUBNORM" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_HAS_SUBNORM" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_HAS_SUBNORM" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_RADIX" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_MANT_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_MANT_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_MANT_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_DECIMAL_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_DECIMAL_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_DECIMAL_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "DECIMAL_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_IS_IEC_60559" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_IS_IEC_60559" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_IS_IEC_60559" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_DIG" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_MIN_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_MIN_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_MIN_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_MIN_10_EXP" : { - "c-definition": "5.3.5.3.3" - 
}, - "DBL_MIN_10_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_MIN_10_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_MAX_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_MAX_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_MAX_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_MAX_10_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_MAX_10_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_MAX_10_EXP" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_MAX" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_MAX" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_MAX" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_NORM_MAX" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_NORM_MAX" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_NORM_MAX" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_EPSILON" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_EPSILON" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_EPSILON" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_MIN" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_MIN" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_MIN" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_SNAN" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_SNAN" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_SNAN" : { - "c-definition": "5.3.5.3.3" - }, - "FLT_TRUE_MIN" : { - "c-definition": "5.3.5.3.3" - }, - "DBL_TRUE_MIN" : { - "c-definition": "5.3.5.3.3" - }, - "LDBL_TRUE_MIN" : { - "c-definition": "5.3.5.3.3" - }, - "INFINITY" : { - "c-definition": "5.3.5.3.3" - }, - "NAN" : { - "c-definition": "5.3.5.3.3" - } - } -} diff --git a/libc/utils/docgen/float.yaml b/libc/utils/docgen/float.yaml new file mode 100644 index 0000000000000..a8840b91be55b --- /dev/null +++ b/libc/utils/docgen/float.yaml @@ -0,0 +1,143 @@ +macros: + DBL_DECIMAL_DIG: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_DIG: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_EPSILON: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_HAS_SUBNORM: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + 
DBL_IS_IEC_60559: + c-definition: 5.3.5.3.3 + DBL_MANT_DIG: + c-definition: 5.3.5.3.3 + DBL_MAX: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_MAX_10_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_MAX_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_MIN: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_MIN_10_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_MIN_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DBL_NORM_MAX: + c-definition: 5.3.5.3.3 + DBL_SNAN: + c-definition: 5.3.5.3.3 + DBL_TRUE_MIN: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + DECIMAL_DIG: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_DECIMAL_DIG: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_DIG: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_EPSILON: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_EVAL_METHOD: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_HAS_SUBNORM: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_IS_IEC_60559: + c-definition: 5.3.5.3.3 + FLT_MANT_DIG: + c-definition: 5.3.5.3.3 + FLT_MAX: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_MAX_10_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_MAX_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_MIN: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_MIN_10_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_MIN_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_NORM_MAX: + c-definition: 5.3.5.3.3 + FLT_RADIX: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_ROUNDS: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + FLT_SNAN: + c-definition: 5.3.5.3.3 + FLT_TRUE_MIN: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + INFINITY: + c-definition: 5.3.5.3.3 + LDBL_DECIMAL_DIG: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_DIG: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_EPSILON: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_HAS_SUBNORM: + c-definition: 5.3.5.3.3 
+ in-latest-posix: '' + LDBL_IS_IEC_60559: + c-definition: 5.3.5.3.3 + LDBL_MANT_DIG: + c-definition: 5.3.5.3.3 + LDBL_MAX: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_MAX_10_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_MAX_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_MIN: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_MIN_10_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_MIN_EXP: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + LDBL_NORM_MAX: + c-definition: 5.3.5.3.3 + LDBL_SNAN: + c-definition: 5.3.5.3.3 + LDBL_TRUE_MIN: + c-definition: 5.3.5.3.3 + in-latest-posix: '' + NAN: + c-definition: 5.3.5.3.3 + __STDC_VERSION_FLOAT_H__: + c-definition: '7.7' + diff --git a/libc/utils/docgen/header.py b/libc/utils/docgen/header.py index dde210078db27..5bf524a64b69b 100644 --- a/libc/utils/docgen/header.py +++ b/libc/utils/docgen/header.py @@ -14,7 +14,7 @@ class Header: Maintains implementation information about a standard header file: * where does its implementation dir live * where is its macros file - * where is its docgen json file + * where is its docgen yaml file By convention, the macro-only part of a header file is in a header-specific file somewhere in the directory tree with root at @@ -42,7 +42,7 @@ def __init__(self, header_name: str): self.stem = header_name.rstrip(".h") self.docgen_root = Path(__file__).parent self.libc_root = self.docgen_root.parent.parent - self.docgen_json = self.docgen_root / Path(header_name).with_suffix(".json") + self.docgen_yaml = self.docgen_root / Path(header_name).with_suffix(".yaml") self.fns_dir = Path(self.libc_root, "src", self.stem) self.macros_dir = Path(self.libc_root, "include", "llvm-libc-macros") @@ -83,5 +83,10 @@ def __get_macro_files(self) -> Generator[Path, None, None]: macro file might be located in a subdirectory: libc/include/llvm-libc-macros/fcntl-macros.h libc/include/llvm-libc-macros/linux/fcntl-macros.h + + When a header would be nested in a dir 
(such as arpa/, sys/, etc) we + instead use a hyphen in the name. + libc/include/llvm-libc-macros/sys-mman-macros.h """ - return self.macros_dir.glob(f"**/{self.stem}-macros.h") + stem = self.stem.replace("/", "-") + return self.macros_dir.glob(f"**/{stem}-macros.h") diff --git a/libc/utils/docgen/inttypes.json b/libc/utils/docgen/inttypes.json deleted file mode 100644 index 001f15f27e638..0000000000000 --- a/libc/utils/docgen/inttypes.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "functions": { - "imaxabs": { - "c-definition": "7.8.2.1" - }, - "imaxdiv": { - "c-definition": "7.8.2.2" - }, - "strtoimax": { - "c-definition": "7.8.2.3" - }, - "strtoumax": { - "c-definition": "7.8.2.3" - }, - "wcstoimax": { - "c-definition": "7.8.2.4" - }, - "wcstoumax": { - "c-definition": "7.8.2.4" - } - } -} diff --git a/libc/utils/docgen/inttypes.yaml b/libc/utils/docgen/inttypes.yaml new file mode 100644 index 0000000000000..cbf50592ef072 --- /dev/null +++ b/libc/utils/docgen/inttypes.yaml @@ -0,0 +1,20 @@ +functions: + imaxabs: + c-definition: 7.8.2.1 + in-latest-posix: '' + imaxdiv: + c-definition: 7.8.2.2 + in-latest-posix: '' + strtoimax: + c-definition: 7.8.2.3 + in-latest-posix: '' + strtoumax: + c-definition: 7.8.2.3 + in-latest-posix: '' + wcstoimax: + c-definition: 7.8.2.4 + in-latest-posix: '' + wcstoumax: + c-definition: 7.8.2.4 + in-latest-posix: '' + diff --git a/libc/utils/docgen/locale.json b/libc/utils/docgen/locale.json deleted file mode 100644 index 89329f9aae5b0..0000000000000 --- a/libc/utils/docgen/locale.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "macros": { - "LC_ALL": { - "c-definition": "7.11" - }, - "LC_COLLATE": { - "c-definition": "7.11" - }, - "LC_CTYPE": { - "c-definition": "7.11" - }, - "LC_MONETARY": { - "c-definition": "7.11" - }, - "LC_NUMERIC": { - "c-definition": "7.11" - }, - "LC_TIME": { - "c-definition": "7.11" - } - }, - "functions": { - "setlocale": { - "c-definition": "7.11.1.1" - }, - "localeconv": { - "c-definition": "7.11.2.1" - } - } -} diff 
--git a/libc/utils/docgen/locale.yaml b/libc/utils/docgen/locale.yaml new file mode 100644 index 0000000000000..eea91a885ff49 --- /dev/null +++ b/libc/utils/docgen/locale.yaml @@ -0,0 +1,37 @@ +functions: + duplocale: + in-latest-posix: '' + freelocale: + in-latest-posix: '' + getlocalename_l: + in-latest-posix: '' + localeconv: + c-definition: 7.11.2.1 + in-latest-posix: '' + newlocale: + in-latest-posix: '' + setlocale: + c-definition: 7.11.1.1 + in-latest-posix: '' + uselocale: + in-latest-posix: '' +macros: + LC_ALL: + c-definition: '7.11' + in-latest-posix: '' + LC_COLLATE: + c-definition: '7.11' + in-latest-posix: '' + LC_CTYPE: + c-definition: '7.11' + in-latest-posix: '' + LC_MONETARY: + c-definition: '7.11' + in-latest-posix: '' + LC_NUMERIC: + c-definition: '7.11' + in-latest-posix: '' + LC_TIME: + c-definition: '7.11' + in-latest-posix: '' + diff --git a/libc/utils/docgen/setjmp.json b/libc/utils/docgen/setjmp.json deleted file mode 100644 index 0b9a4e65da4f6..0000000000000 --- a/libc/utils/docgen/setjmp.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_SETJMP_H__": { - "c-definition": "7.13.2" - } - }, - "functions": { - "setjmp": { - "c-definition": "7.13.1.1" - }, - "longjmp": { - "c-definition": "7.13.2.1" - } - } -} diff --git a/libc/utils/docgen/setjmp.yaml b/libc/utils/docgen/setjmp.yaml new file mode 100644 index 0000000000000..123739d1a6ceb --- /dev/null +++ b/libc/utils/docgen/setjmp.yaml @@ -0,0 +1,15 @@ +functions: + longjmp: + c-definition: 7.13.2.1 + in-latest-posix: '' + setjmp: + c-definition: 7.13.1.1 + in-latest-posix: '' + siglongjmp: + in-latest-posix: '' + sigsetjmp: + in-latest-posix: '' +macros: + __STDC_VERSION_SETJMP_H__: + c-definition: 7.13.2 + diff --git a/libc/utils/docgen/signal.json b/libc/utils/docgen/signal.json deleted file mode 100644 index ec83144da8576..0000000000000 --- a/libc/utils/docgen/signal.json +++ /dev/null @@ -1,152 +0,0 @@ -{ - "macros": { - "SIG_DFL": { - "c-definition": "7.14.3", - 
"posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIG_ERR": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIG_HOLD": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIG_IGN": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGRTMIN": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGRTMAX": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGABRT": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGALRM": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGBUS": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGCHLD": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGCONT": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGFPE": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGHUP": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGILL": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGINT": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGKILL": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" 
- }, - "SIGPIPE": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGPIPE": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGQUIT": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGSEGV": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGSTOP": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGTERM": { - "c-definition": "7.14.3", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGTSTP": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGTTIN": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGTTOU": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGUSR1": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGUSR2": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGPOLL": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGPROF": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGSYS": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGTRAP": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGURG": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGVTALRM": { - "posix-definition": 
"https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGXCPU": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - }, - "SIGXFSZ": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html" - } - }, - "functions": { - "signal": { - "c-definition": "7.14.1.1", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/signal.html" - }, - "raise": { - "c-definition": "7.14.2.1", - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/raise.html" - }, - "kill": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/kill.html" - }, - "sigaction": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaction.html" - }, - "sigaddset": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaddset.html" - }, - "sigaltstack": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaltstack.html" - }, - "sigdelset": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigdelset.html" - }, - "sigemptyset": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigemptyset.html" - }, - "sigfillset": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigfillset.html" - }, - "sigprocmask": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigprocmask.html" - } - } -} diff --git a/libc/utils/docgen/signal.yaml b/libc/utils/docgen/signal.yaml new file mode 100644 index 0000000000000..da31a86b00eb0 --- /dev/null +++ b/libc/utils/docgen/signal.yaml @@ -0,0 +1,102 @@ +functions: + kill: + in-latest-posix: '' + raise: + c-definition: 7.14.2.1 + in-latest-posix: '' + sigaction: + in-latest-posix: '' + sigaddset: + in-latest-posix: '' + sigaltstack: + in-latest-posix: '' + 
sigdelset: + in-latest-posix: '' + sigemptyset: + in-latest-posix: '' + sigfillset: + in-latest-posix: '' + signal: + c-definition: 7.14.1.1 + in-latest-posix: '' + sigprocmask: + in-latest-posix: '' +macros: + SIGABRT: + c-definition: 7.14.3 + in-latest-posix: '' + SIGALRM: + in-latest-posix: '' + SIGBUS: + in-latest-posix: '' + SIGCHLD: + in-latest-posix: '' + SIGCONT: + in-latest-posix: '' + SIGFPE: + c-definition: 7.14.3 + in-latest-posix: '' + SIGHUP: + in-latest-posix: '' + SIGILL: + c-definition: 7.14.3 + in-latest-posix: '' + SIGINT: + c-definition: 7.14.3 + in-latest-posix: '' + SIGKILL: + in-latest-posix: '' + SIGPIPE: + in-latest-posix: '' + SIGPOLL: + in-latest-posix: '' + SIGPROF: + in-latest-posix: '' + SIGQUIT: + in-latest-posix: '' + SIGRTMAX: + in-latest-posix: '' + SIGRTMIN: + in-latest-posix: '' + SIGSEGV: + c-definition: 7.14.3 + in-latest-posix: '' + SIGSTOP: + in-latest-posix: '' + SIGSYS: + in-latest-posix: '' + SIGTERM: + c-definition: 7.14.3 + in-latest-posix: '' + SIGTRAP: + in-latest-posix: '' + SIGTSTP: + in-latest-posix: '' + SIGTTIN: + in-latest-posix: '' + SIGTTOU: + in-latest-posix: '' + SIGURG: + in-latest-posix: '' + SIGUSR1: + in-latest-posix: '' + SIGUSR2: + in-latest-posix: '' + SIGVTALRM: + in-latest-posix: '' + SIGXCPU: + in-latest-posix: '' + SIGXFSZ: + in-latest-posix: '' + SIG_DFL: + c-definition: 7.14.3 + in-latest-posix: '' + SIG_ERR: + c-definition: 7.14.3 + in-latest-posix: '' + SIG_HOLD: + in-latest-posix: '' + SIG_IGN: + c-definition: 7.14.3 + in-latest-posix: '' + diff --git a/libc/utils/docgen/stdbit.json b/libc/utils/docgen/stdbit.json deleted file mode 100644 index 25060c1ff9fd8..0000000000000 --- a/libc/utils/docgen/stdbit.json +++ /dev/null @@ -1,270 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_STDBIT_H__": { - "c-definition": "7.18.1.2" - }, - "__STDC_ENDIAN_LITTLE__": { - "c-definition": "7.18.2.2" - }, - "__STDC_ENDIAN_BIG__": { - "c-definition": "7.18.2.2" - }, - "__STDC_ENDIAN_NATIVE__": { - "c-definition": 
"7.18.2.2" - }, - "stdc_leading_zeros": { - "c-definition": "7.18.3.1" - }, - "stdc_leading_ones": { - "c-definition": "7.18.4.1" - }, - "stdc_trailing_zeros": { - "c-definition": "7.18.5.1" - }, - "stdc_trailing_ones": { - "c-definition": "7.18.6.1" - }, - "stdc_first_leading_zero": { - "c-definition": "7.18.7.1" - }, - "stdc_first_leading_one": { - "c-definition": "7.18.8.1" - }, - "stdc_first_trailing_zero": { - "c-definition": "7.18.9.1" - }, - "stdc_first_trailing_one": { - "c-definition": "7.18.10.1" - }, - "stdc_count_zeros": { - "c-definition": "7.18.11.1" - }, - "stdc_count_ones": { - "c-definition": "7.18.12.1" - }, - "stdc_has_single_bit": { - "c-definition": "7.18.13.1" - }, - "stdc_bit_width": { - "c-definition": "7.18.14.1" - }, - "stdc_bit_floor": { - "c-definition": "7.18.15.1" - }, - "stdc_bit_ceil": { - "c-definition": "7.18.16.1" - } - }, - "functions": { - "stdc_leading_zeros_uc": { - "c-definition": "7.18.3" - }, - "stdc_leading_zeros_us": { - "c-definition": "7.18.3" - }, - "stdc_leading_zeros_ui": { - "c-definition": "7.18.3" - }, - "stdc_leading_zeros_ul": { - "c-definition": "7.18.3" - }, - "stdc_leading_zeros_ull": { - "c-definition": "7.18.3" - }, - "stdc_leading_ones_uc": { - "c-definition": "7.18.4" - }, - "stdc_leading_ones_us": { - "c-definition": "7.18.4" - }, - "stdc_leading_ones_ui": { - "c-definition": "7.18.4" - }, - "stdc_leading_ones_ul": { - "c-definition": "7.18.4" - }, - "stdc_leading_ones_ull": { - "c-definition": "7.18.4" - }, - "stdc_trailing_zeros_uc": { - "c-definition": "7.18.5" - }, - "stdc_trailing_zeros_us": { - "c-definition": "7.18.5" - }, - "stdc_trailing_zeros_ui": { - "c-definition": "7.18.5" - }, - "stdc_trailing_zeros_ul": { - "c-definition": "7.18.5" - }, - "stdc_trailing_zeros_ull": { - "c-definition": "7.18.5" - }, - "stdc_trailing_ones_uc": { - "c-definition": "7.18.6" - }, - "stdc_trailing_ones_us": { - "c-definition": "7.18.6" - }, - "stdc_trailing_ones_ui": { - "c-definition": "7.18.6" - }, - 
"stdc_trailing_ones_ul": { - "c-definition": "7.18.6" - }, - "stdc_trailing_ones_ull": { - "c-definition": "7.18.6" - }, - "stdc_first_leading_zero_uc": { - "c-definition": "7.18.7" - }, - "stdc_first_leading_zero_us": { - "c-definition": "7.18.7" - }, - "stdc_first_leading_zero_ui": { - "c-definition": "7.18.7" - }, - "stdc_first_leading_zero_ul": { - "c-definition": "7.18.7" - }, - "stdc_first_leading_zero_ull": { - "c-definition": "7.18.7" - }, - "stdc_first_leading_one_uc": { - "c-definition": "7.18.8" - }, - "stdc_first_leading_one_us": { - "c-definition": "7.18.8" - }, - "stdc_first_leading_one_ui": { - "c-definition": "7.18.8" - }, - "stdc_first_leading_one_ul": { - "c-definition": "7.18.8" - }, - "stdc_first_leading_one_ull": { - "c-definition": "7.18.8" - }, - "stdc_first_trailing_zero_uc": { - "c-definition": "7.18.9" - }, - "stdc_first_trailing_zero_us": { - "c-definition": "7.18.9" - }, - "stdc_first_trailing_zero_ui": { - "c-definition": "7.18.9" - }, - "stdc_first_trailing_zero_ul": { - "c-definition": "7.18.9" - }, - "stdc_first_trailing_zero_ull": { - "c-definition": "7.18.9" - }, - "stdc_first_trailing_one_uc": { - "c-definition": "7.18.10" - }, - "stdc_first_trailing_one_us": { - "c-definition": "7.18.10" - }, - "stdc_first_trailing_one_ui": { - "c-definition": "7.18.10" - }, - "stdc_first_trailing_one_ul": { - "c-definition": "7.18.10" - }, - "stdc_first_trailing_one_ull": { - "c-definition": "7.18.10" - }, - "stdc_count_zeros_uc": { - "c-definition": "7.18.11" - }, - "stdc_count_zeros_us": { - "c-definition": "7.18.11" - }, - "stdc_count_zeros_ui": { - "c-definition": "7.18.11" - }, - "stdc_count_zeros_ul": { - "c-definition": "7.18.11" - }, - "stdc_count_zeros_ull": { - "c-definition": "7.18.11" - }, - "stdc_count_ones_uc": { - "c-definition": "7.18.12" - }, - "stdc_count_ones_us": { - "c-definition": "7.18.12" - }, - "stdc_count_ones_ui": { - "c-definition": "7.18.12" - }, - "stdc_count_ones_ul": { - "c-definition": "7.18.12" - }, - 
"stdc_count_ones_ull": { - "c-definition": "7.18.12" - }, - "stdc_has_single_bit_uc": { - "c-definition": "7.18.13" - }, - "stdc_has_single_bit_us": { - "c-definition": "7.18.13" - }, - "stdc_has_single_bit_ui": { - "c-definition": "7.18.13" - }, - "stdc_has_single_bit_ul": { - "c-definition": "7.18.13" - }, - "stdc_has_single_bit_ull": { - "c-definition": "7.18.13" - }, - "stdc_bit_width_uc": { - "c-definition": "7.18.14" - }, - "stdc_bit_width_us": { - "c-definition": "7.18.14" - }, - "stdc_bit_width_ui": { - "c-definition": "7.18.14" - }, - "stdc_bit_width_ul": { - "c-definition": "7.18.14" - }, - "stdc_bit_width_ull": { - "c-definition": "7.18.14" - }, - "stdc_bit_floor_uc": { - "c-definition": "7.18.15" - }, - "stdc_bit_floor_us": { - "c-definition": "7.18.15" - }, - "stdc_bit_floor_ui": { - "c-definition": "7.18.15" - }, - "stdc_bit_floor_ul": { - "c-definition": "7.18.15" - }, - "stdc_bit_floor_ull": { - "c-definition": "7.18.15" - }, - "stdc_bit_ceil_uc": { - "c-definition": "7.18.16" - }, - "stdc_bit_ceil_us": { - "c-definition": "7.18.16" - }, - "stdc_bit_ceil_ui": { - "c-definition": "7.18.16" - }, - "stdc_bit_ceil_ul": { - "c-definition": "7.18.16" - }, - "stdc_bit_ceil_ull": { - "c-definition": "7.18.16" - } - } -} diff --git a/libc/utils/docgen/stdbit.yaml b/libc/utils/docgen/stdbit.yaml new file mode 100644 index 0000000000000..976221601e9cb --- /dev/null +++ b/libc/utils/docgen/stdbit.yaml @@ -0,0 +1,179 @@ +functions: + stdc_bit_ceil_uc: + c-definition: 7.18.16 + stdc_bit_ceil_ui: + c-definition: 7.18.16 + stdc_bit_ceil_ul: + c-definition: 7.18.16 + stdc_bit_ceil_ull: + c-definition: 7.18.16 + stdc_bit_ceil_us: + c-definition: 7.18.16 + stdc_bit_floor_uc: + c-definition: 7.18.15 + stdc_bit_floor_ui: + c-definition: 7.18.15 + stdc_bit_floor_ul: + c-definition: 7.18.15 + stdc_bit_floor_ull: + c-definition: 7.18.15 + stdc_bit_floor_us: + c-definition: 7.18.15 + stdc_bit_width_uc: + c-definition: 7.18.14 + stdc_bit_width_ui: + c-definition: 7.18.14 + 
stdc_bit_width_ul: + c-definition: 7.18.14 + stdc_bit_width_ull: + c-definition: 7.18.14 + stdc_bit_width_us: + c-definition: 7.18.14 + stdc_count_ones_uc: + c-definition: 7.18.12 + stdc_count_ones_ui: + c-definition: 7.18.12 + stdc_count_ones_ul: + c-definition: 7.18.12 + stdc_count_ones_ull: + c-definition: 7.18.12 + stdc_count_ones_us: + c-definition: 7.18.12 + stdc_count_zeros_uc: + c-definition: 7.18.11 + stdc_count_zeros_ui: + c-definition: 7.18.11 + stdc_count_zeros_ul: + c-definition: 7.18.11 + stdc_count_zeros_ull: + c-definition: 7.18.11 + stdc_count_zeros_us: + c-definition: 7.18.11 + stdc_first_leading_one_uc: + c-definition: 7.18.8 + stdc_first_leading_one_ui: + c-definition: 7.18.8 + stdc_first_leading_one_ul: + c-definition: 7.18.8 + stdc_first_leading_one_ull: + c-definition: 7.18.8 + stdc_first_leading_one_us: + c-definition: 7.18.8 + stdc_first_leading_zero_uc: + c-definition: 7.18.7 + stdc_first_leading_zero_ui: + c-definition: 7.18.7 + stdc_first_leading_zero_ul: + c-definition: 7.18.7 + stdc_first_leading_zero_ull: + c-definition: 7.18.7 + stdc_first_leading_zero_us: + c-definition: 7.18.7 + stdc_first_trailing_one_uc: + c-definition: 7.18.10 + stdc_first_trailing_one_ui: + c-definition: 7.18.10 + stdc_first_trailing_one_ul: + c-definition: 7.18.10 + stdc_first_trailing_one_ull: + c-definition: 7.18.10 + stdc_first_trailing_one_us: + c-definition: 7.18.10 + stdc_first_trailing_zero_uc: + c-definition: 7.18.9 + stdc_first_trailing_zero_ui: + c-definition: 7.18.9 + stdc_first_trailing_zero_ul: + c-definition: 7.18.9 + stdc_first_trailing_zero_ull: + c-definition: 7.18.9 + stdc_first_trailing_zero_us: + c-definition: 7.18.9 + stdc_has_single_bit_uc: + c-definition: 7.18.13 + stdc_has_single_bit_ui: + c-definition: 7.18.13 + stdc_has_single_bit_ul: + c-definition: 7.18.13 + stdc_has_single_bit_ull: + c-definition: 7.18.13 + stdc_has_single_bit_us: + c-definition: 7.18.13 + stdc_leading_ones_uc: + c-definition: 7.18.4 + stdc_leading_ones_ui: + 
c-definition: 7.18.4 + stdc_leading_ones_ul: + c-definition: 7.18.4 + stdc_leading_ones_ull: + c-definition: 7.18.4 + stdc_leading_ones_us: + c-definition: 7.18.4 + stdc_leading_zeros_uc: + c-definition: 7.18.3 + stdc_leading_zeros_ui: + c-definition: 7.18.3 + stdc_leading_zeros_ul: + c-definition: 7.18.3 + stdc_leading_zeros_ull: + c-definition: 7.18.3 + stdc_leading_zeros_us: + c-definition: 7.18.3 + stdc_trailing_ones_uc: + c-definition: 7.18.6 + stdc_trailing_ones_ui: + c-definition: 7.18.6 + stdc_trailing_ones_ul: + c-definition: 7.18.6 + stdc_trailing_ones_ull: + c-definition: 7.18.6 + stdc_trailing_ones_us: + c-definition: 7.18.6 + stdc_trailing_zeros_uc: + c-definition: 7.18.5 + stdc_trailing_zeros_ui: + c-definition: 7.18.5 + stdc_trailing_zeros_ul: + c-definition: 7.18.5 + stdc_trailing_zeros_ull: + c-definition: 7.18.5 + stdc_trailing_zeros_us: + c-definition: 7.18.5 +macros: + __STDC_ENDIAN_BIG__: + c-definition: 7.18.2.2 + __STDC_ENDIAN_LITTLE__: + c-definition: 7.18.2.2 + __STDC_ENDIAN_NATIVE__: + c-definition: 7.18.2.2 + __STDC_VERSION_STDBIT_H__: + c-definition: 7.18.1.2 + stdc_bit_ceil: + c-definition: 7.18.16.1 + stdc_bit_floor: + c-definition: 7.18.15.1 + stdc_bit_width: + c-definition: 7.18.14.1 + stdc_count_ones: + c-definition: 7.18.12.1 + stdc_count_zeros: + c-definition: 7.18.11.1 + stdc_first_leading_one: + c-definition: 7.18.8.1 + stdc_first_leading_zero: + c-definition: 7.18.7.1 + stdc_first_trailing_one: + c-definition: 7.18.10.1 + stdc_first_trailing_zero: + c-definition: 7.18.9.1 + stdc_has_single_bit: + c-definition: 7.18.13.1 + stdc_leading_ones: + c-definition: 7.18.4.1 + stdc_leading_zeros: + c-definition: 7.18.3.1 + stdc_trailing_ones: + c-definition: 7.18.6.1 + stdc_trailing_zeros: + c-definition: 7.18.5.1 + diff --git a/libc/utils/docgen/stdlib.json b/libc/utils/docgen/stdlib.json deleted file mode 100644 index 0ca508110c0f1..0000000000000 --- a/libc/utils/docgen/stdlib.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "macros": { - 
"__STDC_VERSION_STDLIB_H__": { - "c-definition": "7.24" - }, - "EXIT_FAILURE": { - "c-definition": "7.24" - }, - "EXIT_SUCCESS": { - "c-definition": "7.24" - }, - "RAND_MAX": { - "c-definition": "7.24" - }, - "MB_CUR_MAX": { - "c-definition": "7.24" - } - }, - "functions": { - "atof": { - "c-definition": "7.24.1.1" - }, - "atoi": { - "c-definition": "7.24.1.2" - }, - "atol": { - "c-definition": "7.24.1.2" - }, - "atoll": { - "c-definition": "7.24.1.2" - }, - "strfromd": { - "c-definition": "7.24.1.3" - }, - "strfromf": { - "c-definition": "7.24.1.3" - }, - "strfroml": { - "c-definition": "7.24.1.3" - }, - "strfromd32": { - "c-definition": "7.24.1.4" - }, - "strfromd64": { - "c-definition": "7.24.1.4" - }, - "strfromd128": { - "c-definition": "7.24.1.4" - }, - "strtod": { - "c-definition": "7.24.1.5" - }, - "strtof": { - "c-definition": "7.24.1.5" - }, - "strtold": { - "c-definition": "7.24.1.5" - }, - "strtod32": { - "c-definition": "7.24.1.6" - }, - "strtod64": { - "c-definition": "7.24.1.6" - }, - "strtod128": { - "c-definition": "7.24.1.6" - }, - "strtol": { - "c-definition": "7.24.1.7" - }, - "strtoll": { - "c-definition": "7.24.1.7" - }, - "strtoul": { - "c-definition": "7.24.1.7" - }, - "strtoull": { - "c-definition": "7.24.1.7" - }, - "rand": { - "c-definition": "7.24.2.1" - }, - "srand": { - "c-definition": "7.24.2.2" - }, - "aligned_alloc": { - "c-definition": "7.24.3.1" - }, - "calloc": { - "c-definition": "7.24.3.2" - }, - "free": { - "c-definition": "7.24.3.3" - }, - "free_sized": { - "c-definition": "7.24.3.4" - }, - "free_aligned_sized": { - "c-definition": "7.24.3.5" - }, - "malloc": { - "c-definition": "7.24.3.6" - }, - "realloc": { - "c-definition": "7.24.3.7" - }, - "abort": { - "c-definition": "7.24.4.1" - }, - "atexit": { - "c-definition": "7.24.4.2" - }, - "at_quick_exit": { - "c-definition": "7.24.4.3" - }, - "exit": { - "c-definition": "7.24.4.4" - }, - "_Exit": { - "c-definition": "7.24.4.5" - }, - "getenv": { - "c-definition": "7.24.4.6" - 
}, - "quick_exit": { - "c-definition": "7.24.4.7" - }, - "system": { - "c-definition": "7.24.4.8" - }, - "bsearch": { - "c-definition": "7.24.5.1" - }, - "qsort": { - "c-definition": "7.24.5.2" - }, - "abs": { - "c-definition": "7.24.6.1" - }, - "labs": { - "c-definition": "7.24.6.1" - }, - "llabs": { - "c-definition": "7.24.6.1" - }, - "div": { - "c-definition": "7.24.6.2" - }, - "ldiv": { - "c-definition": "7.24.6.2" - }, - "lldiv": { - "c-definition": "7.24.6.2" - }, - "mblen": { - "c-definition": "7.24.7.1" - }, - "mbtowc": { - "c-definition": "7.24.7.2" - }, - "wctomb": { - "c-definition": "7.24.7.3" - }, - "mbstowcs": { - "c-definition": "7.24.8.1" - }, - "wcstombs": { - "c-definition": "7.24.8.2" - }, - "memalignment": { - "c-definition": "7.24.9.1" - } - } -} diff --git a/libc/utils/docgen/stdlib.yaml b/libc/utils/docgen/stdlib.yaml new file mode 100644 index 0000000000000..526ddefbe1ce3 --- /dev/null +++ b/libc/utils/docgen/stdlib.yaml @@ -0,0 +1,158 @@ +functions: + _Exit: + c-definition: 7.24.4.5 + in-latest-posix: '' + abort: + c-definition: 7.24.4.1 + in-latest-posix: '' + abs: + c-definition: 7.24.6.1 + in-latest-posix: '' + aligned_alloc: + c-definition: 7.24.3.1 + in-latest-posix: '' + at_quick_exit: + c-definition: 7.24.4.3 + in-latest-posix: '' + atexit: + c-definition: 7.24.4.2 + in-latest-posix: '' + atof: + c-definition: 7.24.1.1 + in-latest-posix: '' + atoi: + c-definition: 7.24.1.2 + in-latest-posix: '' + atol: + c-definition: 7.24.1.2 + in-latest-posix: '' + atoll: + c-definition: 7.24.1.2 + in-latest-posix: '' + bsearch: + c-definition: 7.24.5.1 + in-latest-posix: '' + calloc: + c-definition: 7.24.3.2 + in-latest-posix: '' + div: + c-definition: 7.24.6.2 + in-latest-posix: '' + exit: + c-definition: 7.24.4.4 + in-latest-posix: '' + free: + c-definition: 7.24.3.3 + in-latest-posix: '' + free_aligned_sized: + c-definition: 7.24.3.5 + free_sized: + c-definition: 7.24.3.4 + getenv: + c-definition: 7.24.4.6 + in-latest-posix: '' + labs: + 
c-definition: 7.24.6.1 + in-latest-posix: '' + ldiv: + c-definition: 7.24.6.2 + in-latest-posix: '' + llabs: + c-definition: 7.24.6.1 + in-latest-posix: '' + lldiv: + c-definition: 7.24.6.2 + in-latest-posix: '' + malloc: + c-definition: 7.24.3.6 + in-latest-posix: '' + mblen: + c-definition: 7.24.7.1 + in-latest-posix: '' + mbstowcs: + c-definition: 7.24.8.1 + in-latest-posix: '' + mbtowc: + c-definition: 7.24.7.2 + in-latest-posix: '' + memalignment: + c-definition: 7.24.9.1 + qsort: + c-definition: 7.24.5.2 + in-latest-posix: '' + quick_exit: + c-definition: 7.24.4.7 + in-latest-posix: '' + rand: + c-definition: 7.24.2.1 + in-latest-posix: '' + realloc: + c-definition: 7.24.3.7 + in-latest-posix: '' + srand: + c-definition: 7.24.2.2 + in-latest-posix: '' + strfromd: + c-definition: 7.24.1.3 + strfromd128: + c-definition: 7.24.1.4 + strfromd32: + c-definition: 7.24.1.4 + strfromd64: + c-definition: 7.24.1.4 + strfromf: + c-definition: 7.24.1.3 + strfroml: + c-definition: 7.24.1.3 + strtod: + c-definition: 7.24.1.5 + in-latest-posix: '' + strtod128: + c-definition: 7.24.1.6 + strtod32: + c-definition: 7.24.1.6 + strtod64: + c-definition: 7.24.1.6 + strtof: + c-definition: 7.24.1.5 + in-latest-posix: '' + strtol: + c-definition: 7.24.1.7 + in-latest-posix: '' + strtold: + c-definition: 7.24.1.5 + in-latest-posix: '' + strtoll: + c-definition: 7.24.1.7 + in-latest-posix: '' + strtoul: + c-definition: 7.24.1.7 + in-latest-posix: '' + strtoull: + c-definition: 7.24.1.7 + in-latest-posix: '' + system: + c-definition: 7.24.4.8 + in-latest-posix: '' + wcstombs: + c-definition: 7.24.8.2 + in-latest-posix: '' + wctomb: + c-definition: 7.24.7.3 + in-latest-posix: '' +macros: + EXIT_FAILURE: + c-definition: '7.24' + in-latest-posix: '' + EXIT_SUCCESS: + c-definition: '7.24' + in-latest-posix: '' + MB_CUR_MAX: + c-definition: '7.24' + in-latest-posix: '' + RAND_MAX: + c-definition: '7.24' + in-latest-posix: '' + __STDC_VERSION_STDLIB_H__: + c-definition: '7.24' + diff --git 
a/libc/utils/docgen/string.json b/libc/utils/docgen/string.json deleted file mode 100644 index d3fd9daf186a1..0000000000000 --- a/libc/utils/docgen/string.json +++ /dev/null @@ -1,99 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_STRING_H__": { - "c-definition": "7.26.1" - } - }, - "functions": { - "memcpy": { - "c-definition": "7.26.2.1" - }, - "memccpy": { - "c-definition": "7.26.2.2" - }, - "mempcpy": { - "c-definition": "TODO: glibc extension" - }, - "memmove": { - "c-definition": "7.26.2.3" - }, - "strcpy": { - "c-definition": "7.26.2.4" - }, - "strncpy": { - "c-definition": "7.26.2.5" - }, - "stpcpy": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/stpcpy.html" - }, - "stpncpy": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/stpncpy.html" - }, - "strdup": { - "c-definition": "7.26.2.6" - }, - "strndup": { - "c-definition": "7.26.2.7" - }, - "strcat": { - "c-definition": "7.26.3.1" - }, - "strncat": { - "c-definition": "7.26.3.2" - }, - "memcmp": { - "c-definition": "7.26.4.1" - }, - "strcmp": { - "c-definition": "7.26.4.2" - }, - "strcoll": { - "c-definition": "7.26.4.3" - }, - "strncmp": { - "c-definition": "7.26.4.4" - }, - "strxfrm": { - "c-definition": "7.26.4.5" - }, - "memchr": { - "c-definition": "7.26.5.2" - }, - "strchr": { - "c-definition": "7.26.5.3" - }, - "strcspn": { - "c-definition": "7.26.5.4" - }, - "strpbrk": { - "c-definition": "7.26.5.5" - }, - "strrchr": { - "c-definition": "7.26.5.6" - }, - "strspn": { - "c-definition": "7.26.5.7" - }, - "strstr": { - "c-definition": "7.26.5.8" - }, - "strtok": { - "c-definition": "7.26.5.9" - }, - "strtok_r": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtok_r.html" - }, - "memset": { - "c-definition": "7.26.6.1" - }, - "memset_explicit": { - "c-definition": "7.26.6.2" - }, - "strerror": { - "c-definition": "7.26.6.3" - }, - "strlen": { - "c-definition": "7.26.6.4" - } - } -} diff --git 
a/libc/utils/docgen/string.yaml b/libc/utils/docgen/string.yaml new file mode 100644 index 0000000000000..d703a8e3593e1 --- /dev/null +++ b/libc/utils/docgen/string.yaml @@ -0,0 +1,94 @@ +functions: + memccpy: + c-definition: 7.26.2.2 + in-latest-posix: '' + memchr: + c-definition: 7.26.5.2 + in-latest-posix: '' + memcmp: + c-definition: 7.26.4.1 + in-latest-posix: '' + memcpy: + c-definition: 7.26.2.1 + in-latest-posix: '' + memmove: + c-definition: 7.26.2.3 + in-latest-posix: '' + mempcpy: + c-definition: 'TODO: glibc extension' + memset: + c-definition: 7.26.6.1 + in-latest-posix: '' + memset_explicit: + c-definition: 7.26.6.2 + stpcpy: + in-latest-posix: '' + stpncpy: + in-latest-posix: '' + strcat: + c-definition: 7.26.3.1 + in-latest-posix: '' + strchr: + c-definition: 7.26.5.3 + in-latest-posix: '' + strcmp: + c-definition: 7.26.4.2 + in-latest-posix: '' + strcoll: + c-definition: 7.26.4.3 + in-latest-posix: '' + strcoll_l: + in-latest-posix: '' + strcpy: + c-definition: 7.26.2.4 + in-latest-posix: '' + strcspn: + c-definition: 7.26.5.4 + in-latest-posix: '' + strdup: + c-definition: 7.26.2.6 + in-latest-posix: '' + strerror: + c-definition: 7.26.6.3 + in-latest-posix: '' + strlen: + c-definition: 7.26.6.4 + in-latest-posix: '' + strncat: + c-definition: 7.26.3.2 + in-latest-posix: '' + strncmp: + c-definition: 7.26.4.4 + in-latest-posix: '' + strncpy: + c-definition: 7.26.2.5 + in-latest-posix: '' + strndup: + c-definition: 7.26.2.7 + in-latest-posix: '' + strpbrk: + c-definition: 7.26.5.5 + in-latest-posix: '' + strrchr: + c-definition: 7.26.5.6 + in-latest-posix: '' + strspn: + c-definition: 7.26.5.7 + in-latest-posix: '' + strstr: + c-definition: 7.26.5.8 + in-latest-posix: '' + strtok: + c-definition: 7.26.5.9 + in-latest-posix: '' + strtok_r: + in-latest-posix: '' + strxfrm: + c-definition: 7.26.4.5 + in-latest-posix: '' + strxfrm_l: + in-latest-posix: '' +macros: + __STDC_VERSION_STRING_H__: + c-definition: 7.26.1 + diff --git 
a/libc/utils/docgen/strings.json b/libc/utils/docgen/strings.json deleted file mode 100644 index 5274745f5f50d..0000000000000 --- a/libc/utils/docgen/strings.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "functions": { - "bzero": { - "posix-definition": "removed in POSIX.1-2008" - }, - "bcmp": { - "posix-definition": "removed in POSIX.1-2008" - }, - "bcopy": { - "posix-definition": "removed in POSIX.1-2008" - }, - "ffs": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffs.html" - }, - "ffsl": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffsl.html" - }, - "ffsll": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffsll.html" - }, - "index": { - "posix-definition": "removed in POSIX.1-2008" - }, - "rindex": { - "posix-definition": "removed in POSIX.1-2008" - }, - "strcasecmp": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html" - }, - "strcasecmp_l": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html" - }, - "strncasecmp": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html" - }, - "strncasecmp_l": { - "posix-definition": "https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html" - } - } -} diff --git a/libc/utils/docgen/sys/mman.yaml b/libc/utils/docgen/sys/mman.yaml new file mode 100644 index 0000000000000..dba26cabc6621 --- /dev/null +++ b/libc/utils/docgen/sys/mman.yaml @@ -0,0 +1,77 @@ +functions: + mlock: + posix-definition: '' + mlockall: + posix-definition: '' + mmap: + posix-definition: '' + mprotect: + posix-definition: '' + msync: + posix-definition: '' + munlock: + posix-definition: '' + munlockall: + posix-definition: '' + munmap: + posix-definition: '' + posix_madvise: + posix-definition: '' + posix_mem_offset: + posix-definition: '' + posix_typed_mem_get_info: + posix-definition: '' + 
posix_typed_mem_open: + posix-definition: '' + shm_open: + posix-definition: '' + shm_unlink: + posix-definition: '' +macros: + MAP_ANON: + posix-definition: '' + MAP_ANONYMOUS: + posix-definition: '' + MAP_FAILED: + posix-definition: '' + MAP_FIXED: + posix-definition: '' + MAP_PRIVATE: + posix-definition: '' + MAP_SHARED: + posix-definition: '' + MCL_CURRENT: + posix-definition: '' + MCL_FUTURE: + posix-definition: '' + MS_ASYNC: + posix-definition: '' + MS_INVALIDATE: + posix-definition: '' + MS_SYNC: + posix-definition: '' + POSIX_MADV_DONTNEED: + posix-definition: '' + POSIX_MADV_NORMAL: + posix-definition: '' + POSIX_MADV_RANDOM: + posix-definition: '' + POSIX_MADV_SEQUENTIAL: + posix-definition: '' + POSIX_MADV_WILLNEED: + posix-definition: '' + POSIX_TYPED_MEM_ALLOCATE: + posix-definition: '' + POSIX_TYPED_MEM_ALLOCATE_CONTIG: + posix-definition: '' + POSIX_TYPED_MEM_MAP_ALLOCATABLE: + posix-definition: '' + PROT_EXEC: + posix-definition: '' + PROT_NONE: + posix-definition: '' + PROT_READ: + posix-definition: '' + PROT_WRITE: + posix-definition: '' + diff --git a/libc/utils/docgen/threads.json b/libc/utils/docgen/threads.json deleted file mode 100644 index 62c5ff881e42e..0000000000000 --- a/libc/utils/docgen/threads.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "macros": { - "__STDC_NO_THREADS__": { - "c-definition": "7.28.1" - }, - "ONCE_FLAG_INIT": { - "c-definition": "7.28.1" - }, - "TSS_DTOR_ITERATIONS": { - "c-definition": "7.28.1" - } - }, - "functions": { - "call_once": { - "c-definition": "7.28.2.1" - }, - "cnd_broadcast": { - "c-definition": "7.28.3.1" - }, - "cnd_destroy": { - "c-definition": "7.28.3.2" - }, - "cnd_init": { - "c-definition": "7.28.3.3" - }, - "cnd_signal": { - "c-definition": "7.28.3.4" - }, - "cnd_timedwait": { - "c-definition": "7.28.3.5" - }, - "cnd_wait": { - "c-definition": "7.28.3.6" - }, - "mtx_destroy": { - "c-definition": "7.28.4.2" - }, - "mtx_init": { - "c-definition": "7.28.4.3" - }, - "mtx_lock": { - "c-definition": 
"7.28.4.4" - }, - "mtx_timedlock": { - "c-definition": "7.28.4.5" - }, - "mtx_trylock": { - "c-definition": "7.28.4.6" - }, - "mtx_unlock": { - "c-definition": "7.28.4.7" - }, - "thrd_create": { - "c-definition": "7.28.5.1" - }, - "thrd_current": { - "c-definition": "7.28.5.2" - }, - "thrd_detach": { - "c-definition": "7.28.5.3" - }, - "thrd_equal": { - "c-definition": "7.28.5.4" - }, - "thrd_exit": { - "c-definition": "7.28.5.5" - }, - "thrd_join": { - "c-definition": "7.28.5.6" - }, - "thrd_sleep": { - "c-definition": "7.28.5.7" - }, - "thrd_yield": { - "c-definition": "7.28.5.8" - }, - "tss_create": { - "c-definition": "7.28.6.1" - }, - "tss_delete": { - "c-definition": "7.28.6.2" - }, - "tss_get": { - "c-definition": "7.28.6.3" - }, - "tss_set": { - "c-definition": "7.28.6.4" - } - } -} diff --git a/libc/utils/docgen/threads.yaml b/libc/utils/docgen/threads.yaml new file mode 100644 index 0000000000000..83db0992dc45d --- /dev/null +++ b/libc/utils/docgen/threads.yaml @@ -0,0 +1,88 @@ +functions: + call_once: + c-definition: 7.28.2.1 + in-latest-posix: '' + cnd_broadcast: + c-definition: 7.28.3.1 + in-latest-posix: '' + cnd_destroy: + c-definition: 7.28.3.2 + in-latest-posix: '' + cnd_init: + c-definition: 7.28.3.3 + in-latest-posix: '' + cnd_signal: + c-definition: 7.28.3.4 + in-latest-posix: '' + cnd_timedwait: + c-definition: 7.28.3.5 + in-latest-posix: '' + cnd_wait: + c-definition: 7.28.3.6 + in-latest-posix: '' + mtx_destroy: + c-definition: 7.28.4.2 + in-latest-posix: '' + mtx_init: + c-definition: 7.28.4.3 + in-latest-posix: '' + mtx_lock: + c-definition: 7.28.4.4 + in-latest-posix: '' + mtx_timedlock: + c-definition: 7.28.4.5 + in-latest-posix: '' + mtx_trylock: + c-definition: 7.28.4.6 + in-latest-posix: '' + mtx_unlock: + c-definition: 7.28.4.7 + in-latest-posix: '' + thrd_create: + c-definition: 7.28.5.1 + in-latest-posix: '' + thrd_current: + c-definition: 7.28.5.2 + in-latest-posix: '' + thrd_detach: + c-definition: 7.28.5.3 + in-latest-posix: '' + 
thrd_equal: + c-definition: 7.28.5.4 + in-latest-posix: '' + thrd_exit: + c-definition: 7.28.5.5 + in-latest-posix: '' + thrd_join: + c-definition: 7.28.5.6 + in-latest-posix: '' + thrd_sleep: + c-definition: 7.28.5.7 + in-latest-posix: '' + thrd_yield: + c-definition: 7.28.5.8 + in-latest-posix: '' + tss_create: + c-definition: 7.28.6.1 + in-latest-posix: '' + tss_delete: + c-definition: 7.28.6.2 + in-latest-posix: '' + tss_get: + c-definition: 7.28.6.3 + in-latest-posix: '' + tss_set: + c-definition: 7.28.6.4 + in-latest-posix: '' +macros: + ONCE_FLAG_INIT: + c-definition: 7.28.1 + in-latest-posix: '' + TSS_DTOR_ITERATIONS: + c-definition: 7.28.1 + in-latest-posix: '' + __STDC_NO_THREADS__: + c-definition: 7.28.1 + thread_local: + in-latest-posix: '' + diff --git a/libc/utils/docgen/uchar.json b/libc/utils/docgen/uchar.json deleted file mode 100644 index c7a8764657da6..0000000000000 --- a/libc/utils/docgen/uchar.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_UCHAR_H__": { - "c-definition": "7.30.1" - } - }, - "functions": { - "mbrtoc8": { - "c-definition": "7.30.2.2" - }, - "c8rtomb": { - "c-definition": "7.30.2.3" - }, - "mbrtoc16": { - "c-definition": "7.30.2.4" - }, - "c16rtomb": { - "c-definition": "7.30.2.5" - }, - "mbrtoc32": { - "c-definition": "7.30.2.6" - }, - "c32rtomb": { - "c-definition": "7.30.2.7" - } - } -} diff --git a/libc/utils/docgen/uchar.yaml b/libc/utils/docgen/uchar.yaml new file mode 100644 index 0000000000000..580af0f548336 --- /dev/null +++ b/libc/utils/docgen/uchar.yaml @@ -0,0 +1,21 @@ +functions: + c16rtomb: + c-definition: 7.30.2.5 + in-latest-posix: '' + c32rtomb: + c-definition: 7.30.2.7 + in-latest-posix: '' + c8rtomb: + c-definition: 7.30.2.3 + mbrtoc16: + c-definition: 7.30.2.4 + in-latest-posix: '' + mbrtoc32: + c-definition: 7.30.2.6 + in-latest-posix: '' + mbrtoc8: + c-definition: 7.30.2.2 +macros: + __STDC_VERSION_UCHAR_H__: + c-definition: 7.30.1 + diff --git a/libc/utils/docgen/wchar.json 
b/libc/utils/docgen/wchar.json deleted file mode 100644 index a44f2ef82b7cd..0000000000000 --- a/libc/utils/docgen/wchar.json +++ /dev/null @@ -1,198 +0,0 @@ -{ - "macros": { - "__STDC_VERSION_WCHAR_H__": { - "c-definition": "7.31.1" - }, - "WEOF": { - "c-definition": "7.31.1" - } - }, - "functions": { - "fwprintf": { - "c-definition": "7.31.2.2" - }, - "fwscanf": { - "c-definition": "7.31.2.3" - }, - "swprintf": { - "c-definition": "7.31.2.4" - }, - "swscanf": { - "c-definition": "7.31.2.5" - }, - "vfwprintf": { - "c-definition": "7.31.2.6" - }, - "vfwscanf": { - "c-definition": "7.31.2.7" - }, - "vswprintf": { - "c-definition": "7.31.2.8" - }, - "vswscanf": { - "c-definition": "7.31.2.9" - }, - "vwprintf": { - "c-definition": "7.31.2.10" - }, - "vwscanf": { - "c-definition": "7.31.2.11" - }, - "wprintf": { - "c-definition": "7.31.2.12" - }, - "wscanf": { - "c-definition": "7.31.2.13" - }, - "fgetwc": { - "c-definition": "7.31.3.1" - }, - "fgetws": { - "c-definition": "7.31.3.2" - }, - "fputwc": { - "c-definition": "7.31.3.3" - }, - "fputws": { - "c-definition": "7.31.3.4" - }, - "fwide": { - "c-definition": "7.31.3.5" - }, - "getwc": { - "c-definition": "7.31.3.6" - }, - "getwchar": { - "c-definition": "7.31.3.7" - }, - "putwc": { - "c-definition": "7.31.3.8" - }, - "putwchar": { - "c-definition": "7.31.3.9" - }, - "ungetwc": { - "c-definition": "7.31.3.10" - }, - "wcstod": { - "c-definition": "7.31.4.2.2" - }, - "wcstof": { - "c-definition": "7.31.4.2.2" - }, - "wcstold": { - "c-definition": "7.31.4.2.2" - }, - "wcstod32": { - "c-definition": "7.31.4.2.3" - }, - "wcstod64": { - "c-definition": "7.31.4.2.3" - }, - "wcstod128": { - "c-definition": "7.31.4.2.3" - }, - "wcstol": { - "c-definition": "7.31.4.2.4" - }, - "wcstoll": { - "c-definition": "7.31.4.2.4" - }, - "wcstoul": { - "c-definition": "7.31.4.2.4" - }, - "wcstoull": { - "c-definition": "7.31.4.2.4" - }, - "wcscpy": { - "c-definition": "7.31.4.3.1" - }, - "wcsncpy": { - "c-definition": "7.31.4.3.2" - }, 
- "wmemcpy": { - "c-definition": "7.31.4.3.3" - }, - "wmemmove": { - "c-definition": "7.31.4.3.4" - }, - "wcscat": { - "c-definition": "7.31.4.4.1" - }, - "wcsncat": { - "c-definition": "7.31.4.4.2" - }, - "wcscmp": { - "c-definition": "7.31.4.5.2" - }, - "wcscoll": { - "c-definition": "7.31.4.5.3" - }, - "wcsncmp": { - "c-definition": "7.31.4.5.4" - }, - "wcsxfrm": { - "c-definition": "7.31.4.5.5" - }, - "wmemcmp": { - "c-definition": "7.31.4.5.6" - }, - "wcschr": { - "c-definition": "7.31.4.6.2" - }, - "wcscspn": { - "c-definition": "7.31.4.6.3" - }, - "wcspbrk": { - "c-definition": "7.31.4.6.4" - }, - "wcsrchr": { - "c-definition": "7.31.4.6.5" - }, - "wcsspn": { - "c-definition": "7.31.4.6.6" - }, - "wcsstr": { - "c-definition": "7.31.4.6.7" - }, - "wcstok": { - "c-definition": "7.31.4.6.8" - }, - "wmemchr": { - "c-definition": "7.31.4.6.9" - }, - "wcslen": { - "c-definition": "7.31.4.7.1" - }, - "wmemset": { - "c-definition": "7.31.4.7.2" - }, - "wcsftime": { - "c-definition": "7.31.5.1" - }, - "btowc": { - "c-definition": "7.31.6.2.1" - }, - "wctob": { - "c-definition": "7.31.6.2.2" - }, - "mbsinit": { - "c-definition": "7.31.6.3.1" - }, - "mbrlen": { - "c-definition": "7.31.6.4.2" - }, - "mbrtowc": { - "c-definition": "7.31.6.4.3" - }, - "wcrtomb": { - "c-definition": "7.31.6.4.4" - }, - "mbsrtowcs": { - "c-definition": "7.31.6.5.2" - }, - "wcsrtombs": { - "c-definition": "7.31.6.5.3" - } - } -} diff --git a/libc/utils/docgen/wchar.yaml b/libc/utils/docgen/wchar.yaml new file mode 100644 index 0000000000000..dcc8963efdd35 --- /dev/null +++ b/libc/utils/docgen/wchar.yaml @@ -0,0 +1,131 @@ +functions: + btowc: + c-definition: 7.31.6.2.1 + fgetwc: + c-definition: 7.31.3.1 + fgetws: + c-definition: 7.31.3.2 + fputwc: + c-definition: 7.31.3.3 + fputws: + c-definition: 7.31.3.4 + fwide: + c-definition: 7.31.3.5 + fwprintf: + c-definition: 7.31.2.2 + fwscanf: + c-definition: 7.31.2.3 + getwc: + c-definition: 7.31.3.6 + getwchar: + c-definition: 7.31.3.7 + mbrlen: + 
c-definition: 7.31.6.4.2 + mbrtowc: + c-definition: 7.31.6.4.3 + mbsinit: + c-definition: 7.31.6.3.1 + mbsrtowcs: + c-definition: 7.31.6.5.2 + putwc: + c-definition: 7.31.3.8 + putwchar: + c-definition: 7.31.3.9 + swprintf: + c-definition: 7.31.2.4 + swscanf: + c-definition: 7.31.2.5 + ungetwc: + c-definition: 7.31.3.10 + vfwprintf: + c-definition: 7.31.2.6 + vfwscanf: + c-definition: 7.31.2.7 + vswprintf: + c-definition: 7.31.2.8 + vswscanf: + c-definition: 7.31.2.9 + vwprintf: + c-definition: 7.31.2.10 + vwscanf: + c-definition: 7.31.2.11 + wcrtomb: + c-definition: 7.31.6.4.4 + wcscat: + c-definition: 7.31.4.4.1 + wcschr: + c-definition: 7.31.4.6.2 + wcscmp: + c-definition: 7.31.4.5.2 + wcscoll: + c-definition: 7.31.4.5.3 + wcscpy: + c-definition: 7.31.4.3.1 + wcscspn: + c-definition: 7.31.4.6.3 + wcsftime: + c-definition: 7.31.5.1 + wcslen: + c-definition: 7.31.4.7.1 + wcsncat: + c-definition: 7.31.4.4.2 + wcsncmp: + c-definition: 7.31.4.5.4 + wcsncpy: + c-definition: 7.31.4.3.2 + wcspbrk: + c-definition: 7.31.4.6.4 + wcsrchr: + c-definition: 7.31.4.6.5 + wcsrtombs: + c-definition: 7.31.6.5.3 + wcsspn: + c-definition: 7.31.4.6.6 + wcsstr: + c-definition: 7.31.4.6.7 + wcstod: + c-definition: 7.31.4.2.2 + wcstod128: + c-definition: 7.31.4.2.3 + wcstod32: + c-definition: 7.31.4.2.3 + wcstod64: + c-definition: 7.31.4.2.3 + wcstof: + c-definition: 7.31.4.2.2 + wcstok: + c-definition: 7.31.4.6.8 + wcstol: + c-definition: 7.31.4.2.4 + wcstold: + c-definition: 7.31.4.2.2 + wcstoll: + c-definition: 7.31.4.2.4 + wcstoul: + c-definition: 7.31.4.2.4 + wcstoull: + c-definition: 7.31.4.2.4 + wcsxfrm: + c-definition: 7.31.4.5.5 + wctob: + c-definition: 7.31.6.2.2 + wmemchr: + c-definition: 7.31.4.6.9 + wmemcmp: + c-definition: 7.31.4.5.6 + wmemcpy: + c-definition: 7.31.4.3.3 + wmemmove: + c-definition: 7.31.4.3.4 + wmemset: + c-definition: 7.31.4.7.2 + wprintf: + c-definition: 7.31.2.12 + wscanf: + c-definition: 7.31.2.13 +macros: + WEOF: + c-definition: 7.31.1 + 
__STDC_VERSION_WCHAR_H__: + c-definition: 7.31.1 + diff --git a/libc/utils/docgen/wctype.json b/libc/utils/docgen/wctype.json deleted file mode 100644 index 29bac1d5d03aa..0000000000000 --- a/libc/utils/docgen/wctype.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "functions": { - "iswalnum": { - "c-definition": "7.32.2.1.1" - }, - "iswalpha": { - "c-definition": "7.32.2.1.2" - }, - "iswblank": { - "c-definition": "7.32.2.1.3" - }, - "iswblank": { - "c-definition": "7.32.2.1.4" - }, - "iswdigit": { - "c-definition": "7.32.2.1.5" - }, - "iswgraph": { - "c-definition": "7.32.2.1.6" - }, - "iswlower": { - "c-definition": "7.32.2.1.7" - }, - "iswprint": { - "c-definition": "7.32.2.1.8" - }, - "iswpunct": { - "c-definition": "7.32.2.1.9" - }, - "iswspace": { - "c-definition": "7.32.2.1.10" - }, - "iswupper": { - "c-definition": "7.32.2.1.11" - }, - "iswxdigit": { - "c-definition": "7.32.2.1.12" - }, - "iswctype": { - "c-definition": "7.32.2.2.1" - }, - "wctype": { - "c-definition": "7.32.2.2.2" - }, - "towlower": { - "c-definition": "7.32.3.1.1" - }, - "towupper": { - "c-definition": "7.32.3.1.2" - }, - "towctrans": { - "c-definition": "7.32.3.2.1" - }, - "wctrans": { - "c-definition": "7.32.3.2.2" - } - } -} diff --git a/libc/utils/docgen/wctype.yaml b/libc/utils/docgen/wctype.yaml new file mode 100644 index 0000000000000..8675cbe500da3 --- /dev/null +++ b/libc/utils/docgen/wctype.yaml @@ -0,0 +1,38 @@ +functions: + iswalnum: + c-definition: 7.32.2.1.1 + iswalpha: + c-definition: 7.32.2.1.2 + iswblank: + c-definition: 7.32.2.1.3 + iswcntrl: + c-definition: 7.32.2.1.4 + iswctype: + c-definition: 7.32.2.2.1 + iswdigit: + c-definition: 7.32.2.1.5 + iswgraph: + c-definition: 7.32.2.1.6 + iswlower: + c-definition: 7.32.2.1.7 + iswprint: + c-definition: 7.32.2.1.8 + iswpunct: + c-definition: 7.32.2.1.9 + iswspace: + c-definition: 7.32.2.1.10 + iswupper: + c-definition: 7.32.2.1.11 + iswxdigit: + c-definition: 7.32.2.1.12 + towctrans: + c-definition: 7.32.3.2.1 + towlower: + c-definition: 7.32.3.1.1 + towupper: + 
c-definition: 7.32.3.1.2 + wctrans: + c-definition: 7.32.3.2.2 + wctype: + c-definition: 7.32.2.2.2 + diff --git a/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp b/libcxx/test/benchmarks/algorithms/algorithms.partition_point.bench.cpp similarity index 98% rename from libcxx/test/benchmarks/algorithms.partition_point.bench.cpp rename to libcxx/test/benchmarks/algorithms/algorithms.partition_point.bench.cpp index 42ebce8ad2f4a..0777acbafb5cc 100644 --- a/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp +++ b/libcxx/test/benchmarks/algorithms/algorithms.partition_point.bench.cpp @@ -17,8 +17,8 @@ #include "benchmark/benchmark.h" -#include "CartesianBenchmarks.h" -#include "GenerateInput.h" +#include "../CartesianBenchmarks.h" +#include "../GenerateInput.h" namespace { diff --git a/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp b/libcxx/test/benchmarks/algorithms/lexicographical_compare_three_way.bench.cpp similarity index 100% rename from libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp rename to libcxx/test/benchmarks/algorithms/lexicographical_compare_three_way.bench.cpp diff --git a/libcxx/test/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h similarity index 99% rename from libcxx/test/benchmarks/ContainerBenchmarks.h rename to libcxx/test/benchmarks/containers/ContainerBenchmarks.h index 458134c8191d1..6d21e12896ec9 100644 --- a/libcxx/test/benchmarks/ContainerBenchmarks.h +++ b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h @@ -15,7 +15,7 @@ #include #include "benchmark/benchmark.h" -#include "Utilities.h" +#include "../Utilities.h" #include "test_iterators.h" namespace ContainerBenchmarks { diff --git a/libcxx/test/benchmarks/deque.bench.cpp b/libcxx/test/benchmarks/containers/deque.bench.cpp similarity index 98% rename from libcxx/test/benchmarks/deque.bench.cpp rename to libcxx/test/benchmarks/containers/deque.bench.cpp index 
ab0ba96b12ffc..7ff1093a9391c 100644 --- a/libcxx/test/benchmarks/deque.bench.cpp +++ b/libcxx/test/benchmarks/containers/deque.bench.cpp @@ -14,7 +14,7 @@ #include "benchmark/benchmark.h" #include "ContainerBenchmarks.h" -#include "GenerateInput.h" +#include "../GenerateInput.h" using namespace ContainerBenchmarks; diff --git a/libcxx/test/benchmarks/deque_iterator.bench.cpp b/libcxx/test/benchmarks/containers/deque_iterator.bench.cpp similarity index 100% rename from libcxx/test/benchmarks/deque_iterator.bench.cpp rename to libcxx/test/benchmarks/containers/deque_iterator.bench.cpp diff --git a/libcxx/test/benchmarks/map.bench.cpp b/libcxx/test/benchmarks/containers/map.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/map.bench.cpp rename to libcxx/test/benchmarks/containers/map.bench.cpp index 81bdc5077f026..e37c7d8d55163 100644 --- a/libcxx/test/benchmarks/map.bench.cpp +++ b/libcxx/test/benchmarks/containers/map.bench.cpp @@ -14,7 +14,7 @@ #include #include -#include "CartesianBenchmarks.h" +#include "../CartesianBenchmarks.h" #include "benchmark/benchmark.h" #include "test_macros.h" diff --git a/libcxx/test/benchmarks/ordered_set.bench.cpp b/libcxx/test/benchmarks/containers/ordered_set.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/ordered_set.bench.cpp rename to libcxx/test/benchmarks/containers/ordered_set.bench.cpp index 7883233c23aee..cb68902c6dcc8 100644 --- a/libcxx/test/benchmarks/ordered_set.bench.cpp +++ b/libcxx/test/benchmarks/containers/ordered_set.bench.cpp @@ -17,7 +17,7 @@ #include #include -#include "CartesianBenchmarks.h" +#include "../CartesianBenchmarks.h" #include "benchmark/benchmark.h" #include "test_macros.h" diff --git a/libcxx/test/benchmarks/string.bench.cpp b/libcxx/test/benchmarks/containers/string.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/string.bench.cpp rename to libcxx/test/benchmarks/containers/string.bench.cpp index 0d7ce2b87bead..f7da3e2da312b 100644 --- 
a/libcxx/test/benchmarks/string.bench.cpp +++ b/libcxx/test/benchmarks/containers/string.bench.cpp @@ -13,8 +13,8 @@ #include #include -#include "CartesianBenchmarks.h" -#include "GenerateInput.h" +#include "../CartesianBenchmarks.h" +#include "../GenerateInput.h" #include "benchmark/benchmark.h" #include "test_macros.h" diff --git a/libcxx/test/benchmarks/unordered_set_operations.bench.cpp b/libcxx/test/benchmarks/containers/unordered_set_operations.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/unordered_set_operations.bench.cpp rename to libcxx/test/benchmarks/containers/unordered_set_operations.bench.cpp index 7b1700bfd850d..a8448ef5a0cfb 100644 --- a/libcxx/test/benchmarks/unordered_set_operations.bench.cpp +++ b/libcxx/test/benchmarks/containers/unordered_set_operations.bench.cpp @@ -18,7 +18,7 @@ #include "benchmark/benchmark.h" #include "ContainerBenchmarks.h" -#include "GenerateInput.h" +#include "../GenerateInput.h" #include "test_macros.h" using namespace ContainerBenchmarks; diff --git a/libcxx/test/benchmarks/vector_operations.bench.cpp b/libcxx/test/benchmarks/containers/vector_operations.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/vector_operations.bench.cpp rename to libcxx/test/benchmarks/containers/vector_operations.bench.cpp index 3a72eaec4dd57..9449bed31ec38 100644 --- a/libcxx/test/benchmarks/vector_operations.bench.cpp +++ b/libcxx/test/benchmarks/containers/vector_operations.bench.cpp @@ -19,7 +19,7 @@ #include "benchmark/benchmark.h" #include "ContainerBenchmarks.h" -#include "GenerateInput.h" +#include "../GenerateInput.h" using namespace ContainerBenchmarks; diff --git a/libcxx/test/benchmarks/format.bench.cpp b/libcxx/test/benchmarks/format/format.bench.cpp similarity index 100% rename from libcxx/test/benchmarks/format.bench.cpp rename to libcxx/test/benchmarks/format/format.bench.cpp diff --git a/libcxx/test/benchmarks/format_to.bench.cpp b/libcxx/test/benchmarks/format/format_to.bench.cpp 
similarity index 100% rename from libcxx/test/benchmarks/format_to.bench.cpp rename to libcxx/test/benchmarks/format/format_to.bench.cpp diff --git a/libcxx/test/benchmarks/format_to_n.bench.cpp b/libcxx/test/benchmarks/format/format_to_n.bench.cpp similarity index 100% rename from libcxx/test/benchmarks/format_to_n.bench.cpp rename to libcxx/test/benchmarks/format/format_to_n.bench.cpp diff --git a/libcxx/test/benchmarks/formatted_size.bench.cpp b/libcxx/test/benchmarks/format/formatted_size.bench.cpp similarity index 100% rename from libcxx/test/benchmarks/formatted_size.bench.cpp rename to libcxx/test/benchmarks/format/formatted_size.bench.cpp diff --git a/libcxx/test/benchmarks/formatter_float.bench.cpp b/libcxx/test/benchmarks/format/formatter_float.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/formatter_float.bench.cpp rename to libcxx/test/benchmarks/format/formatter_float.bench.cpp index ec20eab3c0937..77c067e7bc03e 100644 --- a/libcxx/test/benchmarks/formatter_float.bench.cpp +++ b/libcxx/test/benchmarks/format/formatter_float.bench.cpp @@ -17,7 +17,7 @@ #include #include -#include "CartesianBenchmarks.h" +#include "../CartesianBenchmarks.h" #include "benchmark/benchmark.h" // *** Localization *** diff --git a/libcxx/test/benchmarks/formatter_int.bench.cpp b/libcxx/test/benchmarks/format/formatter_int.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/formatter_int.bench.cpp rename to libcxx/test/benchmarks/format/formatter_int.bench.cpp index db0edab6ae052..6df0187667aa1 100644 --- a/libcxx/test/benchmarks/formatter_int.bench.cpp +++ b/libcxx/test/benchmarks/format/formatter_int.bench.cpp @@ -12,7 +12,7 @@ #include #include -#include "CartesianBenchmarks.h" +#include "../CartesianBenchmarks.h" #include "benchmark/benchmark.h" #include "test_macros.h" diff --git a/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp b/libcxx/test/benchmarks/format/std_format_spec_string_unicode.bench.cpp similarity index 100% 
rename from libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp rename to libcxx/test/benchmarks/format/std_format_spec_string_unicode.bench.cpp diff --git a/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp b/libcxx/test/benchmarks/format/std_format_spec_string_unicode_escape.bench.cpp similarity index 100% rename from libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp rename to libcxx/test/benchmarks/format/std_format_spec_string_unicode_escape.bench.cpp diff --git a/lldb/include/lldb/Target/StackFrameList.h b/lldb/include/lldb/Target/StackFrameList.h index 7d0e7a5b9a71b..8a66296346f2d 100644 --- a/lldb/include/lldb/Target/StackFrameList.h +++ b/lldb/include/lldb/Target/StackFrameList.h @@ -11,6 +11,7 @@ #include #include +#include #include #include "lldb/Target/StackFrame.h" @@ -94,24 +95,36 @@ class StackFrameList { bool show_unique = false, bool show_hidden = false, const char *frame_marker = nullptr); + /// Returns whether we have currently fetched all the frames of a stack. + bool WereAllFramesFetched() const; + protected: friend class Thread; friend class ScriptedThread; + /// Use this API to build a stack frame list (used for scripted threads, for + /// instance.) This API is not meant for StackFrameLists that have unwinders + /// and partake in lazy stack filling (using GetFramesUpTo). Rather if you + /// are building StackFrameLists with this API, you should build the entire + /// list before making it available for use. bool SetFrameAtIndex(uint32_t idx, lldb::StackFrameSP &frame_sp); - /// Realizes frames up to (and including) end_idx (which can be greater than - /// the actual number of frames.) + /// Ensures that frames up to (and including) `end_idx` are realized in the + /// StackFrameList. `end_idx` can be larger than the actual number of frames, + /// in which case all the frames will be fetched. Acquires the writer end of + /// the list mutex. 
/// Returns true if the function was interrupted, false otherwise. - bool GetFramesUpTo(uint32_t end_idx, - InterruptionControl allow_interrupt = AllowInterruption); - - void GetOnlyConcreteFramesUpTo(uint32_t end_idx, Unwind &unwinder); - - void SynthesizeTailCallFrames(StackFrame &next_frame); - - bool GetAllFramesFetched() { return m_concrete_frames_fetched == UINT32_MAX; } + /// Callers should first check (under the shared mutex) whether we need to + /// fetch frames or not. + bool GetFramesUpTo(uint32_t end_idx, InterruptionControl allow_interrupt); + + // This should be called with either the reader or writer end of the list + // mutex held: + bool GetAllFramesFetched() const { + return m_concrete_frames_fetched == UINT32_MAX; + } + // This should be called with the writer end of the list mutex held. void SetAllFramesFetched() { m_concrete_frames_fetched = UINT32_MAX; } bool DecrementCurrentInlinedDepth(); @@ -122,6 +135,9 @@ class StackFrameList { void SetCurrentInlinedDepth(uint32_t new_depth); + /// Calls into the stack frame recognizers and stop info to set the most + /// relevant frame. This can call out to arbitrary user code so it can't + /// hold the StackFrameList mutex. void SelectMostRelevantFrame(); typedef std::vector collection; @@ -138,11 +154,16 @@ class StackFrameList { // source of information. lldb::StackFrameListSP m_prev_frames_sp; - /// A mutex for this frame list. - // TODO: This mutex may not always be held when required. In particular, uses - // of the StackFrameList APIs in lldb_private::Thread look suspect. Consider - // passing around a lock_guard reference to enforce proper locking. - mutable std::recursive_mutex m_mutex; + /// A mutex for this frame list. The only public API that requires the + /// unique lock is Clear. All other clients take the shared lock, though + /// if we need more frames we may swap shared for unique to fulfill that + /// requirement. 
+ mutable std::shared_mutex m_list_mutex; + + // Setting the inlined depth should be protected against other attempts to + // change it, but since it doesn't mutate the list itself, we can limit the + // critical regions it produces by having a separate mutex. + mutable std::mutex m_inlined_depth_mutex; /// A cache of frames. This may need to be updated when the program counter /// changes. @@ -171,6 +192,21 @@ class StackFrameList { const bool m_show_inlined_frames; private: + uint32_t SetSelectedFrameNoLock(lldb_private::StackFrame *frame); + lldb::StackFrameSP + GetFrameAtIndexNoLock(uint32_t idx, + std::shared_lock &guard); + + /// These two Fetch frames APIs and SynthesizeTailCallFrames are called in + /// GetFramesUpTo, they are the ones that actually add frames. They must be + /// called with the writer end of the list mutex held. + + /// Returns true if fetching frames was interrupted, false otherwise. + bool FetchFramesUpTo(uint32_t end_idx, InterruptionControl allow_interrupt); + /// Not currently interruptible so returns void. 
+ void FetchOnlyConcreteFramesUpTo(uint32_t end_idx); + void SynthesizeTailCallFrames(StackFrame &next_frame); + StackFrameList(const StackFrameList &) = delete; const StackFrameList &operator=(const StackFrameList &) = delete; }; diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index 94a381edd5e20..9c6208e9e0a65 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -38,7 +38,7 @@ using namespace lldb_private; StackFrameList::StackFrameList(Thread &thread, const lldb::StackFrameListSP &prev_frames_sp, bool show_inline_frames) - : m_thread(thread), m_prev_frames_sp(prev_frames_sp), m_mutex(), m_frames(), + : m_thread(thread), m_prev_frames_sp(prev_frames_sp), m_frames(), m_selected_frame_idx(), m_concrete_frames_fetched(0), m_current_inlined_depth(UINT32_MAX), m_current_inlined_pc(LLDB_INVALID_ADDRESS), @@ -63,6 +63,7 @@ void StackFrameList::CalculateCurrentInlinedDepth() { } uint32_t StackFrameList::GetCurrentInlinedDepth() { + std::lock_guard guard(m_inlined_depth_mutex); if (m_show_inlined_frames && m_current_inlined_pc != LLDB_INVALID_ADDRESS) { lldb::addr_t cur_pc = m_thread.GetRegisterContext()->GetPC(); if (cur_pc != m_current_inlined_pc) { @@ -84,11 +85,6 @@ void StackFrameList::ResetCurrentInlinedDepth() { if (!m_show_inlined_frames) return; - std::lock_guard guard(m_mutex); - - m_current_inlined_pc = LLDB_INVALID_ADDRESS; - m_current_inlined_depth = UINT32_MAX; - StopInfoSP stop_info_sp = m_thread.GetStopInfo(); if (!stop_info_sp) return; @@ -98,6 +94,7 @@ void StackFrameList::ResetCurrentInlinedDepth() { // We're only adjusting the inlined stack here. 
Log *log = GetLog(LLDBLog::Step); if (inline_depth) { + std::lock_guard guard(m_inlined_depth_mutex); m_current_inlined_depth = *inline_depth; m_current_inlined_pc = m_thread.GetRegisterContext()->GetPC(); @@ -107,6 +104,9 @@ void StackFrameList::ResetCurrentInlinedDepth() { "depth: %d 0x%" PRIx64 ".\n", m_current_inlined_depth, m_current_inlined_pc); } else { + std::lock_guard guard(m_inlined_depth_mutex); + m_current_inlined_pc = LLDB_INVALID_ADDRESS; + m_current_inlined_depth = UINT32_MAX; if (log && log->GetVerbose()) LLDB_LOGF( log, @@ -119,6 +119,7 @@ bool StackFrameList::DecrementCurrentInlinedDepth() { uint32_t current_inlined_depth = GetCurrentInlinedDepth(); if (current_inlined_depth != UINT32_MAX) { if (current_inlined_depth > 0) { + std::lock_guard guard(m_inlined_depth_mutex); m_current_inlined_depth--; return true; } @@ -128,6 +129,7 @@ bool StackFrameList::DecrementCurrentInlinedDepth() { } void StackFrameList::SetCurrentInlinedDepth(uint32_t new_depth) { + std::lock_guard guard(m_inlined_depth_mutex); m_current_inlined_depth = new_depth; if (new_depth == UINT32_MAX) m_current_inlined_pc = LLDB_INVALID_ADDRESS; @@ -135,23 +137,9 @@ void StackFrameList::SetCurrentInlinedDepth(uint32_t new_depth) { m_current_inlined_pc = m_thread.GetRegisterContext()->GetPC(); } -void StackFrameList::GetOnlyConcreteFramesUpTo(uint32_t end_idx, - Unwind &unwinder) { - assert(m_thread.IsValid() && "Expected valid thread"); - assert(m_frames.size() <= end_idx && "Expected there to be frames to fill"); - - if (end_idx < m_concrete_frames_fetched) - return; - - uint32_t num_frames = unwinder.GetFramesUpTo(end_idx); - if (num_frames <= end_idx + 1) { - // Done unwinding. - m_concrete_frames_fetched = UINT32_MAX; - } - - // Don't create the frames eagerly. Defer this work to GetFrameAtIndex, - // which can lazily query the unwinder to create frames. 
- m_frames.resize(num_frames); +bool StackFrameList::WereAllFramesFetched() const { + std::shared_lock guard(m_list_mutex); + return GetAllFramesFetched(); } /// A sequence of calls that comprise some portion of a backtrace. Each frame @@ -167,6 +155,8 @@ using CallSequence = std::vector; /// Find the unique path through the call graph from \p begin (with return PC /// \p return_pc) to \p end. On success this path is stored into \p path, and /// on failure \p path is unchanged. +/// This function doesn't currently access StackFrameLists at all, it only looks +/// at the frame set in the ExecutionContext it passes around. static void FindInterveningFrames(Function &begin, Function &end, ExecutionContext &exe_ctx, Target &target, addr_t return_pc, CallSequence &path, @@ -350,23 +340,65 @@ void StackFrameList::SynthesizeTailCallFrames(StackFrame &next_frame) { bool StackFrameList::GetFramesUpTo(uint32_t end_idx, InterruptionControl allow_interrupt) { + // GetFramesUpTo is always called with the intent to add frames, so get the + // writer lock: + std::unique_lock guard(m_list_mutex); + // Now that we have the lock, check to make sure someone didn't get there + // ahead of us: + if (m_frames.size() > end_idx || GetAllFramesFetched()) + return false; + // Do not fetch frames for an invalid thread. bool was_interrupted = false; if (!m_thread.IsValid()) return false; - // We've already gotten more frames than asked for, or we've already finished - // unwinding, return. 
- if (m_frames.size() > end_idx || GetAllFramesFetched()) + // lock the writer side of m_list_mutex as we're going to add frames here: + if (!m_show_inlined_frames) { + if (end_idx < m_concrete_frames_fetched) + return false; + // We're adding concrete frames now: + // FIXME: This should also be interruptible: + FetchOnlyConcreteFramesUpTo(end_idx); return false; + } + + // We're adding concrete and inlined frames now: + was_interrupted = FetchFramesUpTo(end_idx, allow_interrupt); + +#if defined(DEBUG_STACK_FRAMES) + s.PutCString("\n\nNew frames:\n"); + Dump(&s); + s.EOL(); +#endif + return was_interrupted; +} + +void StackFrameList::FetchOnlyConcreteFramesUpTo(uint32_t end_idx) { + assert(m_thread.IsValid() && "Expected valid thread"); + assert(m_frames.size() <= end_idx && "Expected there to be frames to fill"); Unwind &unwinder = m_thread.GetUnwinder(); - if (!m_show_inlined_frames) { - GetOnlyConcreteFramesUpTo(end_idx, unwinder); - return false; + if (end_idx < m_concrete_frames_fetched) + return; + + uint32_t num_frames = unwinder.GetFramesUpTo(end_idx); + if (num_frames <= end_idx + 1) { + // Done unwinding. + m_concrete_frames_fetched = UINT32_MAX; } + // Don't create the frames eagerly. Defer this work to GetFrameAtIndex, + // which can lazily query the unwinder to create frames. + m_frames.resize(num_frames); +} + +bool StackFrameList::FetchFramesUpTo(uint32_t end_idx, + InterruptionControl allow_interrupt) { + Unwind &unwinder = m_thread.GetUnwinder(); + bool was_interrupted = false; + #if defined(DEBUG_STACK_FRAMES) StreamFile s(stdout, false); #endif @@ -421,11 +453,11 @@ bool StackFrameList::GetFramesUpTo(uint32_t end_idx, } else { // Check for interruption when building the frames. // Do the check in idx > 0 so that we'll always create a 0th frame. 
- if (allow_interrupt - && INTERRUPT_REQUESTED(dbg, "Interrupted having fetched {0} frames", - m_frames.size())) { - was_interrupted = true; - break; + if (allow_interrupt && + INTERRUPT_REQUESTED(dbg, "Interrupted having fetched {0} frames", + m_frames.size())) { + was_interrupted = true; + break; } const bool success = @@ -534,12 +566,6 @@ bool StackFrameList::GetFramesUpTo(uint32_t end_idx, // We are done with the old stack frame list, we can release it now. m_prev_frames_sp.reset(); } - -#if defined(DEBUG_STACK_FRAMES) - s.PutCString("\n\nNew frames:\n"); - Dump(&s); - s.EOL(); -#endif // Don't report interrupted if we happen to have gotten all the frames: if (!GetAllFramesFetched()) return was_interrupted; @@ -547,20 +573,23 @@ bool StackFrameList::GetFramesUpTo(uint32_t end_idx, } uint32_t StackFrameList::GetNumFrames(bool can_create) { - std::lock_guard guard(m_mutex); - - if (can_create) { + if (!WereAllFramesFetched() && can_create) { // Don't allow interrupt or we might not return the correct count - GetFramesUpTo(UINT32_MAX, DoNotAllowInterruption); + GetFramesUpTo(UINT32_MAX, DoNotAllowInterruption); + } + uint32_t frame_idx; + { + std::shared_lock guard(m_list_mutex); + frame_idx = GetVisibleStackFrameIndex(m_frames.size()); } - return GetVisibleStackFrameIndex(m_frames.size()); + return frame_idx; } void StackFrameList::Dump(Stream *s) { if (s == nullptr) return; - std::lock_guard guard(m_mutex); + std::shared_lock guard(m_list_mutex); const_iterator pos, begin = m_frames.begin(), end = m_frames.end(); for (pos = begin; pos != end; ++pos) { @@ -578,72 +607,53 @@ void StackFrameList::Dump(Stream *s) { StackFrameSP StackFrameList::GetFrameAtIndex(uint32_t idx) { StackFrameSP frame_sp; - std::lock_guard guard(m_mutex); uint32_t original_idx = idx; - uint32_t inlined_depth = GetCurrentInlinedDepth(); - if (inlined_depth != UINT32_MAX) - idx += inlined_depth; + // We're going to consult the m_frames.size, but if there are already + // enough frames for our 
request we don't want to block other readers, so + // first acquire the shared lock: + { // Scope for shared lock: + std::shared_lock guard(m_list_mutex); - if (idx < m_frames.size()) - frame_sp = m_frames[idx]; + uint32_t inlined_depth = GetCurrentInlinedDepth(); + if (inlined_depth != UINT32_MAX) + idx += inlined_depth; - if (frame_sp) - return frame_sp; + if (idx < m_frames.size()) + frame_sp = m_frames[idx]; + + if (frame_sp) + return frame_sp; + } // End of reader lock scope // GetFramesUpTo will fill m_frames with as many frames as you asked for, if // there are that many. If there weren't then you asked for too many frames. // GetFramesUpTo returns true if interrupted: - if (GetFramesUpTo(idx)) { + if (GetFramesUpTo(idx, AllowInterruption)) { Log *log = GetLog(LLDBLog::Thread); LLDB_LOG(log, "GetFrameAtIndex was interrupted"); return {}; } - if (idx < m_frames.size()) { - if (m_show_inlined_frames) { - // When inline frames are enabled we actually create all the frames in - // GetFramesUpTo. + { // Now we're accessing m_frames as a reader, so acquire the reader lock. + std::shared_lock guard(m_list_mutex); + if (idx < m_frames.size()) { frame_sp = m_frames[idx]; - } else { - addr_t pc, cfa; - bool behaves_like_zeroth_frame = (idx == 0); - if (m_thread.GetUnwinder().GetFrameInfoAtIndex( - idx, cfa, pc, behaves_like_zeroth_frame)) { - const bool cfa_is_valid = true; - frame_sp = std::make_shared( - m_thread.shared_from_this(), idx, idx, cfa, cfa_is_valid, pc, - StackFrame::Kind::Regular, behaves_like_zeroth_frame, nullptr); - - Function *function = - frame_sp->GetSymbolContext(eSymbolContextFunction).function; - if (function) { - // When we aren't showing inline functions we always use the top - // most function block as the scope. - frame_sp->SetSymbolContextScope(&function->GetBlock(false)); - } else { - // Set the symbol scope from the symbol regardless if it is nullptr - // or valid. 
- frame_sp->SetSymbolContextScope( - frame_sp->GetSymbolContext(eSymbolContextSymbol).symbol); - } - SetFrameAtIndex(idx, frame_sp); + } else if (original_idx == 0) { + // There should ALWAYS be a frame at index 0. If something went wrong + // with the CurrentInlinedDepth such that there weren't as many frames as + // we thought taking that into account, then reset the current inlined + // depth and return the real zeroth frame. + if (m_frames.empty()) { + // Why do we have a thread with zero frames, that should not ever + // happen... + assert(!m_thread.IsValid() && "A valid thread has no frames."); + } else { + ResetCurrentInlinedDepth(); + frame_sp = m_frames[original_idx]; } } - } else if (original_idx == 0) { - // There should ALWAYS be a frame at index 0. If something went wrong with - // the CurrentInlinedDepth such that there weren't as many frames as we - // thought taking that into account, then reset the current inlined depth - // and return the real zeroth frame. - if (m_frames.empty()) { - // Why do we have a thread with zero frames, that should not ever - // happen... - assert(!m_thread.IsValid() && "A valid thread has no frames."); - } else { - ResetCurrentInlinedDepth(); - frame_sp = m_frames[original_idx]; - } - } + } // End of reader lock scope return frame_sp; } @@ -675,19 +685,18 @@ StackFrameSP StackFrameList::GetFrameWithStackID(const StackID &stack_id) { StackFrameSP frame_sp; if (stack_id.IsValid()) { - std::lock_guard guard(m_mutex); uint32_t frame_idx = 0; - // Do a binary search in case the stack frame is already in our cache - collection::const_iterator begin = m_frames.begin(); - collection::const_iterator end = m_frames.end(); - if (begin != end) { + { + // First see if the frame is already realized. 
This is the scope for + // the shared mutex: + std::shared_lock guard(m_list_mutex); + // Do a binary search in case the stack frame is already in our cache collection::const_iterator pos = - std::lower_bound(begin, end, stack_id, CompareStackID); - if (pos != end) { - if ((*pos)->GetStackID() == stack_id) - return *pos; - } + llvm::lower_bound(m_frames, stack_id, CompareStackID); + if (pos != m_frames.end() && (*pos)->GetStackID() == stack_id) + return *pos; } + // If we needed to add more frames, we would get to here. do { frame_sp = GetFrameAtIndex(frame_idx); if (frame_sp && frame_sp->GetStackID() == stack_id) @@ -699,6 +708,7 @@ StackFrameSP StackFrameList::GetFrameWithStackID(const StackID &stack_id) { } bool StackFrameList::SetFrameAtIndex(uint32_t idx, StackFrameSP &frame_sp) { + std::unique_lock guard(m_list_mutex); if (idx >= m_frames.size()) m_frames.resize(idx + 1); // Make sure allocation succeeded by checking bounds again @@ -738,7 +748,7 @@ void StackFrameList::SelectMostRelevantFrame() { } LLDB_LOG(log, "Frame #0 not recognized"); - // If this thread has a non-trivial StopInof, then let it suggest + // If this thread has a non-trivial StopInfo, then let it suggest // a most relevant frame: StopInfoSP stop_info_sp = m_thread.GetStopInfo(); uint32_t stack_idx = 0; @@ -771,9 +781,8 @@ void StackFrameList::SelectMostRelevantFrame() { LLDB_LOG(log, "No relevant frame!"); } -uint32_t StackFrameList::GetSelectedFrameIndex( - SelectMostRelevant select_most_relevant) { - std::lock_guard guard(m_mutex); +uint32_t +StackFrameList::GetSelectedFrameIndex(SelectMostRelevant select_most_relevant) { if (!m_selected_frame_idx && select_most_relevant) SelectMostRelevantFrame(); if (!m_selected_frame_idx) { @@ -788,7 +797,8 @@ uint32_t StackFrameList::GetSelectedFrameIndex( } uint32_t StackFrameList::SetSelectedFrame(lldb_private::StackFrame *frame) { - std::lock_guard guard(m_mutex); + std::shared_lock guard(m_list_mutex); + const_iterator pos; const_iterator begin = 
m_frames.begin(); const_iterator end = m_frames.end(); @@ -803,13 +813,11 @@ uint32_t StackFrameList::SetSelectedFrame(lldb_private::StackFrame *frame) { break; } } - SetDefaultFileAndLineToSelectedFrame(); return *m_selected_frame_idx; } bool StackFrameList::SetSelectedFrameByIndex(uint32_t idx) { - std::lock_guard guard(m_mutex); StackFrameSP frame_sp(GetFrameAtIndex(idx)); if (frame_sp) { SetSelectedFrame(frame_sp.get()); @@ -840,7 +848,7 @@ void StackFrameList::SetDefaultFileAndLineToSelectedFrame() { // does not describe how StackFrameLists are currently used. // Clear is currently only used to clear the list in the destructor. void StackFrameList::Clear() { - std::lock_guard guard(m_mutex); + std::unique_lock guard(m_list_mutex); m_frames.clear(); m_concrete_frames_fetched = 0; m_selected_frame_idx.reset(); @@ -848,6 +856,7 @@ void StackFrameList::Clear() { lldb::StackFrameSP StackFrameList::GetStackFrameSPForStackFramePtr(StackFrame *stack_frame_ptr) { + std::shared_lock guard(m_list_mutex); const_iterator pos; const_iterator begin = m_frames.begin(); const_iterator end = m_frames.end(); diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index fb17276051909..a6130f6b925bb 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -1449,7 +1449,7 @@ void Thread::ClearStackFrames() { // frames: // FIXME: At some point we can try to splice in the frames we have fetched // into the new frame as we make it, but let's not try that now. 
- if (m_curr_frames_sp && m_curr_frames_sp->GetAllFramesFetched()) + if (m_curr_frames_sp && m_curr_frames_sp->WereAllFramesFetched()) m_prev_frames_sp.swap(m_curr_frames_sp); m_curr_frames_sp.reset(); diff --git a/lldb/test/API/api/multithreaded/TestMultithreaded.py b/lldb/test/API/api/multithreaded/TestMultithreaded.py index 07c9f5b9bbcca..d5b29ec7af181 100644 --- a/lldb/test/API/api/multithreaded/TestMultithreaded.py +++ b/lldb/test/API/api/multithreaded/TestMultithreaded.py @@ -22,6 +22,7 @@ def setUp(self): self.generateSource("test_listener_event_process_state.cpp") self.generateSource("test_listener_resume.cpp") self.generateSource("test_stop-hook.cpp") + self.generateSource("test_concurrent_unwind.cpp") @skipIfRemote # clang-cl does not support throw or catch (llvm.org/pr24538) @@ -91,7 +92,19 @@ def test_sb_api_listener_resume(self): "test_listener_resume", ) - def build_and_test(self, sources, test_name, args=None): + @skipIfRemote + # clang-cl does not support throw or catch (llvm.org/pr24538) + @skipIfWindows + @skipIfHostIncompatibleWithTarget + def test_concurrent_unwind(self): + """Test that concurrent frame unwinding from multiple threads does not deadlock.""" + self.build_and_test( + "driver.cpp test_concurrent_unwind.cpp", + "test_concurrent_unwind", + inferior_source="deep_stack.cpp", + ) + + def build_and_test(self, sources, test_name, inferior_source="inferior.cpp"): """Build LLDB test from sources, and run expecting 0 exit code""" # These tests link against host lldb API. 
@@ -104,7 +117,7 @@ def build_and_test(self, sources, test_name, args=None): ) self.inferior = "inferior_program" - self.buildProgram("inferior.cpp", self.inferior) + self.buildProgram(inferior_source, self.inferior) self.addTearDownHook(lambda: os.remove(self.getBuildArtifact(self.inferior))) self.buildDriver(sources, test_name) diff --git a/lldb/test/API/api/multithreaded/deep_stack.cpp b/lldb/test/API/api/multithreaded/deep_stack.cpp new file mode 100644 index 0000000000000..da89228766e42 --- /dev/null +++ b/lldb/test/API/api/multithreaded/deep_stack.cpp @@ -0,0 +1,17 @@ +// This is a test program that makes a deep stack +// so we can test unwinding from multiple threads. + +void call_me(int input) { + if (input > 1000) { + input += 1; // Set a breakpoint here + if (input > 1001) + input += 1; + return; + } else + call_me(++input); +} + +int main() { + call_me(0); + return 0; +} diff --git a/lldb/test/API/api/multithreaded/test_concurrent_unwind.cpp.template b/lldb/test/API/api/multithreaded/test_concurrent_unwind.cpp.template new file mode 100644 index 0000000000000..e5101dde79619 --- /dev/null +++ b/lldb/test/API/api/multithreaded/test_concurrent_unwind.cpp.template @@ -0,0 +1,91 @@ +#include "pseudo_barrier.h" + +#include +#include + +%include_SB_APIs% + +#include "common.h" + +using namespace lldb; + +void test (SBDebugger &dbg, std::vector args) { + +SBError error; + dbg.SetAsync(false); + SBTarget target = dbg.CreateTarget(args.at(0).c_str()); + if (!target.IsValid()) + throw Exception("Invalid target"); + + // Now set our breakpoint and launch: + SBFileSpec main_sourcefile("deep_stack.cpp"); + SBBreakpoint bkpt = target.BreakpointCreateBySourceRegex("Set a breakpoint here", + main_sourcefile); + if (bkpt.GetNumLocations() == 0) + throw Exception("Main breakpoint got no locations"); + + SBLaunchInfo launch_info = target.GetLaunchInfo(); + SBProcess process = target.Launch(launch_info, error); + if (error.Fail()) + throw Exception("Failed to launch 
process"); + if (!process.IsValid()) + throw Exception("Process is not valid"); + if (process.GetState() != lldb::eStateStopped) + throw Exception("Process was not stopped"); + + size_t num_threads = process.GetNumThreads(); + if (num_threads != 1) + throw Exception("Unexpected number of threads."); + SBThread cur_thread = process.GetThreadAtIndex(0); + if (!cur_thread.IsValid()) + throw Exception("Didn't get a valid thread"); + + // Record the number of frames at the point where we stopped: + const size_t num_frames = cur_thread.GetNumFrames(); + // Now step once to clear the frame cache: + cur_thread.StepOver(); + + // Create five threads and set them to getting frames simultaneously, + // and make sure we don't deadlock. + pseudo_barrier_t rendevous; + pseudo_barrier_init(rendevous, 5); + std::atomic_size_t success(true); + std::atomic_size_t largest(0); + + auto lambda = [&](size_t stride){ + pseudo_barrier_wait(rendevous); + bool younger = true; + while (1) { + size_t cursor = largest; + if (cursor > stride && !younger) { + cursor -= stride; + younger = true; + } else { + cursor += stride; + largest += stride; + younger = false; + } + SBFrame frame = cur_thread.GetFrameAtIndex(cursor); + if (!frame.IsValid()) { + if (cursor < num_frames) + success = false; + break; + } + } + + }; + + std::thread thread1(lambda, 1); + std::thread thread2(lambda, 3); + std::thread thread3(lambda, 5); + std::thread thread4(lambda, 7); + std::thread thread5(lambda, 11); + thread1.join(); + thread2.join(); + thread3.join(); + thread4.join(); + thread5.join(); + + if (!success) + throw Exception("One thread stopped before 1000"); +} diff --git a/lldb/unittests/Host/PipeTest.cpp b/lldb/unittests/Host/PipeTest.cpp index 506f3d225a21e..f8fb254b5009c 100644 --- a/lldb/unittests/Host/PipeTest.cpp +++ b/lldb/unittests/Host/PipeTest.cpp @@ -55,6 +55,8 @@ TEST_F(PipeTest, OpenAsReader) { } #endif +// This test is flaky on Windows on Arm. 
+#ifndef _WIN32 TEST_F(PipeTest, WriteWithTimeout) { Pipe pipe; ASSERT_THAT_ERROR(pipe.CreateNew(false).ToError(), llvm::Succeeded()); @@ -150,3 +152,4 @@ TEST_F(PipeTest, WriteWithTimeout) { .ToError(), llvm::Succeeded()); } +#endif /*ifndef _WIN32*/ diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 655bd99680629..11d6deb1bf919 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -143,8 +143,8 @@ quentin.colombet@gmail.com (email), [qcolombet](https://github.com/qcolombet) (G #### MC layer -James Grosbach \ -grosbach@apple.com (email) +Fangrui Song \ +i@maskray.me (email), [MaskRay](https://github.com/MaskRay) (GitHub) #### Windows codegen @@ -340,8 +340,8 @@ jgorbe@google.com (email), [slackito](https://github.com/slackito) (GitHub) #### TableGen -Paul C. Anagnostopoulos \ -paul@windfall.com (email) +Rahul Joshi \ +rjoshi@nvidia.com (email), [jurahul](https://github.com/jurahul) (GitHub) #### TextAPI @@ -452,9 +452,11 @@ sabre@nondot.org (email), [lattner](https://github.com/lattner) (GitHub), clattn ### Inactive or former component maintainers +Paul C. 
Anagnostopoulos (paul@windfall.com, [Paul-C-Anagnostopoulos](https://github.com/Paul-C-Anagnostopoulos)) -- TableGen \ Justin Bogner (mail@justinbogner.com, [bogner](https://github.com/bogner)) -- SelectionDAG \ Evan Cheng (evan.cheng@apple.com) -- Parts of code generator not covered by someone else \ Renato Golin (rengolin@systemcall.eu, [rengolin](https://github.com/rengolin)) -- ARM backend \ +James Grosbach (grosbach@apple.com) -- MC layer \ Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI \ Chad Rosier (mcrosier@codeaurora.org) -- FastISel \ Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \ diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 3978332b1149a..2867dcceb84fd 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -429,6 +429,9 @@ The current vendor extensions supported are: ``experimental-Xqcia`` LLVM implements `version 0.2 of the Qualcomm uC Arithmetic extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. +``experimental-Xqcics`` + LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. + ``experimental-Xqcicsr`` LLVM implements `version 0.2 of the Qualcomm uC CSR extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index a5805e050bfdb..ddee4ab8ce1b3 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -221,6 +221,8 @@ Changes to the RISC-V Backend extension. 
* Adds experimental assembler support for the Qualcomm uC 'Xqcia` (Arithmetic) extension. +* Adds experimental assembler support for the Qualcomm uC 'Xqcics` (Conditional Select) + extension. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst index 8fd4c39d3ff47..ed1dea2324918 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst @@ -336,7 +336,7 @@ the function: /// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of /// the function. This is used for mutable variables etc. static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction, - const std::string &VarName) { + StringRef VarName) { IRBuilder<> TmpB(&TheFunction->getEntryBlock(), TheFunction->getEntryBlock().begin()); return TmpB.CreateAlloca(Type::getDoubleTy(*TheContext), nullptr, @@ -440,11 +440,11 @@ get good codegen once again: .. code-block:: c++ // Promote allocas to registers. - TheFPM->add(createPromoteMemoryToRegisterPass()); + TheFPM->addPass(PromotePass()); // Do simple "peephole" optimizations and bit-twiddling optzns. - TheFPM->add(createInstructionCombiningPass()); + TheFPM->addPass(InstCombinePass()); // Reassociate expressions. - TheFPM->add(createReassociatePass()); + TheFPM->addPass(ReassociatePass()); ... 
It is interesting to see what the code looks like before and after the diff --git a/llvm/examples/Kaleidoscope/Chapter7/toy.cpp b/llvm/examples/Kaleidoscope/Chapter7/toy.cpp index 68208c4f3394a..374f2c03b48e0 100644 --- a/llvm/examples/Kaleidoscope/Chapter7/toy.cpp +++ b/llvm/examples/Kaleidoscope/Chapter7/toy.cpp @@ -21,7 +21,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/Reassociate.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" -#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Mem2Reg.h" #include #include #include @@ -1142,6 +1142,8 @@ static void InitializeModuleAndManagers() { TheSI->registerCallbacks(*ThePIC, TheMAM.get()); // Add transform passes. + // Promote allocas to registers. + TheFPM->addPass(PromotePass()); // Do simple "peephole" optimizations and bit-twiddling optzns. TheFPM->addPass(InstCombinePass()); // Reassociate expressions. diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h new file mode 100644 index 0000000000000..4049f892fa66e --- /dev/null +++ b/llvm/include/llvm/ADT/StringTable.h @@ -0,0 +1,91 @@ +//===- StringTable.h - Table of strings tracked by offset ----------C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_STRING_TABLE_H +#define LLVM_ADT_STRING_TABLE_H + +#include "llvm/ADT/StringRef.h" +#include + +namespace llvm { + +/// A table of densely packed, null-terminated strings indexed by offset. +/// +/// This table abstracts a densely concatenated list of null-terminated strings, +/// each of which can be referenced using an offset into the table. +/// +/// This requires and ensures that the string at offset 0 is also the empty +/// string. 
This helps allow zero-initialized offsets form empty strings and +/// avoids non-zero initialization when using a string literal pointer would +/// allow a null pointer. +/// +/// The primary use case is having a single global string literal for the table +/// contents, and offsets into it in other global data structures to avoid +/// dynamic relocations of individual string literal pointers in those global +/// data structures. +class StringTable { + StringRef Table; + +public: + // An offset into one of these packed string tables, used to select a string + // within the table. + // + // Typically these are created by TableGen or other code generator from + // computed offsets, and it just wraps that integer into a type until it is + // used with the relevant table. + // + // We also ensure that the empty string is at offset zero and default + // constructing this class gives you an offset of zero. This makes default + // constructing this type work similarly (after indexing the table) to default + // constructing a `StringRef`. + class Offset { + // Note that we ensure the empty string is at offset zero. + unsigned Value = 0; + + public: + constexpr Offset() = default; + constexpr Offset(unsigned Value) : Value(Value) {} + + constexpr unsigned value() const { return Value; } + }; + + // We directly handle string literals with a templated converting constructor + // because we *don't* want to do `strlen` on them -- we fully expect null + // bytes in this input. This is somewhat the opposite of how `StringLiteral` + // works. + template + constexpr StringTable(const char (&RawTable)[N]) : Table(RawTable, N) { + static_assert(N <= std::numeric_limits::max(), + "We only support table sizes that can be indexed by an " + "`unsigned` offset."); + + // Note that we can only use `empty`, `data`, and `size` in these asserts to + // support `constexpr`. 
+ assert(!Table.empty() && "Requires at least a valid empty string."); + assert(Table.data()[0] == '\0' && "Offset zero must be the empty string."); + // Ensure that `strlen` from any offset cannot overflow the end of the table + // by insisting on a null byte at the end. + assert(Table.data()[Table.size() - 1] == '\0' && + "Last byte must be a null byte."); + } + + // Get a string from the table starting with the provided offset. The returned + // `StringRef` is in fact null terminated, and so can be converted safely to a + // C-string if necessary for a system API. + constexpr StringRef operator[](Offset O) const { + assert(O.value() < Table.size() && "Out of bounds offset!"); + return Table.data() + O.value(); + } + + /// Returns the byte size of the table. + constexpr size_t size() const { return Table.size(); } +}; + +} // namespace llvm + +#endif // LLVM_ADT_STRING_TABLE_H diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 5d992faf99d27..e8041e22b031c 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -50,9 +50,16 @@ enum class RecurKind { FMulAdd, ///< Sum of float products with llvm.fmuladd(a * b + sum). IAnyOf, ///< Any_of reduction with select(icmp(),x,y) where one of (x,y) is ///< loop invariant, and both x and y are integer type. - FAnyOf ///< Any_of reduction with select(fcmp(),x,y) where one of (x,y) is + FAnyOf, ///< Any_of reduction with select(fcmp(),x,y) where one of (x,y) is ///< loop invariant, and both x and y are integer type. - // TODO: Any_of reduction need not be restricted to integer type only. + IFindLastIV, ///< FindLast reduction with select(icmp(),x,y) where one of + ///< (x,y) is increasing loop induction, and both x and y are + ///< integer type. + FFindLastIV ///< FindLast reduction with select(fcmp(),x,y) where one of (x,y) + ///< is increasing loop induction, and both x and y are integer + ///< type. 
+ // TODO: Any_of and FindLast reduction need not be restricted to integer type + // only. }; /// The RecurrenceDescriptor is used to identify recurrences variables in a @@ -124,7 +131,7 @@ class RecurrenceDescriptor { /// the returned struct. static InstDesc isRecurrenceInstr(Loop *L, PHINode *Phi, Instruction *I, RecurKind Kind, InstDesc &Prev, - FastMathFlags FuncFMF); + FastMathFlags FuncFMF, ScalarEvolution *SE); /// Returns true if instruction I has multiple uses in Insts static bool hasMultipleUsesOf(Instruction *I, @@ -151,6 +158,16 @@ class RecurrenceDescriptor { static InstDesc isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, Instruction *I, InstDesc &Prev); + /// Returns a struct describing whether the instruction is either a + /// Select(ICmp(A, B), X, Y), or + /// Select(FCmp(A, B), X, Y) + /// where one of (X, Y) is an increasing loop induction variable, and the + /// other is a PHI value. + // TODO: Support non-monotonic variable. FindLast does not need be restricted + // to increasing loop induction variables. + static InstDesc isFindLastIVPattern(Loop *TheLoop, PHINode *OrigPhi, + Instruction *I, ScalarEvolution &SE); + /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I); @@ -236,10 +253,25 @@ class RecurrenceDescriptor { return Kind == RecurKind::IAnyOf || Kind == RecurKind::FAnyOf; } + /// Returns true if the recurrence kind is of the form + /// select(cmp(),x,y) where one of (x,y) is increasing loop induction. + static bool isFindLastIVRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::IFindLastIV || Kind == RecurKind::FFindLastIV; + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. 
Type *getRecurrenceType() const { return RecurrenceType; } + /// Returns the sentinel value for FindLastIV recurrences to replace the start + /// value. + Value *getSentinelValue() const { + assert(isFindLastIVRecurrenceKind(Kind) && "Unexpected recurrence kind"); + Type *Ty = StartValue->getType(); + return ConstantInt::get(Ty, + APInt::getSignedMinValue(Ty->getIntegerBitWidth())); + } + /// Returns a reference to the instructions used for type-promoting the /// recurrence. const SmallPtrSet &getCastInsts() const { return CastInsts; } diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index bbe2741f44fc3..c9d3874e7dd96 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -64,6 +64,9 @@ class PtrUseVisitorBase { /// Is the pointer escaped at some point? bool isEscaped() const { return EscapedInfo != nullptr; } + /// Is the pointer escaped into a read-only nocapture call at some point? + bool isEscapedReadOnly() const { return EscapedReadOnly != nullptr; } + /// Get the instruction causing the visit to abort. /// \returns a pointer to the instruction causing the abort if one is /// available; otherwise returns null. @@ -74,6 +77,10 @@ class PtrUseVisitorBase { /// is available; otherwise returns null. Instruction *getEscapingInst() const { return EscapedInfo; } + /// Get the instruction causing the pointer to escape which is a read-only + /// nocapture call. + Instruction *getEscapedReadOnlyInst() const { return EscapedReadOnly; } + /// Mark the visit as aborted. Intended for use in a void return. /// \param I The instruction which caused the visit to abort, if available. void setAborted(Instruction *I) { @@ -88,6 +95,12 @@ class PtrUseVisitorBase { EscapedInfo = I; } + /// Mark the pointer as escaped into a readonly-nocapture call. 
+ void setEscapedReadOnly(Instruction *I) { + assert(I && "Expected a valid pointer in setEscapedReadOnly"); + EscapedReadOnly = I; + } + /// Mark the pointer as escaped, and the visit as aborted. Intended /// for use in a void return. /// \param I The instruction which both escapes the pointer and aborts the @@ -100,6 +113,7 @@ class PtrUseVisitorBase { private: Instruction *AbortedInfo = nullptr; Instruction *EscapedInfo = nullptr; + Instruction *EscapedReadOnly = nullptr; }; protected: diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 325c9cd9900b3..f51d2bb9d50a2 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -10,7 +10,9 @@ #define LLVM_ANALYSIS_TARGETLIBRARYINFO_H #include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/TargetParser/Triple.h" @@ -565,6 +567,16 @@ class TargetLibraryInfo { /// \copydoc TargetLibraryInfoImpl::getSizeTSize() unsigned getSizeTSize(const Module &M) const { return Impl->getSizeTSize(M); } + /// Returns an IntegerType corresponding to size_t. + IntegerType *getSizeTType(const Module &M) const { + return IntegerType::get(M.getContext(), getSizeTSize(M)); + } + + /// Returns a constant materialized as a size_t type. 
+ ConstantInt *getAsSizeT(uint64_t V, const Module &M) const { + return ConstantInt::get(getSizeTType(M), V); + } + /// \copydoc TargetLibraryInfoImpl::getIntSize() unsigned getIntSize() const { return Impl->getIntSize(); diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 42d132efec2e7..3dd62b2ba333c 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -99,8 +99,16 @@ namespace llvm { -extern cl::opt ForceTopDown; -extern cl::opt ForceBottomUp; +namespace MISched { +enum Direction { + Unspecified, + TopDown, + BottomUp, + Bidirectional, +}; +} // namespace MISched + +extern cl::opt PreRADirection; extern cl::opt VerifyScheduling; #ifndef NDEBUG extern cl::opt ViewMISchedDAGs; diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 67632fb79f8aa..3e22b6ff71c8f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -758,15 +758,13 @@ template // struct LinearT { // std::get won't work here due to duplicate types in the tuple. using List = ObjectListT; - using StepSimpleModifier = E; + // StepSimpleModifier is same as StepComplexModifier. using StepComplexModifier = E; ENUM(LinearModifier, Ref, Val, Uval); using TupleTrait = std::true_type; // Step == nullopt means 1. 
- std::tuple - t; + std::tuple t; }; // V5.2: [5.8.5] `link` clause diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 772f60343c634..4f23a6792d634 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -464,7 +464,7 @@ def OMPC_Sizes: Clause<"sizes"> { } def OMPC_TaskReduction : Clause<"task_reduction"> { let clangClass = "OMPTaskReductionClause"; - let flangClass = "OmpReductionClause"; + let flangClass = "OmpTaskReductionClause"; } def OMPC_ThreadLimit : Clause<"thread_limit"> { let clangClass = "OMPThreadLimitClause"; diff --git a/llvm/include/llvm/Transforms/Utils/Evaluator.h b/llvm/include/llvm/Transforms/Utils/Evaluator.h index 5d53773b5d6b6..118037625421a 100644 --- a/llvm/include/llvm/Transforms/Utils/Evaluator.h +++ b/llvm/include/llvm/Transforms/Utils/Evaluator.h @@ -125,9 +125,6 @@ class Evaluator { ValueStack.back()[V] = C; } - /// Casts call result to a type of bitcast call expression - Constant *castCallResultIfNeeded(Type *ReturnType, Constant *RV); - /// Given call site return callee and list of its formal arguments Function *getCalleeWithFormalArgs(CallBase &CB, SmallVectorImpl &Formals); diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 4b3d6fbed8362..b4cd52fef70fd 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -419,6 +419,12 @@ Value *createAnyOfReduction(IRBuilderBase &B, Value *Src, const RecurrenceDescriptor &Desc, PHINode *OrigPhi); +/// Create a reduction of the given vector \p Src for a reduction of the +/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The reduction +/// operation is described by \p Desc. 
+Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, + const RecurrenceDescriptor &Desc); + /// Create a generic reduction using a recurrence descriptor \p Desc /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index 73649766a9538..989cf0b2d0e7b 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -188,6 +188,13 @@ class LoadAndStorePromoter { /// Return false if a sub-class wants to keep one of the loads/stores /// after the SSA construction. virtual bool shouldDelete(Instruction *I) const { return true; } + + /// Return the value to use for the point in the code that the alloca is + /// positioned. This will only be used if an Alloca is included in Insts, + /// otherwise the value of a uninitialized load will be assumed to be poison. + virtual Value *getValueToUseForAlloca(Instruction *AI) const { + return nullptr; + } }; } // end namespace llvm diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index e1eb219cf977e..76a78d5229652 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -51,6 +51,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { case RecurKind::UMin: case RecurKind::IAnyOf: case RecurKind::FAnyOf: + case RecurKind::IFindLastIV: + case RecurKind::FFindLastIV: return true; } return false; @@ -372,7 +374,7 @@ bool RecurrenceDescriptor::AddReductionVar( // type-promoted). if (Cur != Start) { ReduxDesc = - isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF); + isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF, SE); ExactFPMathInst = ExactFPMathInst == nullptr ? 
ReduxDesc.getExactFPMathInst() : ExactFPMathInst; @@ -658,6 +660,95 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, : RecurKind::FAnyOf); } +// We are looking for loops that do something like this: +// int r = 0; +// for (int i = 0; i < n; i++) { +// if (src[i] > 3) +// r = i; +// } +// The reduction value (r) is derived from either the values of an increasing +// induction variable (i) sequence, or from the start value (0). +// The LLVM IR generated for such loops would be as follows: +// for.body: +// %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ] +// %i = phi i32 [ %inc, %for.body ], [ 0, %entry ] +// ... +// %cmp = icmp sgt i32 %5, 3 +// %spec.select = select i1 %cmp, i32 %i, i32 %r +// %inc = add nsw i32 %i, 1 +// ... +// Since 'i' is an increasing induction variable, the reduction value after the +// loop will be the maximum value of 'i' that the condition (src[i] > 3) is +// satisfied, or the start value (0 in the example above). When the start value +// of the increasing induction variable 'i' is greater than the minimum value of +// the data type, we can use the minimum value of the data type as a sentinel +// value to replace the start value. This allows us to perform a single +// reduction max operation to obtain the final reduction result. +// TODO: It is possible to solve the case where the start value is the minimum +// value of the data type or a non-constant value by using mask and multiple +// reduction operations. +RecurrenceDescriptor::InstDesc +RecurrenceDescriptor::isFindLastIVPattern(Loop *TheLoop, PHINode *OrigPhi, + Instruction *I, ScalarEvolution &SE) { + // TODO: Support the vectorization of FindLastIV when the reduction phi is + // used by more than one select instruction. This vectorization is only + // performed when the SCEV of each increasing induction variable used by the + // select instructions is identical. 
+ if (!OrigPhi->hasOneUse()) + return InstDesc(false, I); + + // TODO: Match selects with multi-use cmp conditions. + Value *NonRdxPhi = nullptr; + if (!match(I, m_CombineOr(m_Select(m_OneUse(m_Cmp()), m_Value(NonRdxPhi), + m_Specific(OrigPhi)), + m_Select(m_OneUse(m_Cmp()), m_Specific(OrigPhi), + m_Value(NonRdxPhi))))) + return InstDesc(false, I); + + auto IsIncreasingLoopInduction = [&](Value *V) { + Type *Ty = V->getType(); + if (!SE.isSCEVable(Ty)) + return false; + + auto *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(V)); + if (!AR || AR->getLoop() != TheLoop) + return false; + + const SCEV *Step = AR->getStepRecurrence(SE); + if (!SE.isKnownPositive(Step)) + return false; + + const ConstantRange IVRange = SE.getSignedRange(AR); + unsigned NumBits = Ty->getIntegerBitWidth(); + // Keep the minimum value of the recurrence type as the sentinel value. + // The maximum acceptable range for the increasing induction variable, + // called the valid range, will be defined as + // [<sentinel value> + 1, <sentinel value>) + // where <sentinel value> is SignedMin(<recurrence type>) + // TODO: This range restriction can be lifted by adding an additional + // virtual OR reduction. + const APInt Sentinel = APInt::getSignedMinValue(NumBits); + const ConstantRange ValidRange = + ConstantRange::getNonEmpty(Sentinel + 1, Sentinel); + LLVM_DEBUG(dbgs() << "LV: FindLastIV valid range is " << ValidRange + << ", and the signed range of " << *AR << " is " + << IVRange << "\n"); + // Ensure the induction variable does not wrap around by verifying that its + // range is fully contained within the valid range. + return ValidRange.contains(IVRange); + }; + + // We are looking for selects of the form: + // select(cmp(), phi, increasing_loop_induction) or + // select(cmp(), increasing_loop_induction, phi) + // TODO: Support for monotonically decreasing induction variable + if (!IsIncreasingLoopInduction(NonRdxPhi)) + return InstDesc(false, I); + + return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) ? 
RecurKind::IFindLastIV + : RecurKind::FFindLastIV); +} + RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, const InstDesc &Prev) { @@ -756,10 +847,9 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) { return InstDesc(true, SI); } -RecurrenceDescriptor::InstDesc -RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi, - Instruction *I, RecurKind Kind, - InstDesc &Prev, FastMathFlags FuncFMF) { +RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( + Loop *L, PHINode *OrigPhi, Instruction *I, RecurKind Kind, InstDesc &Prev, + FastMathFlags FuncFMF, ScalarEvolution *SE) { assert(Prev.getRecKind() == RecurKind::None || Prev.getRecKind() == Kind); switch (I->getOpcode()) { default: @@ -789,6 +879,8 @@ RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi, if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul || Kind == RecurKind::Add || Kind == RecurKind::Mul) return isConditionalRdxPattern(Kind, I); + if (isFindLastIVRecurrenceKind(Kind) && SE) + return isFindLastIVPattern(L, OrigPhi, I, *SE); [[fallthrough]]; case Instruction::FCmp: case Instruction::ICmp: @@ -893,6 +985,15 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::IFindLastIV, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a " + << (RedDes.getRecurrenceKind() == RecurKind::FFindLastIV + ? "F" + : "I") + << "FindLastIV reduction PHI." << *Phi << "\n"); + return true; + } if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, SE)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." 
<< *Phi << "\n"); @@ -1048,12 +1149,14 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::UMax: case RecurKind::UMin: case RecurKind::IAnyOf: + case RecurKind::IFindLastIV: return Instruction::ICmp; case RecurKind::FMax: case RecurKind::FMin: case RecurKind::FMaximum: case RecurKind::FMinimum: case RecurKind::FAnyOf: + case RecurKind::FFindLastIV: return Instruction::FCmp; default: llvm_unreachable("Unknown recurrence operation"); diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index e0482b2b1ce02..8557901192e40 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -1459,19 +1459,16 @@ unsigned TargetLibraryInfoImpl::getWCharSize(const Module &M) const { } unsigned TargetLibraryInfoImpl::getSizeTSize(const Module &M) const { - // There is really no guarantee that sizeof(size_t) is equal to sizeof(int*). - // If that isn't true then it should be possible to derive the SizeTTy from - // the target triple here instead and do an early return. - - // Historically LLVM assume that size_t has same size as intptr_t (hence - // deriving the size from sizeof(int*) in address space zero). This should - // work for most targets. For future consideration: DataLayout also implement - // getIndexSizeInBits which might map better to size_t compared to - // getPointerSizeInBits. Hard coding address space zero here might be - // unfortunate as well. Maybe getDefaultGlobalsAddressSpace() or - // getAllocaAddrSpace() is better. - unsigned AddressSpace = 0; - return M.getDataLayout().getPointerSizeInBits(AddressSpace); + // There is really no guarantee that sizeof(size_t) is equal to the index + // size of the default address space. If that isn't true then it should be + // possible to derive the SizeTTy from the target triple here instead and do + // an early return. 
+ + // Hard coding address space zero may seem unfortunate, but a number of + // configurations of common targets (i386, x86-64 x32, aarch64 x32, possibly + // others) have larger-than-size_t index sizes on non-default address spaces, + // making this the best default. + return M.getDataLayout().getIndexSizeInBits(/*AddressSpace=*/0); } TargetLibraryInfoWrapperPass::TargetLibraryInfoWrapperPass() diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index f2c6949e535d2..c148dbce92d1a 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4906,10 +4906,10 @@ static void computeKnownFPClassFromCond(const Value *V, Value *Cond, if (CmpVal == V) KnownFromContext.knownNot(~(CondIsTrue ? MaskIfTrue : MaskIfFalse)); } else if (match(Cond, m_Intrinsic( - m_Value(LHS), m_ConstantInt(ClassVal)))) { + m_Specific(V), m_ConstantInt(ClassVal)))) { FPClassTest Mask = static_cast(ClassVal); KnownFromContext.knownNot(CondIsTrue ? ~Mask : Mask); - } else if (match(Cond, m_ICmp(Pred, m_ElementWiseBitCast(m_Value(LHS)), + } else if (match(Cond, m_ICmp(Pred, m_ElementWiseBitCast(m_Specific(V)), m_APInt(RHS)))) { bool TrueIfSigned; if (!isSignBitCheck(Pred, *RHS, TrueIfSigned)) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f56be39838ba7..39c09f4b71d0c 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -1716,14 +1716,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, case TargetOpcode::G_ICMP: { Register LHS = MI.getOperand(2).getReg(); LLT SrcTy = MRI.getType(LHS); - uint64_t SrcSize = SrcTy.getSizeInBits(); CmpInst::Predicate Pred = static_cast(MI.getOperand(1).getPredicate()); - // TODO: Handle the non-equality case for weird sizes. 
- if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred)) - return UnableToLegalize; - LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover) SmallVector LHSPartRegs, LHSLeftoverRegs; if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs, @@ -1775,19 +1770,59 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]); MIRBuilder.buildICmp(Pred, Dst, Or, Zero); } else { - // TODO: Handle non-power-of-two types. - assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?"); - assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?"); - Register LHSL = LHSPartRegs[0]; - Register LHSH = LHSPartRegs[1]; - Register RHSL = RHSPartRegs[0]; - Register RHSH = RHSPartRegs[1]; - MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH); - MachineInstrBuilder CmpHEQ = - MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH); - MachineInstrBuilder CmpLU = MIRBuilder.buildICmp( - ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL); - MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH); + Register CmpIn; + for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) { + Register CmpOut; + CmpInst::Predicate PartPred; + + if (I == E - 1 && LHSLeftoverRegs.empty()) { + PartPred = Pred; + CmpOut = Dst; + } else { + PartPred = ICmpInst::getUnsignedPredicate(Pred); + CmpOut = MRI.createGenericVirtualRegister(ResTy); + } + + if (!CmpIn) { + MIRBuilder.buildICmp(PartPred, CmpOut, LHSPartRegs[I], + RHSPartRegs[I]); + } else { + auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSPartRegs[I], + RHSPartRegs[I]); + auto CmpEq = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, + LHSPartRegs[I], RHSPartRegs[I]); + MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp); + } + + CmpIn = CmpOut; + } + + for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) { + Register CmpOut; + CmpInst::Predicate PartPred; + + if (I == E - 1 && 
LHSLeftoverRegs.empty()) { + PartPred = Pred; + CmpOut = Dst; + } else { + PartPred = ICmpInst::getUnsignedPredicate(Pred); + CmpOut = MRI.createGenericVirtualRegister(ResTy); + } + + if (!CmpIn) { + MIRBuilder.buildICmp(PartPred, CmpOut, LHSLeftoverRegs[I], + RHSLeftoverRegs[I]); + } else { + auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSLeftoverRegs[I], + RHSLeftoverRegs[I]); + auto CmpEq = + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, + LHSLeftoverRegs[I], RHSLeftoverRegs[I]); + MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp); + } + + CmpIn = CmpOut; + } } MI.eraseFromParent(); return Legalized; @@ -5347,9 +5382,9 @@ LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx, auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); - unsigned SrcScalSize = SrcTy.getScalarSizeInBits(); - LLT SrcNarrowTy = - LLT::fixed_vector(NarrowTy.getSizeInBits() / SrcScalSize, SrcScalSize); + unsigned NewElemCount = + NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits(); + LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType()); // Split the Src and Dst Reg into smaller registers SmallVector SrcVRegs, BitcastVRegs; diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 45807a6818ee5..8c1e41ea106ec 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -525,8 +525,7 @@ bool llvm::extractParts(Register Reg, LLT RegTy, LLT MainTy, LLT &LeftoverTy, RegNumElts % LeftoverNumElts == 0 && RegTy.getScalarSizeInBits() == MainTy.getScalarSizeInBits() && LeftoverNumElts > 1) { - LeftoverTy = - LLT::fixed_vector(LeftoverNumElts, RegTy.getScalarSizeInBits()); + LeftoverTy = LLT::fixed_vector(LeftoverNumElts, RegTy.getElementType()); // Unmerge the SrcReg to LeftoverTy vectors SmallVector UnmergeValues; diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 1722bdda99e4a..91aaeea156c4a 100644 --- 
a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -77,30 +77,30 @@ STATISTIC(NumClustered, "Number of load/store pairs clustered"); namespace llvm { -cl::opt ForceTopDown("misched-topdown", cl::Hidden, - cl::desc("Force top-down list scheduling")); -cl::opt ForceBottomUp("misched-bottomup", cl::Hidden, - cl::desc("Force bottom-up list scheduling")); -namespace MISchedPostRASched { -enum Direction { - TopDown, - BottomUp, - Bidirectional, -}; -} // end namespace MISchedPostRASched -cl::opt PostRADirection( +cl::opt PreRADirection( + "misched-prera-direction", cl::Hidden, + cl::desc("Pre reg-alloc list scheduling direction"), + cl::init(MISched::Unspecified), + cl::values( + clEnumValN(MISched::TopDown, "topdown", + "Force top-down pre reg-alloc list scheduling"), + clEnumValN(MISched::BottomUp, "bottomup", + "Force bottom-up pre reg-alloc list scheduling"), + clEnumValN(MISched::Bidirectional, "bidirectional", + "Force bidirectional pre reg-alloc list scheduling"))); + +cl::opt PostRADirection( "misched-postra-direction", cl::Hidden, cl::desc("Post reg-alloc list scheduling direction"), - // Default to top-down because it was implemented first and existing targets - // expect that behavior by default. 
- cl::init(MISchedPostRASched::TopDown), + cl::init(MISched::Unspecified), cl::values( - clEnumValN(MISchedPostRASched::TopDown, "topdown", + clEnumValN(MISched::TopDown, "topdown", "Force top-down post reg-alloc list scheduling"), - clEnumValN(MISchedPostRASched::BottomUp, "bottomup", + clEnumValN(MISched::BottomUp, "bottomup", "Force bottom-up post reg-alloc list scheduling"), - clEnumValN(MISchedPostRASched::Bidirectional, "bidirectional", + clEnumValN(MISched::Bidirectional, "bidirectional", "Force bidirectional post reg-alloc list scheduling"))); + cl::opt DumpCriticalPathLength("misched-dcpl", cl::Hidden, cl::desc("Print critical path length to stdout")); @@ -3307,19 +3307,15 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, RegionPolicy.ShouldTrackLaneMasks = false; } - // Check -misched-topdown/bottomup can force or unforce scheduling direction. - // e.g. -misched-bottomup=false allows scheduling in both directions. - assert((!ForceTopDown || !ForceBottomUp) && - "-misched-topdown incompatible with -misched-bottomup"); - if (ForceBottomUp.getNumOccurrences() > 0) { - RegionPolicy.OnlyBottomUp = ForceBottomUp; - if (RegionPolicy.OnlyBottomUp) - RegionPolicy.OnlyTopDown = false; - } - if (ForceTopDown.getNumOccurrences() > 0) { - RegionPolicy.OnlyTopDown = ForceTopDown; - if (RegionPolicy.OnlyTopDown) - RegionPolicy.OnlyBottomUp = false; + if (PreRADirection == MISched::TopDown) { + RegionPolicy.OnlyTopDown = true; + RegionPolicy.OnlyBottomUp = false; + } else if (PreRADirection == MISched::BottomUp) { + RegionPolicy.OnlyTopDown = false; + RegionPolicy.OnlyBottomUp = true; + } else if (PreRADirection == MISched::Bidirectional) { + RegionPolicy.OnlyBottomUp = false; + RegionPolicy.OnlyTopDown = false; } } @@ -3911,17 +3907,15 @@ void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, MF.getSubtarget().overridePostRASchedPolicy(RegionPolicy, NumRegionInstrs); // After subtarget overrides, apply command line options. 
- if (PostRADirection.getNumOccurrences() > 0) { - if (PostRADirection == MISchedPostRASched::TopDown) { - RegionPolicy.OnlyTopDown = true; - RegionPolicy.OnlyBottomUp = false; - } else if (PostRADirection == MISchedPostRASched::BottomUp) { - RegionPolicy.OnlyTopDown = false; - RegionPolicy.OnlyBottomUp = true; - } else if (PostRADirection == MISchedPostRASched::Bidirectional) { - RegionPolicy.OnlyBottomUp = false; - RegionPolicy.OnlyTopDown = false; - } + if (PostRADirection == MISched::TopDown) { + RegionPolicy.OnlyTopDown = true; + RegionPolicy.OnlyBottomUp = false; + } else if (PostRADirection == MISched::BottomUp) { + RegionPolicy.OnlyTopDown = false; + RegionPolicy.OnlyBottomUp = true; + } else if (PostRADirection == MISched::Bidirectional) { + RegionPolicy.OnlyBottomUp = false; + RegionPolicy.OnlyTopDown = false; } } @@ -4368,10 +4362,9 @@ class InstructionShuffler : public MachineSchedStrategy { } // end anonymous namespace static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { - bool Alternate = !ForceTopDown && !ForceBottomUp; - bool TopDown = !ForceBottomUp; - assert((TopDown || !ForceTopDown) && - "-misched-topdown incompatible with -misched-bottomup"); + bool Alternate = + PreRADirection != MISched::TopDown && PreRADirection != MISched::BottomUp; + bool TopDown = PreRADirection != MISched::BottomUp; return new ScheduleDAGMILive( C, std::make_unique(Alternate, TopDown)); } diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 2ec5af2d864f7..98321a395a506 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectOptimize.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" @@ -218,7 +219,7 @@ class SelectOptimizeImpl { private: // Select 
groups consist of consecutive select-like instructions with the same // condition. Between select-likes could be any number of auxiliary - // instructions related to the condition like not, zext + // instructions related to the condition like not, zext, ashr/lshr struct SelectGroup { Value *Condition; SmallVector Selects; @@ -496,7 +497,13 @@ static Value *getTrueOrFalseValue( auto *CBO = BO->clone(); auto CondIdx = SI.getConditionOpIndex(); - CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), 1)); + auto *AuxI = cast(CBO->getOperand(CondIdx)); + if (isa(AuxI) || isa(AuxI)) { + CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), 1)); + } else { + assert(isa(AuxI) && "Unexpected opcode"); + CBO->setOperand(CondIdx, ConstantInt::get(CBO->getType(), -1)); + } unsigned OtherIdx = 1 - CondIdx; if (auto *IV = dyn_cast(CBO->getOperand(OtherIdx))) { @@ -755,6 +762,9 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, // zero or some constant value on True/False branch, such as: // * ZExt(1bit) // * Not(1bit) + // * A(L)Shr(Val), ValBitSize - 1, where there is a condition like `Val <= 0` + // earlier in the BB. For conditions that check the sign of the Val compiler + // may generate shifts instead of ZExt/SExt. struct SelectLikeInfo { Value *Cond; bool IsAuxiliary; @@ -763,11 +773,19 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, }; DenseMap SelectInfo; + // Keeps visited comparisons to help identify AShr/LShr variants of auxiliary + // instructions. + SmallSetVector SeenCmp; // Check if the instruction is SelectLike or might be part of SelectLike // expression, put information into SelectInfo and return the iterator to the // inserted position. 
- auto ProcessSelectInfo = [&SelectInfo](Instruction *I) { + auto ProcessSelectInfo = [&SelectInfo, &SeenCmp](Instruction *I) { + if (auto *Cmp = dyn_cast(I)) { + SeenCmp.insert(Cmp); + return SelectInfo.end(); + } + Value *Cond; if (match(I, m_OneUse(m_ZExt(m_Value(Cond)))) && Cond->getType()->isIntegerTy(1)) { @@ -784,35 +802,59 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, bool Inverted = match(Cond, m_Not(m_Value(Cond))); return SelectInfo.insert({I, {Cond, false, Inverted, 0}}).first; } + Value *Val; + ConstantInt *Shift; + if (match(I, m_Shr(m_Value(Val), m_ConstantInt(Shift))) && + I->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1) { + for (auto *CmpI : SeenCmp) { + auto Pred = CmpI->getPredicate(); + if (Val != CmpI->getOperand(0)) + continue; + if ((Pred == CmpInst::ICMP_SGT && + match(CmpI->getOperand(1), m_ConstantInt<-1>())) || + (Pred == CmpInst::ICMP_SGE && + match(CmpI->getOperand(1), m_Zero())) || + (Pred == CmpInst::ICMP_SLT && + match(CmpI->getOperand(1), m_Zero())) || + (Pred == CmpInst::ICMP_SLE && + match(CmpI->getOperand(1), m_ConstantInt<-1>()))) { + bool Inverted = + Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE; + return SelectInfo.insert({I, {CmpI, true, Inverted, 0}}).first; + } + } + return SelectInfo.end(); + } - // An Or(zext(i1 X), Y) can also be treated like a select, with condition X + // An BinOp(Aux(X), Y) can also be treated like a select, with condition X // and values Y|1 and Y. 
- if (auto *BO = dyn_cast(I)) { - switch (I->getOpcode()) { - case Instruction::Add: - case Instruction::Sub: { - Value *X; - if (!((PatternMatch::match(I->getOperand(0), - m_OneUse(m_ZExt(m_Value(X)))) || - PatternMatch::match(I->getOperand(1), - m_OneUse(m_ZExt(m_Value(X))))) && - X->getType()->isIntegerTy(1))) - return SelectInfo.end(); - break; - } - case Instruction::Or: - if (BO->getType()->isIntegerTy(1) || BO->getOpcode() != Instruction::Or) - return SelectInfo.end(); - break; - } + // `Aux` can be either `ZExt(1bit)` or `XShr(Val), ValBitSize - 1` + // `BinOp` can be Add, Sub, Or + Value *X; + auto MatchZExtPattern = m_c_BinOp(m_Value(), m_OneUse(m_ZExt(m_Value(X)))); + auto MatchShiftPattern = + m_c_BinOp(m_Value(), m_OneUse(m_Shr(m_Value(X), m_ConstantInt(Shift)))); + + // This check is unnecessary, but it prevents costly access to the + // SelectInfo map. + if ((match(I, MatchZExtPattern) && X->getType()->isIntegerTy(1)) || + (match(I, MatchShiftPattern) && + X->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1)) { + if (I->getOpcode() != Instruction::Add && + I->getOpcode() != Instruction::Sub && + I->getOpcode() != Instruction::Or) + return SelectInfo.end(); + + if (I->getOpcode() == Instruction::Or && I->getType()->isIntegerTy(1)) + return SelectInfo.end(); // Iterate through operands and find dependant on recognised sign // extending auxiliary select-like instructions. The operand index does // not matter for Add and Or. However, for Sub, we can only safely // transform when the operand is second. - unsigned Idx = BO->getOpcode() == Instruction::Sub ? 1 : 0; + unsigned Idx = I->getOpcode() == Instruction::Sub ? 
1 : 0; for (; Idx < 2; Idx++) { - auto *Op = BO->getOperand(Idx); + auto *Op = I->getOperand(Idx); auto It = SelectInfo.find(Op); if (It != SelectInfo.end() && It->second.IsAuxiliary) { Cond = It->second.Cond; diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp index 0cddf59d0ca2a..2fd1dd5f84a91 100644 --- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp +++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp @@ -297,9 +297,6 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { HighPressureSets[i] = ((float)MaxPressure[i] > ((float)Limit * RPThreshold)); } - - assert((!ForceTopDown || !ForceBottomUp) && - "-misched-topdown incompatible with -misched-bottomup"); } VLIWResourceModel *ConvergingVLIWScheduler::createVLIWResourceModel( @@ -954,7 +951,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { return nullptr; } SUnit *SU; - if (ForceTopDown) { + if (PreRADirection == MISched::TopDown) { SU = Top.pickOnlyChoice(); if (!SU) { SchedCandidate TopCand; @@ -965,7 +962,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { SU = TopCand.SU; } IsTopNode = true; - } else if (ForceBottomUp) { + } else if (PreRADirection == MISched::BottomUp) { SU = Bot.pickOnlyChoice(); if (!SU) { SchedCandidate BotCand; diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp index b7a980cb7f068..fa426d41854bb 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp @@ -1813,19 +1813,19 @@ DwarfUnit *CompileUnit::OutputUnitVariantPtr::operator->() { } bool CompileUnit::OutputUnitVariantPtr::isCompileUnit() { - return Ptr.is(); + return isa(Ptr); } bool CompileUnit::OutputUnitVariantPtr::isTypeUnit() { - return Ptr.is(); + return isa(Ptr); } CompileUnit *CompileUnit::OutputUnitVariantPtr::getAsCompileUnit() { - return Ptr.get(); + return cast(Ptr); } TypeUnit 
*CompileUnit::OutputUnitVariantPtr::getAsTypeUnit() { - return Ptr.get(); + return cast(Ptr); } bool CompileUnit::resolveDependenciesAndMarkLiveness( diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 21004e6a15d49..764e2ca8fe4f4 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6821,8 +6821,11 @@ static Expected createOutlinedFunction( OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func); // Insert target deinit call in the device compilation pass. - llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = - CBFunc(Builder.saveIP(), Builder.saveIP()); + BasicBlock *OutlinedBodyBB = + splitBB(Builder, /*CreateBranch=*/true, "outlined.body"); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc( + Builder.saveIP(), + OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin())); if (!AfterIP) return AfterIP.takeError(); Builder.restoreIP(*AfterIP); diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index d15a9a8a36c5a..153c1070a68c8 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -162,8 +162,8 @@ class AsmParser : public MCAsmParser { }; CppHashInfoTy CppHashInfo; - /// The filename from the first cpp hash file line comment, if any. - StringRef FirstCppHashFilename; + /// Have we seen any file line comment. + bool HadCppHashFilename = false; /// List of forward directional labels for diagnosis at the end. SmallVector, 4> DirLabels; @@ -952,12 +952,6 @@ bool AsmParser::enabledGenDwarfForAssembly() { // the assembler source was produced with debug info already) then emit one // describing the assembler source file itself. if (getContext().getGenDwarfFileNumber() == 0) { - // Use the first #line directive for this, if any. It's preprocessed, so - // there is no checksum, and of course no source directive. 
- if (!FirstCppHashFilename.empty()) - getContext().setMCLineTableRootFile( - /*CUID=*/0, getContext().getCompilationDir(), FirstCppHashFilename, - /*Cksum=*/std::nullopt, /*Source=*/std::nullopt); const MCDwarfFile &RootFile = getContext().getMCDwarfLineTable(/*CUID=*/0).getRootFile(); getContext().setGenDwarfFileNumber(getStreamer().emitDwarfFileDirective( @@ -2440,8 +2434,20 @@ bool AsmParser::parseCppHashLineFilenameComment(SMLoc L, bool SaveLocInfo) { CppHashInfo.Filename = Filename; CppHashInfo.LineNumber = LineNumber; CppHashInfo.Buf = CurBuffer; - if (FirstCppHashFilename.empty()) - FirstCppHashFilename = Filename; + if (!HadCppHashFilename) { + HadCppHashFilename = true; + // If we haven't encountered any .file directives, then the first #line + // directive describes the "root" file and directory of the compilation + // unit. + if (getContext().getGenDwarfForAssembly() && + getContext().getGenDwarfFileNumber() == 0) { + // It's preprocessed, so there is no checksum, and of course no source + // directive. 
+ getContext().setMCLineTableRootFile( + /*CUID=*/0, getContext().getCompilationDir(), Filename, + /*Cksum=*/std::nullopt, /*Source=*/std::nullopt); + } + } return false; } diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index c4bd5e2472351..d4e8261ae4feb 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -25,6 +25,7 @@ // These two headers must be included last, and make sure shlobj is required // after Windows.h to make sure it picks up our definition of _WIN32_WINNT #include "llvm/Support/Windows/WindowsSupport.h" +#include #include #include @@ -1387,14 +1388,33 @@ std::error_code remove_directories(const Twine &path, bool IgnoreErrors) { Path16.push_back(0); Path16.push_back(0); - SHFILEOPSTRUCTW shfos = {}; - shfos.wFunc = FO_DELETE; - shfos.pFrom = Path16.data(); - shfos.fFlags = FOF_NO_UI; - - int result = ::SHFileOperationW(&shfos); - if (result != 0 && !IgnoreErrors) - return mapWindowsError(result); + HRESULT HR; + do { + HR = + CoInitializeEx(NULL, COINIT_APARTMENTTHREADED | COINIT_DISABLE_OLE1DDE); + if (FAILED(HR)) + break; + auto Uninitialize = make_scope_exit([] { CoUninitialize(); }); + CComPtr FileOp; + HR = FileOp.CoCreateInstance(CLSID_FileOperation); + if (FAILED(HR)) + break; + HR = FileOp->SetOperationFlags(FOF_NO_UI | FOFX_NOCOPYHOOKS); + if (FAILED(HR)) + break; + PIDLIST_ABSOLUTE PIDL = ILCreateFromPathW(Path16.data()); + auto FreePIDL = make_scope_exit([&PIDL] { ILFree(PIDL); }); + CComPtr ShItem; + HR = SHCreateItemFromIDList(PIDL, IID_PPV_ARGS(&ShItem)); + if (FAILED(HR)) + break; + HR = FileOp->DeleteItem(ShItem, NULL); + if (FAILED(HR)) + break; + HR = FileOp->PerformOperations(); + } while (false); + if (FAILED(HR) && !IgnoreErrors) + return mapWindowsError(HRESULT_CODE(HR)); return std::error_code(); } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 055cb3cefcedf..0ce4b8971625c 
100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -67,6 +67,10 @@ class AArch64ExpandPseudo : public MachineFunctionPass { TargetRegisterClass ContiguousClass, TargetRegisterClass StridedClass, unsigned ContiguousOpc, unsigned StridedOpc); + bool expandFormTuplePseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned Size); bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); @@ -1142,6 +1146,32 @@ bool AArch64ExpandPseudo::expandMultiVecPseudo( return true; } +bool AArch64ExpandPseudo::expandFormTuplePseudo( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, unsigned Size) { + assert(Size == 2 || Size == 4 && "Invalid Tuple Size"); + MachineInstr &MI = *MBBI; + Register ReturnTuple = MI.getOperand(0).getReg(); + + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + for (unsigned I = 0; I < Size; ++I) { + Register FormTupleOpReg = MI.getOperand(I + 1).getReg(); + Register ReturnTupleSubReg = + TRI->getSubReg(ReturnTuple, AArch64::zsub0 + I); + // Add copies to ensure the subregisters remain in the correct order + // for any contigious operation they are used by. + if (FormTupleOpReg != ReturnTupleSubReg) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORR_ZZZ)) + .addReg(ReturnTupleSubReg, RegState::Define) + .addReg(FormTupleOpReg) + .addReg(FormTupleOpReg); + } + + MI.eraseFromParent(); + return true; +} + /// If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. 
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, @@ -1724,6 +1754,10 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandMultiVecPseudo( MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED); + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO: + return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2); + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO: + return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4); } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cee609ed1e2f6..41e0214dab6c7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8581,6 +8581,56 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { return ZExtBool; } +// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the +// input operands are copy nodes where the source register is in a +// StridedOrContiguous class. For example: +// +// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO .. +// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous +// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous +// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO .. 
+// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous +// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous +// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr +// +bool shouldUseFormStridedPseudo(MachineInstr &MI) { + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + const TargetRegisterClass *RegClass = nullptr; + switch (MI.getOpcode()) { + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO: + RegClass = &AArch64::ZPR2StridedOrContiguousRegClass; + break; + case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO: + RegClass = &AArch64::ZPR4StridedOrContiguousRegClass; + break; + default: + llvm_unreachable("Unexpected opcode."); + } + + MCRegister SubReg = MCRegister::NoRegister; + for (unsigned I = 1; I < MI.getNumOperands(); ++I) { + MachineOperand &MO = MI.getOperand(I); + assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE"); + + MachineOperand *Def = MRI.getOneDef(MO.getReg()); + if (!Def || !Def->getParent()->isCopy()) + return false; + + const MachineOperand &CopySrc = Def->getParent()->getOperand(1); + unsigned OpSubReg = CopySrc.getSubReg(); + if (SubReg == MCRegister::NoRegister) + SubReg = OpSubReg; + + MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); + if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || + MRI.getRegClass(CopySrcOp->getReg()) != RegClass) + return false; + } + + return true; +} + void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { // Live-in physreg copies that are glued to SMSTART are applied as @@ -8606,6 +8656,27 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, } } + if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || + MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) { + // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies + // from a StridedOrContiguous class, fall back on REG_SEQUENCE node. 
+ if (shouldUseFormStridedPseudo(MI)) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(TargetOpcode::REG_SEQUENCE), + MI.getOperand(0).getReg()); + + for (unsigned I = 1; I < MI.getNumOperands(); ++I) { + MIB.add(MI.getOperand(I)); + MIB.addImm(AArch64::zsub0 + (I - 1)); + } + + MI.eraseFromParent(); + return; + } + // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that // have nothing to do with VG, were it not that they are used to materialise a // frame-address. If they contain a frame-index to a scalable vector, this diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 133cc1344b98f..85a7663993a04 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1081,6 +1081,58 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } +// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation +// where a consecutive multi-vector tuple is constructed from the same indices +// of multiple strided loads. This may still result in unnecessary copies +// between the loads and the tuple. Here we try to return a hint to assign the +// contiguous ZPRMulReg starting at the same register as the first operand of +// the pseudo, which should be a subregister of the first strided load. +// +// For example, if the first strided load has been assigned $z16_z20_z24_z28 +// and the operands of the pseudo are each accessing subregister zsub2, we +// should look through through Order to find a contiguous register which +// begins with $z24 (i.e. $z24_z25_z26_z27). 
+// +bool AArch64RegisterInfo::getRegAllocationHints( + Register VirtReg, ArrayRef Order, + SmallVectorImpl &Hints, const MachineFunction &MF, + const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { + if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO && + MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); + + unsigned FirstOpSubReg = MI.getOperand(1).getSubReg(); + switch (FirstOpSubReg) { + case AArch64::zsub0: + case AArch64::zsub1: + case AArch64::zsub2: + case AArch64::zsub3: + break; + default: + continue; + } + + // Look up the physical register mapped to the first operand of the pseudo. + Register FirstOpVirtReg = MI.getOperand(1).getReg(); + if (!VRM->hasPhys(FirstOpVirtReg)) + continue; + + MCRegister TupleStartReg = + getSubReg(VRM->getPhys(FirstOpVirtReg), FirstOpSubReg); + for (unsigned I = 0; I < Order.size(); ++I) + if (MCRegister R = getSubReg(Order[I], AArch64::zsub0)) + if (R == TupleStartReg) + Hints.push_back(Order[I]); + } + + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, + VRM); +} + unsigned AArch64RegisterInfo::getLocalAddressRegister( const MachineFunction &MF) const { const auto &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 5c8a5e029584f..11da624af4881 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -134,6 +134,11 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; + bool getRegAllocationHints(Register VirtReg, ArrayRef Order, + SmallVectorImpl &Hints, + const MachineFunction &MF, const VirtRegMap *VRM, + const 
LiveRegMatrix *Matrix) const override; + unsigned getLocalAddressRegister(const MachineFunction &MF) const; bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index a67093b1a58c3..b62ffcbebc652 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -35,6 +35,30 @@ def tileslicerange0s4 : ComplexPattern", []>; let WantsRoot = true in def am_sme_indexed_b4 : ComplexPattern">; +// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to +// improve register allocation for intrinsics which use strided and contiguous +// multi-vector registers, avoiding unnecessary copies. +// If the operands of the pseudo are copies where the source register is in +// the StridedOrContiguous class, the pseudo is used to provide a hint to the +// register allocator suggesting a contigious multi-vector register which +// matches the subregister sequence used by the operands. +// If the operands do not match this pattern, the pseudos are expanded +// to a REG_SEQUENCE using the post-isel hook. 
+ +def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO : + Pseudo<(outs ZPR2Mul2:$tup), + (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{ + let hasSideEffects = 0; + let hasPostISelHook = 1; +} + +def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO : + Pseudo<(outs ZPR4Mul4:$tup), + (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{ + let hasSideEffects = 0; + let hasPostISelHook = 1; +} + def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; @@ -173,14 +197,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)), (!cast(name # _PSEUDO) $base, $offset, - (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>; + (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>; class SME2_ZA_TwoOp_VG4_Multi_Index_Pat : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)), (!cast(name # _PSEUDO) $base, $offset, - (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3), + (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), zpr_ty:$Zm, imm_ty:$i)>; class SME2_Sat_Shift_VG2_Pat diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 75e20c7930168..e02ef56f23449 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1195,18 +1195,37 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, Value *Den, unsigned AtLeast, bool IsSigned) const { - unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); - if (LHSSignBits < AtLeast) - 
return -1; + assert(Num->getType()->getScalarSizeInBits() == + Den->getType()->getScalarSizeInBits()); + unsigned SSBits = Num->getType()->getScalarSizeInBits(); + if (IsSigned) { + unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); + if (RHSSignBits < AtLeast) + return -1; + + unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); + if (LHSSignBits < AtLeast) + return -1; + + unsigned SignBits = std::min(LHSSignBits, RHSSignBits); + unsigned DivBits = SSBits - SignBits + 1; + return DivBits; // a SignBit needs to be reserved for shrinking + } + + // All bits are used for unsigned division for Num or Den in range + // (SignedMax, UnsignedMax]. + KnownBits Known = computeKnownBits(Den, DL, 0, AC, &I); + if (Known.isNegative() || !Known.isNonNegative()) + return SSBits; + unsigned RHSSignBits = Known.countMinLeadingZeros(); - unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); - if (RHSSignBits < AtLeast) - return -1; + Known = computeKnownBits(Num, DL, 0, AC, &I); + if (Known.isNegative() || !Known.isNonNegative()) + return SSBits; + unsigned LHSSignBits = Known.countMinLeadingZeros(); unsigned SignBits = std::min(LHSSignBits, RHSSignBits); - unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; - if (IsSigned) - ++DivBits; + unsigned DivBits = SSBits - SignBits; return DivBits; } @@ -1220,7 +1239,7 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder, // If Num bits <= 24, assume 0 signbits. unsigned AtLeast = (SSBits <= 24) ? 
0 : (SSBits - 24 + IsSigned); int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned); - if (DivBits == -1) + if (DivBits == -1 || DivBits > 24) return nullptr; return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 2e66f7525b9cc..9836e10c36bc5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -154,8 +154,8 @@ static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) break; } - - return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); + return std::pair(TypeIdx, + LLT::fixed_vector(NewNumElts, Ty.getElementType())); }; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3f21d5a00ab7d..7256eec89008a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -879,6 +879,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return onlyAllocateSGPRs; if (FilterName == "vgpr") return onlyAllocateVGPRs; + if (FilterName == "wwm") + return onlyAllocateWWMRegs; return nullptr; }); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 296c32fa4e0d0..2f5a99e5de5e3 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1817,6 +1817,8 @@ void SIRegisterInfo::buildSpillLoadStore( .addReg(SubReg, getKillRegState(IsKill)); if (NeedSuperRegDef) AccRead.addReg(ValueReg, RegState::ImplicitDefine); + if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) + AccRead.addReg(ValueReg, RegState::Implicit); AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); } SubReg = TmpIntermediateVGPR; diff --git 
a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 94dce739b08b5..2b508c40b81c0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1296,8 +1296,7 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = HasBitOp3Insts in { let isReMaterializable = 1 in { defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", - VOP3_BITOP3_Profile>, - VOP3_OPSEL>>; + VOP3_BITOP3_Profile, VOP3_OPSEL>>; defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32", VOP3_BITOP3_Profile, VOP3_REGULAR>>; } diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp index 7f0f737faccd0..ce069ced66579 100644 --- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp +++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp @@ -19,8 +19,8 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCDecoderOps.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Endian.h" @@ -83,6 +83,12 @@ static DecodeStatus DecodeXR32RegisterClass(MCInst &Inst, uint64_t RegNo, return DecodeRegisterClass(Inst, RegNo, Address, Decoder); } +static DecodeStatus DecodeXR32RegisterClass(MCInst &Inst, APInt RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClass(Inst, RegNo.getZExtValue(), Address, Decoder); +} + static DecodeStatus DecodeXR16RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { diff --git a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp index fcad5f7460bb2..53c144c8fa79a 100644 --- a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp +++ b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp @@ -708,6 +708,20 @@ bool M68kDAGToDAGISel::SelectARIPD(SDNode *Parent, SDValue N, SDValue &Base) { return 
false; } +[[maybe_unused]] static bool allowARIDWithDisp(SDNode *Parent) { + if (!Parent) + return false; + switch (Parent->getOpcode()) { + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + return true; + default: + return false; + } +} + bool M68kDAGToDAGISel::SelectARID(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base) { LLVM_DEBUG(dbgs() << "Selecting AddrType::ARID: "); @@ -740,7 +754,8 @@ bool M68kDAGToDAGISel::SelectARID(SDNode *Parent, SDValue N, SDValue &Disp, Base = AM.BaseReg; if (getSymbolicDisplacement(AM, SDLoc(N), Disp)) { - assert(!AM.Disp && "Should not be any displacement"); + assert((!AM.Disp || allowARIDWithDisp(Parent)) && + "Should not be any displacement"); LLVM_DEBUG(dbgs() << "SUCCESS, matched Symbol\n"); return true; } @@ -780,6 +795,7 @@ static bool AllowARIIWithZeroDisp(SDNode *Parent) { case ISD::STORE: case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: + case ISD::ATOMIC_CMP_SWAP: return true; default: return false; diff --git a/llvm/lib/Target/M68k/M68kInstrAtomics.td b/llvm/lib/Target/M68k/M68kInstrAtomics.td index 9203a3ef4ed09..867afbefe68fe 100644 --- a/llvm/lib/Target/M68k/M68kInstrAtomics.td +++ b/llvm/lib/Target/M68k/M68kInstrAtomics.td @@ -13,6 +13,15 @@ foreach size = [8, 16, 32] in { def : Pat<(!cast("atomic_load_"#size) MxCP_ARII:$ptr), (!cast("MOV"#size#"df") !cast("MxARII"#size):$ptr)>; + def : Pat<(!cast("atomic_load_"#size) MxCP_ARID:$ptr), + (!cast("MOV"#size#"dp") !cast("MxARID"#size):$ptr)>; + + def : Pat<(!cast("atomic_load_"#size) MxCP_PCD:$ptr), + (!cast("MOV"#size#"dq") !cast("MxPCD"#size):$ptr)>; + + def : Pat<(!cast("atomic_load_"#size) MxCP_PCI:$ptr), + (!cast("MOV"#size#"dk") !cast("MxPCI"#size):$ptr)>; + def : Pat<(!cast("atomic_store_"#size) !cast("MxDRD"#size):$val, MxCP_ARI:$ptr), (!cast("MOV"#size#"jd") !cast("MxARI"#size):$ptr, !cast("MxDRD"#size):$val)>; @@ -20,10 +29,22 @@ foreach size = [8, 16, 32] in { def : Pat<(!cast("atomic_store_"#size) 
!cast("MxDRD"#size):$val, MxCP_ARII:$ptr), (!cast("MOV"#size#"fd") !cast("MxARII"#size):$ptr, !cast("MxDRD"#size):$val)>; + + def : Pat<(!cast("atomic_store_"#size) !cast("MxDRD"#size):$val, MxCP_ARID:$ptr), + (!cast("MOV"#size#"pd") !cast("MxARID"#size):$ptr, + !cast("MxDRD"#size):$val)>; + + def : Pat<(!cast("atomic_store_"#size) !cast("MxDRD"#size):$val, MxCP_PCD:$ptr), + (!cast("MOV"#size#"qd") !cast("MxPCD"#size):$ptr, + !cast("MxDRD"#size):$val)>; + + def : Pat<(!cast("atomic_store_"#size) !cast("MxDRD"#size):$val, MxCP_PCI:$ptr), + (!cast("MOV"#size#"kd") !cast("MxPCI"#size):$ptr, + !cast("MxDRD"#size):$val)>; } let Predicates = [AtLeastM68020] in { -class MxCASOp size_encoding, MxType type> +class MxCASARIOp size_encoding, MxType type> : MxInst<(outs type.ROp:$out), (ins type.ROp:$dc, type.ROp:$du, !cast("MxARI"#type.Size):$mem), "cas."#type.Prefix#" $dc, $du, $mem"> { @@ -36,17 +57,69 @@ class MxCASOp size_encoding, MxType type> let mayStore = 1; } -def CAS8 : MxCASOp<0x1, MxType8d>; -def CAS16 : MxCASOp<0x2, MxType16d>; -def CAS32 : MxCASOp<0x3, MxType32d>; +def CASARI8 : MxCASARIOp<0x1, MxType8d>; +def CASARI16 : MxCASARIOp<0x2, MxType16d>; +def CASARI32 : MxCASARIOp<0x3, MxType32d>; + +class MxCASARIDOp size_encoding, MxType type> + : MxInst<(outs type.ROp:$out), + (ins type.ROp:$dc, type.ROp:$du, !cast("MxARID"#type.Size):$mem), + "cas."#type.Prefix#" $dc, $du, $mem"> { + let Inst = (ascend + (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_p<"mem">.EA), + (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)) + ); + let Constraints = "$out = $dc"; + let mayLoad = 1; + let mayStore = 1; +} + +def CASARID8 : MxCASARIDOp<0x1, MxType8d>; +def CASARID16 : MxCASARIDOp<0x2, MxType16d>; +def CASARID32 : MxCASARIDOp<0x3, MxType32d>; + +class MxCASARIIOp size_encoding, MxType type> + : MxInst<(outs type.ROp:$out), + (ins type.ROp:$dc, type.ROp:$du, !cast("MxARII"#type.Size):$mem), + "cas."#type.Prefix#" $dc, $du, $mem"> { + let Inst = (ascend + 
(descend 0b00001, size_encoding, 0b011, MxEncAddrMode_f<"mem">.EA), + (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)) + ); + let Constraints = "$out = $dc"; + let mayLoad = 1; + let mayStore = 1; +} + +def CASARII8 : MxCASARIIOp<0x1, MxType8d>; +def CASARII16 : MxCASARIIOp<0x2, MxType16d>; +def CASARII32 : MxCASARIIOp<0x3, MxType32d>; + +class MxCASALOp size_encoding, MxType type> + : MxInst<(outs type.ROp:$out), + (ins type.ROp:$dc, type.ROp:$du, !cast("MxAL"#type.Size):$mem), + "cas."#type.Prefix#" $dc, $du, $mem"> { + let Inst = (ascend + (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_abs<"mem">.EA), + (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)) + ); + let Constraints = "$out = $dc"; + let mayLoad = 1; + let mayStore = 1; +} +def CASAL8 : MxCASALOp<0x1, MxType8d>; +def CASAL16 : MxCASALOp<0x2, MxType16d>; +def CASAL32 : MxCASALOp<0x3, MxType32d>; +foreach mode = ["ARI", "ARII", "ARID", "AL"] in { foreach size = [8, 16, 32] in { - def : Pat<(!cast("atomic_cmp_swap_i"#size) MxCP_ARI:$ptr, + def : Pat<(!cast("atomic_cmp_swap_i"#size) !cast("MxCP_"#mode):$ptr, !cast("MxDRD"#size):$cmp, !cast("MxDRD"#size):$new), - (!cast("CAS"#size) !cast("MxDRD"#size):$cmp, + (!cast("CAS"#mode#size) !cast("MxDRD"#size):$cmp, !cast("MxDRD"#size):$new, - !cast("MxARI"#size):$ptr)>; -} + !cast("Mx"#mode#size):$ptr)>; +} // size +} // addr mode } // let Predicates = [AtLeastM68020] diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2af983fc7b04e..69bc2cce6c2c7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -7244,6 +7244,9 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( MVT LocVT = VA.getLocVT(); MVT ValVT = VA.getValVT(); ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags; + + EVT ArgVT = Ins[VA.getValNo()].ArgVT; + bool ArgSignExt = Ins[VA.getValNo()].Flags.isSExt(); // For compatibility with the AIX XL 
compiler, the float args in the // parameter save area are initialized even if the argument is available // in register. The caller is required to initialize both the register @@ -7291,7 +7294,24 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue ArgValue = DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo()); - InVals.push_back(ArgValue); + + // While the ABI specifies the argument type is (sign or zero) extended + // out to register width, not all code is compliant. We truncate and + // re-extend to be more forgiving of these callers when the argument type + // is smaller than register width. + if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() && + ValVT.isInteger() && + ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) { + SDValue ArgValueTrunc = DAG.getNode( + ISD::TRUNCATE, dl, ArgVT.getSimpleVT() == MVT::i1 ? MVT::i8 : ArgVT, + ArgValue); + SDValue ArgValueExt = + ArgSignExt ? DAG.getSExtOrTrunc(ArgValueTrunc, dl, ValVT) + : DAG.getZExtOrTrunc(ArgValueTrunc, dl, ValVT); + InVals.push_back(ArgValueExt); + } else { + InVals.push_back(ArgValue); + } }; // Vector arguments to VaArg functions are passed both on the stack, and diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 4d563046923a5..b8ca32434aa43 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -688,6 +688,8 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, "Qualcomm uC Scaled Load Store custom opcode table"); TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcia, DecoderTableXqcia32, "Qualcomm uC Arithmetic custom opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcics, DecoderTableXqcics32, + "Qualcomm uC Conditional Select custom opcode table"); TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table"); return 
MCDisassembler::Fail; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 52268c3fa62cc..d1922eb026279 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1367,6 +1367,14 @@ def HasVendorXqcia AssemblerPredicate<(all_of FeatureVendorXqcia), "'Xqcia' (Qualcomm uC Arithmetic Extension)">; +def FeatureVendorXqcics + : RISCVExperimentalExtension<"xqcics", 0, 2, + "'Xqcics' (Qualcomm uC Conditional Select Extension)">; +def HasVendorXqcics + : Predicate<"Subtarget->hasVendorXqcics()">, + AssemblerPredicate<(all_of FeatureVendorXqcics), + "'Xqcics' (Qualcomm uC Conditional Select Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 7cb419b610e60..f9c17cf5eed5d 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -58,15 +58,6 @@ def GIAddrRegImm : def GIVLOp : GIComplexOperandMatcher, GIComplexPatternEquiv; -// Convert from i32 immediate to i64 target immediate to make SelectionDAG type -// checking happy so we can use ADDIW which expects an XLen immediate. 
-def as_i64imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); -}]>; - -def gi_as_i64imm : GICustomOperandRenderer<"renderImm">, - GISDNodeXFormEquiv; - def gi_trailing_zero : GICustomOperandRenderer<"renderTrailingZeros">, GISDNodeXFormEquiv; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index c5432619a3646..c3922e38729dc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1026,13 +1026,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { unsigned ShAmt = N1C->getZExtValue(); uint64_t Mask = N0.getConstantOperandVal(1); - // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) where C2 has - // 32 leading zeros and C3 trailing zeros. if (ShAmt <= 32 && isShiftedMask_64(Mask)) { unsigned XLen = Subtarget->getXLen(); unsigned LeadingZeros = XLen - llvm::bit_width(Mask); unsigned TrailingZeros = llvm::countr_zero(Mask); if (TrailingZeros > 0 && LeadingZeros == 32) { + // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) + // where C2 has 32 leading zeros and C3 trailing zeros. SDNode *SRLIW = CurDAG->getMachineNode( RISCV::SRLIW, DL, VT, N0->getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); @@ -1042,6 +1042,25 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, SLLI); return; } + if (TrailingZeros == 0 && LeadingZeros > ShAmt && + XLen - LeadingZeros > 11 && LeadingZeros != 32) { + // Optimize (shl (and X, C2), C) -> (srli (slli X, C4), C4-C) + // where C2 has C4 leading zeros and no trailing zeros. + // This is profitable if the "and" was to be lowered to + // (srli (slli X, C4), C4) and not (andi X, C2). 
+ // For "LeadingZeros == 32": + // - with Zba it's just (slli.uw X, C) + // - without Zba a tablegen pattern applies the very same + // transform as we would have done here + SDNode *SLLI = CurDAG->getMachineNode( + RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getTargetConstant(LeadingZeros, DL, VT)); + SDNode *SRLI = CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(LeadingZeros - ShAmt, DL, VT)); + ReplaceNode(Node, SRLI); + return; + } } break; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c683857363720..096b9fa79173f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5197,6 +5197,67 @@ static bool isCompressMask(ArrayRef Mask) { return true; } +/// Given a shuffle where the indices are disjoint between the two sources, +/// e.g.: +/// +/// t2:v4i8 = vector_shuffle t0:v4i8, t1:v4i8, <2, 7, 1, 4> +/// +/// Merge the two sources into one and do a single source shuffle: +/// +/// t2:v4i8 = vselect t1:v4i8, t0:v4i8, <0, 1, 0, 1> +/// t3:v4i8 = vector_shuffle t2:v4i8, undef, <2, 3, 1, 0> +/// +/// A vselect will either be merged into a masked instruction or be lowered as a +/// vmerge.vvm, which is cheaper than a vrgather.vv. +static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + MVT VT = SVN->getSimpleValueType(0); + MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(SVN); + + const ArrayRef Mask = SVN->getMask(); + + // Work out which source each lane will come from. + SmallVector Srcs(Mask.size(), -1); + + for (int Idx : Mask) { + if (Idx == -1) + continue; + unsigned SrcIdx = Idx % Mask.size(); + int Src = (uint32_t)Idx < Mask.size() ? 0 : 1; + if (Srcs[SrcIdx] == -1) + // Mark this source as using this lane. + Srcs[SrcIdx] = Src; + else if (Srcs[SrcIdx] != Src) + // The other source is using this lane: not disjoint. 
+ return SDValue(); + } + + SmallVector SelectMaskVals; + for (int Lane : Srcs) { + if (Lane == -1) + SelectMaskVals.push_back(DAG.getUNDEF(XLenVT)); + else + SelectMaskVals.push_back(DAG.getConstant(Lane ? 0 : 1, DL, XLenVT)); + } + MVT MaskVT = VT.changeVectorElementType(MVT::i1); + SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, SelectMaskVals); + SDValue Select = DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, + SVN->getOperand(0), SVN->getOperand(1)); + + // Move all indices relative to the first source. + SmallVector NewMask(Mask.size()); + for (unsigned I = 0; I < Mask.size(); I++) { + if (Mask[I] == -1) + NewMask[I] = -1; + else + NewMask[I] = Mask[I] % Mask.size(); + } + + return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask); +} + static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue V1 = Op.getOperand(0); @@ -5540,6 +5601,17 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts)); } + // If the mask indices are disjoint between the two sources, we can lower it + // as a vselect + a single source vrgather.vv. Don't do this if we think the + // operands may end up being lowered to something cheaper than a vrgather.vv. + if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) && + !ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) && + !ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT) && + !ShuffleVectorInst::isIdentityMask(ShuffleMaskLHS, NumElts) && + !ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts)) + if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget)) + return V; + // Try to pick a profitable operand order. 
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1); SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts); @@ -18237,41 +18309,21 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( // LD/ST will optimize constant Offset extraction, so when AddNode is used by // LD/ST, it can still complete the folding optimization operation performed // above. - auto isUsedByLdSt = [&]() { - bool CanOptAlways = false; - if (N0->getOpcode() == ISD::ADD && !N0->hasOneUse()) { - for (SDNode *Use : N0->uses()) { - // This use is the one we're on right now. Skip it - if (Use == N || Use->getOpcode() == ISD::SELECT) - continue; - if (!isa(Use) && !isa(Use)) { - CanOptAlways = false; - break; - } - CanOptAlways = true; - } - } - - if (N0->getOpcode() == ISD::SIGN_EXTEND && - !N0->getOperand(0)->hasOneUse()) { - for (SDNode *Use : N0->getOperand(0)->uses()) { - // This use is the one we're on right now. Skip it - if (Use == N0.getNode() || Use->getOpcode() == ISD::SELECT) - continue; - if (!isa(Use) && !isa(Use)) { - CanOptAlways = false; - break; - } - CanOptAlways = true; - } + auto isUsedByLdSt = [](const SDNode *X, const SDNode *User) { + for (SDNode *Use : X->uses()) { + // This use is the one we're on right now. 
Skip it + if (Use == User || Use->getOpcode() == ISD::SELECT) + continue; + if (!isa(Use) && !isa(Use)) + return false; } - return CanOptAlways; + return true; }; if (Ty.isScalarInteger() && (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) { if (N0.getOpcode() == ISD::ADD && !N0->hasOneUse()) - return isUsedByLdSt(); + return isUsedByLdSt(N0.getNode(), N); auto *C1 = dyn_cast(N0->getOperand(1)); auto *C2 = dyn_cast(N->getOperand(1)); @@ -18314,7 +18366,7 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( if (N0->getOpcode() == ISD::SIGN_EXTEND && N0->getOperand(0)->getOpcode() == ISD::ADD && !N0->getOperand(0)->hasOneUse()) - return isUsedByLdSt(); + return isUsedByLdSt(N0->getOperand(0).getNode(), N0.getNode()); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 6a3a89371b57a..91f8a2f47e21c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2568,7 +2568,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, Ok = (Imm & (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC)) == Imm; break; case RISCVOp::OPERAND_SEW: - Ok = Imm == 0 || (Imm >= 3 && Imm <= 6); + Ok = Imm == 0 || (isUInt<5>(Imm) && RISCVVType::isValidSEW(1 << Imm)); break; case RISCVOp::OPERAND_VEC_RM: assert(RISCVII::hasRoundModeOp(Desc.TSFlags)); @@ -3188,33 +3188,38 @@ std::string RISCVInstrInfo::createMIROperandComment( if (!Op.isImm()) return std::string(); + const MCInstrDesc &Desc = MI.getDesc(); + if (OpIdx >= Desc.getNumOperands()) + return std::string(); + std::string Comment; raw_string_ostream OS(Comment); - uint64_t TSFlags = MI.getDesc().TSFlags; + const MCOperandInfo &OpInfo = Desc.operands()[OpIdx]; // Print the full VType operand of vsetvli/vsetivli instructions, and the SEW // operand of vector codegen pseudos. 
- if ((MI.getOpcode() == RISCV::VSETVLI || MI.getOpcode() == RISCV::VSETIVLI || - MI.getOpcode() == RISCV::PseudoVSETVLI || - MI.getOpcode() == RISCV::PseudoVSETIVLI || - MI.getOpcode() == RISCV::PseudoVSETVLIX0) && - OpIdx == 2) { - unsigned Imm = MI.getOperand(OpIdx).getImm(); + switch (OpInfo.OperandType) { + case RISCVOp::OPERAND_VTYPEI10: + case RISCVOp::OPERAND_VTYPEI11: { + unsigned Imm = Op.getImm(); RISCVVType::printVType(Imm, OS); - } else if (RISCVII::hasSEWOp(TSFlags) && - OpIdx == RISCVII::getSEWOpNum(MI.getDesc())) { - unsigned Log2SEW = MI.getOperand(OpIdx).getImm(); + break; + } + case RISCVOp::OPERAND_SEW: { + unsigned Log2SEW = Op.getImm(); unsigned SEW = Log2SEW ? 1 << Log2SEW : 8; assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW"); OS << "e" << SEW; - } else if (RISCVII::hasVecPolicyOp(TSFlags) && - OpIdx == RISCVII::getVecPolicyOpNum(MI.getDesc())) { - unsigned Policy = MI.getOperand(OpIdx).getImm(); + break; + } + case RISCVOp::OPERAND_VEC_POLICY: + unsigned Policy = Op.getImm(); assert(Policy <= (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC) && "Invalid Policy Value"); OS << (Policy & RISCVII::TAIL_AGNOSTIC ? "ta" : "tu") << ", " << (Policy & RISCVII::MASK_AGNOSTIC ? 
"ma" : "mu"); + break; } return Comment; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 29758014f73ed..3af49d7e74460 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -25,38 +25,86 @@ def uimm11 : RISCVUImmLeafOp<11>; //===----------------------------------------------------------------------===// let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { -class QCILoad_ScaleIdx func4, string opcodestr> +class QCILoad_ScaleIdx funct4, string opcodestr> : RVInstRBase<0b111, OPC_CUSTOM_0, (outs GPR:$rd), (ins GPRMem:$rs1, GPRNoX0:$rs2, uimm3:$shamt), opcodestr, "$rd, $rs1, $rs2, $shamt"> { bits<3> shamt; - let Inst{31-28} = func4; + let Inst{31-28} = funct4; let Inst{27-25} = shamt; } } let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { // rd corresponds to the source for the store 'rs3' described in the spec. -class QCIStore_ScaleIdx func4, string opcodestr> +class QCIStore_ScaleIdx funct4, string opcodestr> : RVInstRBase<0b110, OPC_CUSTOM_1, (outs), (ins GPR:$rd, GPRMem:$rs1, GPRNoX0:$rs2, uimm3:$shamt), opcodestr, "$rd, $rs1, $rs2, $shamt"> { bits<3> shamt; - let Inst{31-28} = func4; + let Inst{31-28} = funct4; let Inst{27-25} = shamt; } } -class QCIRVInstR func4, string opcodestr> - : RVInstR<{0b000, func4}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd), +class QCIRVInstR funct4, string opcodestr> + : RVInstR<{0b000, funct4}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd), (ins GPRNoX0:$rs1), opcodestr, "$rd, $rs1"> { let rs2 = 0; } -class QCIRVInstRR func5, DAGOperand InTyRs1, string opcodestr> - : RVInstR<{0b00, func5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd), +class QCIRVInstRR funct5, DAGOperand InTyRs1, string opcodestr> + : RVInstR<{0b00, funct5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd), (ins InTyRs1:$rs1, GPRNoX0:$rs2), opcodestr, "$rd, $rs1, $rs2">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCISELECTIICC funct3, string opcodestr> 
+ : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2), + opcodestr, "$rd, $rs1, $simm1, $simm2"> { + let Constraints = "$rd = $rd_wb"; + bits<5> simm1; + bits<5> simm2; + + let rs3 = simm2; + let rs2 = simm1; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCISELECTICC funct3, string opcodestr> + : RVInstR4<0b01, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2), + opcodestr, "$rd, $rs1, $rs2, $simm2"> { + let Constraints = "$rd = $rd_wb"; + bits<5> simm2; + + let rs3 = simm2; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCISELECTCCI funct3, string opcodestr> + : RVInstR4<0b10, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, GPRNoX0:$rs3), + opcodestr, "$rd, $imm, $rs2, $rs3"> { + let Constraints = "$rd = $rd_wb"; + bits<5> imm; + + let rs1 = imm; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCISELECTICCI funct3, string opcodestr> + : RVInstR4<0b11, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, simm5:$simm2), + opcodestr, "$rd, $imm, $rs2, $simm2"> { + let Constraints = "$rd = $rd_wb"; + bits<5> imm; + bits<5> simm2; + + let rs3 = simm2; + let rs1 = imm; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -108,3 +156,14 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { def QC_NORMEU : QCIRVInstR<0b1001, "qc.normeu">; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 } // Predicates = [HasVendorXqcia, IsRV32], DecoderNamespace = "Xqcia" + +let Predicates = [HasVendorXqcics, IsRV32], DecoderNamespace = "Xqcics" in { + def QC_SELECTIIEQ : QCISELECTIICC <0b010, "qc.selectiieq">; + def QC_SELECTIINE : QCISELECTIICC <0b011, "qc.selectiine">; + def 
QC_SELECTIEQ : QCISELECTICC <0b010, "qc.selectieq">; + def QC_SELECTINE : QCISELECTICC <0b011, "qc.selectine">; + def QC_SELECTEQI : QCISELECTCCI <0b010, "qc.selecteqi">; + def QC_SELECTNEI : QCISELECTCCI <0b011, "qc.selectnei">; + def QC_SELECTIEQI : QCISELECTICCI <0b010, "qc.selectieqi">; + def QC_SELECTINEI : QCISELECTICCI <0b011, "qc.selectinei">; +} // Predicates = [HasVendorXqcics, IsRV32], DecoderNamespace = "Xqcics" diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 489fc3c4918fd..49192bd638022 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2446,6 +2446,8 @@ bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const { switch (II->getIntrinsicID()) { case Intrinsic::fma: case Intrinsic::vp_fma: + case Intrinsic::fmuladd: + case Intrinsic::vp_fmuladd: return Operand == 0 || Operand == 1; case Intrinsic::vp_shl: case Intrinsic::vp_lshr: diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 0df4c451894be..1d5684d6038ea 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -159,6 +159,12 @@ getEMULEqualsEEWDivSEWTimesLMUL(unsigned Log2EEW, const MachineInstr &MI) { auto [MILMUL, MILMULIsFractional] = RISCVVType::decodeVLMUL(MIVLMUL); unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); + + // Mask instructions will have 0 as the SEW operand. But the LMUL of these + // instructions is calculated is as if the SEW operand was 3 (e8). 
+ if (MILog2SEW == 0) + MILog2SEW = 3; + unsigned MISEW = 1 << MILog2SEW; unsigned EEW = 1 << Log2EEW; @@ -492,6 +498,29 @@ static OperandInfo getOperandInfo(const MachineInstr &MI, return OperandInfo(EMUL, Log2EEW); } + // Vector Mask Instructions + // Vector Mask-Register Logical Instructions + // vmsbf.m set-before-first mask bit + // vmsif.m set-including-first mask bit + // vmsof.m set-only-first mask bit + // EEW=1 and EMUL=(EEW/SEW)*LMUL + // We handle the cases when operand is a v0 mask operand above the switch, + // but these instructions may use non-v0 mask operands and need to be handled + // specifically. + case RISCV::VMAND_MM: + case RISCV::VMNAND_MM: + case RISCV::VMANDN_MM: + case RISCV::VMXOR_MM: + case RISCV::VMOR_MM: + case RISCV::VMNOR_MM: + case RISCV::VMORN_MM: + case RISCV::VMXNOR_MM: + case RISCV::VMSBF_M: + case RISCV::VMSIF_M: + case RISCV::VMSOF_M: { + return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0); + } + default: return {}; } @@ -632,6 +661,23 @@ static bool isSupportedInstr(const MachineInstr &MI) { // Vector Crypto case RISCV::VWSLL_VI: + + // Vector Mask Instructions + // Vector Mask-Register Logical Instructions + // vmsbf.m set-before-first mask bit + // vmsif.m set-including-first mask bit + // vmsof.m set-only-first mask bit + case RISCV::VMAND_MM: + case RISCV::VMNAND_MM: + case RISCV::VMANDN_MM: + case RISCV::VMXOR_MM: + case RISCV::VMOR_MM: + case RISCV::VMNOR_MM: + case RISCV::VMORN_MM: + case RISCV::VMXNOR_MM: + case RISCV::VMSBF_M: + case RISCV::VMSIF_M: + case RISCV::VMSOF_M: return true; } diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index d54b81e0d3981..de3d5564210ff 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,7 +742,7 @@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, 
{"xqcicsr"}, {"xqcisls"}}; + {"xqcia"}, {"xqcics"}, {"xqcicsr"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index 0baa34d50abf3..20fc630a74a86 100644 --- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -32,10 +32,9 @@ namespace { bool AlwaysInlineImpl( Module &M, bool InsertLifetime, ProfileSummaryInfo &PSI, + FunctionAnalysisManager *FAM, function_ref GetAssumptionCache, - function_ref GetAAR, - function_ref GetBFI, - function_ref GetCachedBFI) { + function_ref GetAAR) { SmallSetVector Calls; bool Changed = false; SmallVector InlinedComdatFunctions; @@ -62,12 +61,7 @@ bool AlwaysInlineImpl( DebugLoc DLoc = CB->getDebugLoc(); BasicBlock *Block = CB->getParent(); - // Only update CallerBFI if already available. The CallerBFI update - // requires CalleeBFI. - BlockFrequencyInfo *CallerBFI = GetCachedBFI(*Caller); - InlineFunctionInfo IFI(GetAssumptionCache, &PSI, CallerBFI, - CallerBFI ? &GetBFI(F) : nullptr); - + InlineFunctionInfo IFI(GetAssumptionCache, &PSI, nullptr, nullptr); InlineResult Res = InlineFunction(*CB, IFI, /*MergeAttributes=*/true, &GetAAR(F), InsertLifetime); if (!Res.isSuccess()) { @@ -86,6 +80,8 @@ bool AlwaysInlineImpl( /*ForProfileContext=*/false, DEBUG_TYPE); Changed = true; + if (FAM) + FAM->invalidate(*Caller, PreservedAnalyses::none()); } F.removeDeadConstantUsers(); @@ -95,6 +91,8 @@ bool AlwaysInlineImpl( if (F.hasComdat()) { InlinedComdatFunctions.push_back(&F); } else { + if (FAM) + FAM->clear(F, F.getName()); M.getFunctionList().erase(F); Changed = true; } @@ -107,6 +105,8 @@ bool AlwaysInlineImpl( filterDeadComdatFunctions(InlinedComdatFunctions); // The remaining functions are actually dead. 
for (Function *F : InlinedComdatFunctions) { + if (FAM) + FAM->clear(*F, F->getName()); M.getFunctionList().erase(F); Changed = true; } @@ -136,12 +136,9 @@ struct AlwaysInlinerLegacyPass : public ModulePass { auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return getAnalysis().getAssumptionCache(F); }; - auto GetCachedBFI = [](Function &) -> BlockFrequencyInfo * { - return nullptr; - }; - return AlwaysInlineImpl(M, InsertLifetime, PSI, GetAssumptionCache, GetAAR, - /*GetBFI=*/nullptr, GetCachedBFI); + return AlwaysInlineImpl(M, InsertLifetime, PSI, /*FAM=*/nullptr, + GetAssumptionCache, GetAAR); } static char ID; // Pass identification, replacement for typeid @@ -175,19 +172,18 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return FAM.getResult(F); }; - auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & { - return FAM.getResult(F); - }; - auto GetCachedBFI = [&](Function &F) -> BlockFrequencyInfo * { - return FAM.getCachedResult(F); - }; auto GetAAR = [&](Function &F) -> AAResults & { return FAM.getResult(F); }; auto &PSI = MAM.getResult(M); - bool Changed = AlwaysInlineImpl(M, InsertLifetime, PSI, GetAssumptionCache, - GetAAR, GetBFI, GetCachedBFI); + bool Changed = AlwaysInlineImpl(M, InsertLifetime, PSI, &FAM, + GetAssumptionCache, GetAAR); + if (!Changed) + return PreservedAnalyses::all(); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + PreservedAnalyses PA; + // We have already invalidated all analyses on modified functions. 
+ PA.preserveSet>(); + return PA; } diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 0bc783412595e..e706a6f83b1e7 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -105,8 +105,8 @@ static cl::opt PrintRangeChecks("irce-print-range-checks", cl::Hidden, static cl::opt SkipProfitabilityChecks("irce-skip-profitability-checks", cl::Hidden, cl::init(false)); -static cl::opt MinRuntimeIterations("irce-min-runtime-iterations", - cl::Hidden, cl::init(10)); +static cl::opt MinEliminatedChecks("irce-min-eliminated-checks", + cl::Hidden, cl::init(10)); static cl::opt AllowUnsignedLatchCondition("irce-allow-unsigned-latch", cl::Hidden, cl::init(true)); @@ -130,15 +130,9 @@ static cl::opt namespace { -/// An inductive range check is conditional branch in a loop with -/// -/// 1. a very cold successor (i.e. the branch jumps to that successor very -/// rarely) -/// -/// and -/// -/// 2. a condition that is provably true for some contiguous range of values -/// taken by the containing loop's induction variable. +/// An inductive range check is conditional branch in a loop with a condition +/// that is provably true for some contiguous range of values taken by the +/// containing loop's induction variable. /// class InductiveRangeCheck { @@ -233,6 +227,7 @@ class InductiveRangeCheck { /// checks, and hence don't end up in \p Checks. static void extractRangeChecksFromBranch( BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI, + std::optional EstimatedTripCount, SmallVectorImpl &Checks, bool &Changed); }; @@ -246,9 +241,10 @@ class InductiveRangeCheckElimination { std::optional>; GetBFIFunc GetBFI; - // Returns true if it is profitable to do a transform basing on estimation of - // number of iterations. 
- bool isProfitableToTransform(const Loop &L); + // Returns the estimated number of iterations based on block frequency info if + // available, or on branch probability info. Nullopt is returned if the number + // of iterations cannot be estimated. + std::optional estimatedTripCount(const Loop &L); public: InductiveRangeCheckElimination(ScalarEvolution &SE, @@ -522,6 +518,7 @@ void InductiveRangeCheck::extractRangeChecksFromCond( void InductiveRangeCheck::extractRangeChecksFromBranch( BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI, + std::optional EstimatedTripCount, SmallVectorImpl &Checks, bool &Changed) { if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) return; @@ -529,11 +526,32 @@ void InductiveRangeCheck::extractRangeChecksFromBranch( unsigned IndexLoopSucc = L->contains(BI->getSuccessor(0)) ? 0 : 1; assert(L->contains(BI->getSuccessor(IndexLoopSucc)) && "No edges coming to loop?"); - BranchProbability LikelyTaken(15, 16); - if (!SkipProfitabilityChecks && BPI && - BPI->getEdgeProbability(BI->getParent(), IndexLoopSucc) < LikelyTaken) - return; + if (!SkipProfitabilityChecks && BPI) { + auto SuccessProbability = + BPI->getEdgeProbability(BI->getParent(), IndexLoopSucc); + if (EstimatedTripCount) { + auto EstimatedEliminatedChecks = + SuccessProbability.scale(*EstimatedTripCount); + if (EstimatedEliminatedChecks < MinEliminatedChecks) { + LLVM_DEBUG(dbgs() << "irce: could not prove profitability for branch " + << *BI << ": " + << "estimated eliminated checks too low " + << EstimatedEliminatedChecks << "\n";); + return; + } + } else { + BranchProbability LikelyTaken(15, 16); + if (SuccessProbability < LikelyTaken) { + LLVM_DEBUG(dbgs() << "irce: could not prove profitability for branch " + << *BI << ": " + << "could not estimate trip count " + << "and branch success probability too low " + << SuccessProbability << "\n";); + return; + } + } + } // IRCE expects branch's true edge comes to loop. 
Invert branch for opposite // case. @@ -938,42 +956,34 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) { return getLoopPassPreservedAnalyses(); } -bool InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L) { - if (SkipProfitabilityChecks) - return true; +std::optional +InductiveRangeCheckElimination::estimatedTripCount(const Loop &L) { if (GetBFI) { BlockFrequencyInfo &BFI = (*GetBFI)(); uint64_t hFreq = BFI.getBlockFreq(L.getHeader()).getFrequency(); uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency(); - if (phFreq != 0 && hFreq != 0 && (hFreq / phFreq < MinRuntimeIterations)) { - LLVM_DEBUG(dbgs() << "irce: could not prove profitability: " - << "the estimated number of iterations basing on " - "frequency info is " << (hFreq / phFreq) << "\n";); - return false; - } - return true; + if (phFreq == 0 || hFreq == 0) + return std::nullopt; + return {hFreq / phFreq}; } if (!BPI) - return true; + return std::nullopt; auto *Latch = L.getLoopLatch(); if (!Latch) - return true; + return std::nullopt; auto *LatchBr = dyn_cast(Latch->getTerminator()); if (!LatchBr) - return true; - auto LatchBrExitIdx = LatchBr->getSuccessor(0) == L.getHeader() ? 1 : 0; + return std::nullopt; + auto LatchBrExitIdx = LatchBr->getSuccessor(0) == L.getHeader() ? 
1 : 0; BranchProbability ExitProbability = BPI->getEdgeProbability(Latch, LatchBrExitIdx); - if (ExitProbability > BranchProbability(1, MinRuntimeIterations)) { - LLVM_DEBUG(dbgs() << "irce: could not prove profitability: " - << "the exit probability is too big " << ExitProbability - << "\n";); - return false; - } - return true; + if (ExitProbability.isUnknown() || ExitProbability.isZero()) + return std::nullopt; + + return {ExitProbability.scaleByInverse(1)}; } bool InductiveRangeCheckElimination::run( @@ -989,8 +999,14 @@ bool InductiveRangeCheckElimination::run( return false; } - if (!isProfitableToTransform(*L)) + auto EstimatedTripCount = estimatedTripCount(*L); + if (!SkipProfitabilityChecks && EstimatedTripCount && + *EstimatedTripCount < MinEliminatedChecks) { + LLVM_DEBUG(dbgs() << "irce: could not prove profitability: " + << "the estimated number of iterations is " + << *EstimatedTripCount << "\n"); return false; + } LLVMContext &Context = Preheader->getContext(); SmallVector RangeChecks; @@ -998,8 +1014,8 @@ bool InductiveRangeCheckElimination::run( for (auto *BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast(BBI->getTerminator())) - InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI, - RangeChecks, Changed); + InductiveRangeCheck::extractRangeChecksFromBranch( + TBI, L, SE, BPI, EstimatedTripCount, RangeChecks, Changed); if (RangeChecks.empty()) return Changed; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index d80af26451ac7..2cb3525231eac 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -43,6 +43,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -83,6 +84,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include 
"llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include #include #include @@ -246,6 +248,7 @@ class SROA { bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P); bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); + bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS); std::pair runOnAlloca(AllocaInst &AI); void clobberUse(Use &U); bool deleteDeadInstructions(SmallPtrSetImpl &DeletedAllocas); @@ -598,6 +601,7 @@ class AllocaSlices { /// If this is true, the slices are never fully built and should be /// ignored. bool isEscaped() const { return PointerEscapingInstr; } + bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; } /// Support for iterating over the slices. /// @{ @@ -680,6 +684,7 @@ class AllocaSlices { /// store a pointer to that here and abort trying to form slices of the /// alloca. This will be null if the alloca slices are analyzed successfully. Instruction *PointerEscapingInstr; + Instruction *PointerEscapingInstrReadOnly; /// The slices of the alloca. /// @@ -1390,6 +1395,18 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { /// Disable SROA entirely if there are unhandled users of the alloca. void visitInstruction(Instruction &I) { PI.setAborted(&I); } + + void visitCallBase(CallBase &CB) { + // If the call operand is NoCapture ReadOnly, then we mark it as + // EscapedReadOnly. 
+ if (CB.doesNotCapture(U->getOperandNo()) && + CB.onlyReadsMemory(U->getOperandNo())) { + PI.setEscapedReadOnly(&CB); + return; + } + + Base::visitCallBase(CB); + } }; AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) @@ -1397,7 +1414,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif - PointerEscapingInstr(nullptr) { + PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) { SliceBuilder PB(DL, AI, *this); SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { @@ -1408,6 +1425,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) assert(PointerEscapingInstr && "Did not track a bad instruction"); return; } + PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst(); llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); }); @@ -1445,6 +1463,9 @@ void AllocaSlices::print(raw_ostream &OS) const { return; } + if (PointerEscapingInstrReadOnly) + OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n"; + OS << "Slices of alloca: " << AI << "\n"; for (const_iterator I = begin(), E = end(); I != E; ++I) print(OS, I); @@ -5454,6 +5475,86 @@ void SROA::clobberUse(Use &U) { } } +/// A basic LoadAndStorePromoter that does not remove store nodes. +class BasicLoadAndStorePromoter : public LoadAndStorePromoter { +public: + BasicLoadAndStorePromoter(ArrayRef Insts, SSAUpdater &S, + Type *ZeroType) + : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {} + bool shouldDelete(Instruction *I) const override { + return !isa(I) && !isa(I); + } + + Value *getValueToUseForAlloca(Instruction *I) const override { + return UndefValue::get(ZeroType); + } + +private: + Type *ZeroType; +}; + +bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) { + // Look through each "partition", looking for slices with the same start/end + // that do not overlap with any before them. 
The slices are sorted by + // increasing beginOffset. We don't use AS.partitions(), as it will use a more + // sophisticated algorithm that takes splittable slices into account. + auto PartitionBegin = AS.begin(); + auto PartitionEnd = PartitionBegin; + uint64_t BeginOffset = PartitionBegin->beginOffset(); + uint64_t EndOffset = PartitionBegin->endOffset(); + while (PartitionBegin != AS.end()) { + bool AllSameAndValid = true; + SmallVector Insts; + Type *PartitionType = nullptr; + while (PartitionEnd != AS.end() && + (PartitionEnd->beginOffset() < EndOffset || + PartitionEnd->endOffset() <= EndOffset)) { + if (AllSameAndValid) { + AllSameAndValid &= PartitionEnd->beginOffset() == BeginOffset && + PartitionEnd->endOffset() == EndOffset; + Instruction *User = + cast(PartitionEnd->getUse()->getUser()); + if (auto *LI = dyn_cast(User)) { + Type *UserTy = LI->getType(); + // LoadAndStorePromoter requires all the types to be the same. + if (!LI->isSimple() || (PartitionType && UserTy != PartitionType)) + AllSameAndValid = false; + PartitionType = UserTy; + Insts.push_back(User); + } else if (auto *SI = dyn_cast(User)) { + Type *UserTy = SI->getValueOperand()->getType(); + if (!SI->isSimple() || (PartitionType && UserTy != PartitionType)) + AllSameAndValid = false; + PartitionType = UserTy; + Insts.push_back(User); + } else if (!isAssumeLikeIntrinsic(User)) { + AllSameAndValid = false; + } + } + EndOffset = std::max(EndOffset, PartitionEnd->endOffset()); + ++PartitionEnd; + } + + // So long as all the slices start and end offsets matched, update loads to + // the values stored in the partition. + if (AllSameAndValid && !Insts.empty()) { + LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", " + << EndOffset << ")\n"); + SmallVector NewPHIs; + SSAUpdater SSA(&NewPHIs); + Insts.push_back(&AI); + BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType); + Promoter.run(Insts); + } + + // Step on to the next partition. 
+ PartitionBegin = PartitionEnd; + BeginOffset = PartitionBegin->beginOffset(); + EndOffset = PartitionBegin->endOffset(); + } + return true; +} + /// Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds @@ -5494,6 +5595,11 @@ SROA::runOnAlloca(AllocaInst &AI) { if (AS.isEscaped()) return {Changed, CFGChanged}; + if (AS.isEscapedReadOnly()) { + Changed |= propagateStoredValuesToLoads(AI, AS); + return {Changed, CFGChanged}; + } + // Delete all the dead users of this alloca before splitting and rewriting it. for (Instruction *DeadUser : AS.getDeadUsers()) { // Free up everything used by this instruction. diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index cf1a8b4af1126..2af447aadce22 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -253,40 +253,17 @@ Evaluator::getCalleeWithFormalArgs(CallBase &CB, bool Evaluator::getFormalParams(CallBase &CB, Function *F, SmallVectorImpl &Formals) { - if (!F) - return false; - auto *FTy = F->getFunctionType(); - if (FTy->getNumParams() > CB.arg_size()) { - LLVM_DEBUG(dbgs() << "Too few arguments for function.\n"); + if (FTy != CB.getFunctionType()) { + LLVM_DEBUG(dbgs() << "Signature mismatch.\n"); return false; } - auto ArgI = CB.arg_begin(); - for (Type *PTy : FTy->params()) { - auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), PTy, DL); - if (!ArgC) { - LLVM_DEBUG(dbgs() << "Can not convert function argument.\n"); - return false; - } - Formals.push_back(ArgC); - ++ArgI; - } + for (Value *Arg : CB.args()) + Formals.push_back(getVal(Arg)); return true; } -/// If call expression contains bitcast then we may need to cast -/// evaluated return value to a type of the call expression. 
-Constant *Evaluator::castCallResultIfNeeded(Type *ReturnType, Constant *RV) { - if (!RV || RV->getType() == ReturnType) - return RV; - - RV = ConstantFoldLoadThroughBitcast(RV, ReturnType, DL); - if (!RV) - LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n"); - return RV; -} - /// Evaluate all instructions in block BB, returning true if successful, false /// if we can't evaluate it. NewBB returns the next BB that control flows into, /// or null upon return. StrippedPointerCastsForAliasAnalysis is set to true if @@ -520,9 +497,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, if (Callee->isDeclaration()) { // If this is a function we can constant fold, do it. if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) { - InstResult = castCallResultIfNeeded(CB.getType(), C); - if (!InstResult) - return false; + InstResult = C; LLVM_DEBUG(dbgs() << "Constant folded function call. Result: " << *InstResult << "\n"); } else { @@ -544,10 +519,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, return false; } ValueStack.pop_back(); - InstResult = castCallResultIfNeeded(CB.getType(), RetVal); - if (RetVal && !InstResult) - return false; - + InstResult = RetVal; if (InstResult) { LLVM_DEBUG(dbgs() << "Successfully evaluated function. 
Result: " << *InstResult << "\n\n"); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 70047273c3b9a..45915c10107b2 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1208,6 +1208,23 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); } +Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src, + const RecurrenceDescriptor &Desc) { + assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind( + Desc.getRecurrenceKind()) && + "Unexpected reduction kind"); + Value *StartVal = Desc.getRecurrenceStartValue(); + Value *Sentinel = Desc.getSentinelValue(); + Value *MaxRdx = Src->getType()->isVectorTy() + ? Builder.CreateIntMaxReduce(Src, true) + : Src; + // Correct the final reduction result back to the start value if the maximum + // reduction is sentinel value. + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, MaxRdx, Sentinel, "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, MaxRdx, StartVal, "rdx.select"); +} + Value *llvm::getReductionIdentity(Intrinsic::ID RdxID, Type *Ty, FastMathFlags Flags) { bool Negative = false; @@ -1315,6 +1332,8 @@ Value *llvm::createReduction(IRBuilderBase &B, RecurKind RK = Desc.getRecurrenceKind(); if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) return createAnyOfReduction(B, Src, Desc, OrigPhi); + if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) + return createFindLastIVReduction(B, Src, Desc); return createSimpleReduction(B, Src, RK); } diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 597d470f18ff3..4bf4acd6330f5 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -412,9 +412,13 @@ void LoadAndStorePromoter::run(const SmallVectorImpl &Insts) { if (StoreInst *SI = dyn_cast(User)) { updateDebugInfo(SI); 
SSA.AddAvailableValue(BB, SI->getOperand(0)); - } else + } else if (auto *AI = dyn_cast(User)) { + // We treat AllocaInst as a store of an getValueToUseForAlloca value. + SSA.AddAvailableValue(BB, getValueToUseForAlloca(AI)); + } else { // Otherwise it is a load, queue it to rewrite as a live-in load. LiveInLoads.push_back(cast(User)); + } BlockUses.clear(); continue; } @@ -422,7 +426,7 @@ void LoadAndStorePromoter::run(const SmallVectorImpl &Insts) { // Otherwise, check to see if this block is all loads. bool HasStore = false; for (Instruction *I : BlockUses) { - if (isa(I)) { + if (isa(I) || isa(I)) { HasStore = true; break; } @@ -468,6 +472,12 @@ void LoadAndStorePromoter::run(const SmallVectorImpl &Insts) { // Remember that this is the active value in the block. StoredValue = SI->getOperand(0); + } else if (auto *AI = dyn_cast(&I)) { + // Check if this an alloca, in which case we treat it as a store of + // getValueToUseForAlloca. + if (!isInstInList(AI, Insts)) + continue; + StoredValue = getValueToUseForAlloca(AI); } } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 0438ccf36aeaf..7c45822572d4e 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -397,9 +397,8 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. 
- B.CreateMemCpy( - CpyDst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); + B.CreateMemCpy(CpyDst, Align(1), Src, Align(1), + TLI->getAsSizeT(Len + 1, *B.GetInsertBlock()->getModule())); return Dst; } @@ -590,26 +589,21 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { if (Len1 && Len2) { return copyFlags( *CI, emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - std::min(Len1, Len2)), + TLI->getAsSizeT(std::min(Len1, Len2), *CI->getModule()), B, DL, TLI)); } // strcmp to memcmp if (!HasStr1 && HasStr2) { if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), - B, DL, TLI)); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len2, *CI->getModule()), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), - B, DL, TLI)); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len1, *CI->getModule()), + B, DL, TLI)); } annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); @@ -676,19 +670,15 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { if (!HasStr1 && HasStr2) { Len2 = std::min(Len2, Length); if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), - B, DL, TLI)); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len2, *CI->getModule()), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { Len1 = std::min(Len1, Length); if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), - B, DL, TLI)); + return copyFlags(*CI, 
emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len1, *CI->getModule()), + B, DL, TLI)); } return nullptr; @@ -722,15 +712,13 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. - CallInst *NewCI = - B.CreateMemCpy(Dst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), + TLI->getAsSizeT(Len, *CI->getModule())); mergeAttributesAndFlags(NewCI, *CI); return Dst; } Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { - Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); // stpcpy(d,s) -> strcpy(d,s) if the result is not used. @@ -749,10 +737,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { else return nullptr; - Type *PT = Callee->getFunctionType()->getParamType(0); - Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len); + Value *LenV = TLI->getAsSizeT(Len, *CI->getModule()); Value *DstEnd = B.CreateInBoundsGEP( - B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); + B.getInt8Ty(), Dst, TLI->getAsSizeT(Len - 1, *CI->getModule())); // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. @@ -819,13 +806,11 @@ Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) { return ConstantInt::get(CI->getType(), 0); } - Function *Callee = CI->getCalledFunction(); - Type *PT = Callee->getFunctionType()->getParamType(0); // Transform strlcpy(D, S, N) to memcpy(D, S, N') where N' is the lower // bound on strlen(S) + 1 and N, optionally followed by a nul store to // D[N' - 1] if necessary. 
CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(PT), NBytes)); + TLI->getAsSizeT(NBytes, *CI->getModule())); mergeAttributesAndFlags(NewCI, *CI); if (!NulTerm) { @@ -844,7 +829,6 @@ Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) { // otherwise. Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd, IRBuilderBase &B) { - Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -922,11 +906,10 @@ Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd, /*M=*/nullptr, /*AddNull=*/false); } - Type *PT = Callee->getFunctionType()->getParamType(0); // st{p,r}ncpy(D, S, N) -> memcpy(align 1 D, align 1 S, N) when both // S and N are constant. CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(PT), N)); + TLI->getAsSizeT(N, *CI->getModule())); mergeAttributesAndFlags(NewCI, *CI); if (!RetEnd) return Dst; @@ -3438,10 +3421,9 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, return nullptr; // we found a format specifier, bail out. // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1) - B.CreateMemCpy( - Dest, Align(1), CI->getArgOperand(1), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + B.CreateMemCpy(Dest, Align(1), CI->getArgOperand(1), Align(1), + // Copy the null byte. 
+ TLI->getAsSizeT(FormatStr.size() + 1, *CI->getModule())); return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -3476,9 +3458,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, uint64_t SrcLen = GetStringLength(CI->getArgOperand(2)); if (SrcLen) { - B.CreateMemCpy( - Dest, Align(1), CI->getArgOperand(2), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), SrcLen)); + B.CreateMemCpy(Dest, Align(1), CI->getArgOperand(2), Align(1), + TLI->getAsSizeT(SrcLen, *CI->getModule())); // Returns total number of characters written without null-character. return ConstantInt::get(CI->getType(), SrcLen - 1); } else if (Value *V = emitStpCpy(Dest, CI->getArgOperand(2), B, TLI)) { @@ -3576,11 +3557,8 @@ Value *LibCallSimplifier::emitSnPrintfMemCpy(CallInst *CI, Value *StrArg, Value *DstArg = CI->getArgOperand(0); if (NCopy && StrArg) // Transform the call to lvm.memcpy(dst, fmt, N). - copyFlags( - *CI, - B.CreateMemCpy( - DstArg, Align(1), StrArg, Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), NCopy))); + copyFlags(*CI, B.CreateMemCpy(DstArg, Align(1), StrArg, Align(1), + TLI->getAsSizeT(NCopy, *CI->getModule()))); if (N > Str.size()) // Return early when the whole format string, including the final nul, @@ -3696,11 +3674,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, if (FormatStr.contains('%')) return nullptr; // We found a format specifier. 
- unsigned SizeTBits = TLI->getSizeTSize(*CI->getModule()); - Type *SizeTTy = IntegerType::get(CI->getContext(), SizeTBits); return copyFlags( *CI, emitFWrite(CI->getArgOperand(1), - ConstantInt::get(SizeTTy, FormatStr.size()), + TLI->getAsSizeT(FormatStr.size(), *CI->getModule()), CI->getArgOperand(0), B, DL, TLI)); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ed00c844285c6..122dc1db0b59d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2466,6 +2466,25 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { return VectorTripCount; } +/// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the +/// vector preheader and its predecessor, also connecting the new block to the +/// scalar preheader. +static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) { + VPBlockBase *ScalarPH = Plan.getScalarPreheader(); + VPBlockBase *VectorPH = Plan.getVectorPreheader(); + VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor(); + if (PreVectorPH->getNumSuccessors() != 1) { + assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); + assert(PreVectorPH->getSuccessors()[0] == ScalarPH && + "Unexpected successor"); + VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB); + VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB); + PreVectorPH = CheckVPIRBB; + } + VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); + PreVectorPH->swapSuccessors(); +} + void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. @@ -2540,14 +2559,15 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); - // Update dominator for Bypass & LoopExit (if needed). 
- DT->changeImmediateDominator(Bypass, TCCheckBlock); BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); LoopBypassBlocks.push_back(TCCheckBlock); + + // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. + introduceCheckBlockInVPlan(Plan, TCCheckBlock); } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { @@ -2564,6 +2584,8 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { "Should already be a bypass block due to iteration count check"); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + + introduceCheckBlockInVPlan(Plan, SCEVCheckBlock); return SCEVCheckBlock; } @@ -2600,6 +2622,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { AddedSafetyChecks = true; + introduceCheckBlockInVPlan(Plan, MemCheckBlock); return MemCheckBlock; } @@ -2747,7 +2770,7 @@ void InnerLoopVectorizer::createInductionResumeVPValues( // no suitable resume phi was already created. 
ScalarPHBuilder.createNaryOp( VPInstruction::ResumePhi, - {Plan.getOrAddLiveIn(VectorTripCount), + {&Plan.getVectorTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(VectorTripCount->getType(), 0))}, {}, "vec.epilog.resume.val"); } @@ -5185,8 +5208,9 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, HasReductions && any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind()); + RecurKind RK = RdxDesc.getRecurrenceKind(); + return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK); }); if (HasSelectCmpReductions) { LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); @@ -7750,6 +7774,10 @@ DenseMap LoopVectorizationPlanner::executePlan( VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, Legal->getWidestInductionType()); +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); +#endif + // 0. Generate SCEV-dependent code into the preheader, including TripCount, // before making any changes to the CFG. if (!BestVPlan.getPreheader()->empty()) { @@ -7770,10 +7798,6 @@ DenseMap LoopVectorizationPlanner::executePlan( if (VectorizingEpilogue) VPlanTransforms::removeDeadRecipes(BestVPlan); -#ifdef EXPENSIVE_CHECKS - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -#endif - // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. const LoopAccessInfo *LAI = ILV.Legal->getLAI(); @@ -7979,8 +8003,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); - // Update dominator for Bypass. 
- DT->changeImmediateDominator(Bypass, TCCheckBlock); LoopBypassBlocks.push_back(TCCheckBlock); // Save the trip count so we don't have to regenerate it in the @@ -7995,6 +8017,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); + introduceCheckBlockInVPlan(Plan, TCCheckBlock); return TCCheckBlock; } @@ -8026,9 +8049,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopVectorPreHeader); - DT->changeImmediateDominator(LoopVectorPreHeader, - EPI.MainLoopIterationCountCheck); - EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopScalarPreHeader); @@ -8039,19 +8059,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopScalarPreHeader); - DT->changeImmediateDominator( - VecEpilogueIterationCountCheck, - VecEpilogueIterationCountCheck->getSinglePredecessor()); - DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) - // If there is an epilogue which must run, there's no edge from the - // middle block to exit blocks and thus no need to update the immediate - // dominator of the exit blocks. - DT->changeImmediateDominator(OrigLoop->getUniqueLatchExitBlock(), - EPI.EpilogueIterationCountCheck); - // Keep track of bypass blocks, as they feed start values to the induction and // reduction phis in the scalar loop preheader. 
if (EPI.SCEVSafetyCheck) @@ -8142,6 +8151,16 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( } ReplaceInstWithInst(Insert->getTerminator(), &BI); LoopBypassBlocks.push_back(Insert); + + // A new entry block has been created for the epilogue VPlan. Hook it in, as + // otherwise we would try to modify the entry to the main vector loop. + VPIRBasicBlock *NewEntry = VPIRBasicBlock::fromBasicBlock(Insert); + VPBasicBlock *OldEntry = Plan.getEntry(); + VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); + Plan.setEntry(NewEntry); + delete OldEntry; + + introduceCheckBlockInVPlan(Plan, Insert); return Insert; } @@ -9449,8 +9468,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind Kind = RdxDesc.getRecurrenceKind(); - assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - "AnyOf reductions are not allowed for in-loop reductions"); + assert( + !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) && + "AnyOf and FindLast reductions are not allowed for in-loop reductions"); // Collect the chain of "link" recipes for the reduction starting at PhiR. 
SetVector Worklist; @@ -10492,8 +10513,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { EpilogILV.setTripCount(MainILV.getTripCount()); preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); - assert(DT->verify(DominatorTree::VerificationLevel::Fast) && - "DT not preserved correctly"); LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT, true, &ExpandedSCEVs); ++LoopsEpilogueVectorized; @@ -10521,6 +10540,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { checkMixedPrecision(L, ORE); } + assert(DT->verify(DominatorTree::VerificationLevel::Fast) && + "DT not preserved correctly"); + std::optional RemainderLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a1d7515f031cf..0e11e8704db2f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -344,6 +344,8 @@ static unsigned getShufflevectorNumGroups(ArrayRef VL) { unsigned SVNumElements = cast(SV->getOperand(0)->getType())->getNumElements(); unsigned ShuffleMaskSize = SV->getShuffleMask().size(); + if (SVNumElements % ShuffleMaskSize != 0) + return 0; unsigned GroupSize = SVNumElements / ShuffleMaskSize; if (GroupSize == 0 || (VL.size() % GroupSize) != 0) return 0; @@ -10218,9 +10220,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // sub-Mask into the CommonMask to estimate it later and avoid double cost // estimation. 
if ((InVectors.size() == 2 && - InVectors.front().get() == &E1 && - InVectors.back().get() == E2) || - (!E2 && InVectors.front().get() == &E1)) { + cast(InVectors.front()) == &E1 && + cast(InVectors.back()) == E2) || + (!E2 && cast(InVectors.front()) == &E1)) { unsigned Limit = getNumElems(Mask.size(), SliceSize, Part); assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit), [](int Idx) { return Idx == PoisonMaskElem; }) && @@ -10246,7 +10248,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { VF = std::max(VF, cast(V1->getType())->getNumElements()); } else { - const auto *E = InVectors.front().get(); + const auto *E = cast(InVectors.front()); VF = std::max(VF, E->getVectorFactor()); } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) @@ -10262,7 +10264,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { VF = std::max(VF, getNumElements(V1->getType())); } else { - const auto *E = P.get(); + const auto *E = cast(P); VF = std::max(VF, E->getVectorFactor()); } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) @@ -10368,9 +10370,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }; if (!V1 && !V2 && !P2.isNull()) { // Shuffle 2 entry nodes. - const TreeEntry *E = P1.get(); + const TreeEntry *E = cast(P1); unsigned VF = E->getVectorFactor(); - const TreeEntry *E2 = P2.get(); + const TreeEntry *E2 = cast(P2); CommonVF = std::max(VF, E2->getVectorFactor()); assert(all_of(Mask, [=](int Idx) { @@ -10402,7 +10404,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); } else if (!V1 && P2.isNull()) { // Shuffle single entry node. - const TreeEntry *E = P1.get(); + const TreeEntry *E = cast(P1); unsigned VF = E->getVectorFactor(); CommonVF = VF; assert( @@ -10451,7 +10453,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } else if (V1 && !V2) { // Shuffle vector and tree node. 
unsigned VF = getVF(V1); - const TreeEntry *E2 = P2.get(); + const TreeEntry *E2 = cast(P2); CommonVF = std::max(VF, E2->getVectorFactor()); assert(all_of(Mask, [=](int Idx) { @@ -10477,7 +10479,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } else if (!V1 && V2) { // Shuffle vector and tree node. unsigned VF = getVF(V2); - const TreeEntry *E1 = P1.get(); + const TreeEntry *E1 = cast(P1); CommonVF = std::max(VF, E1->getVectorFactor()); assert(all_of(Mask, [=](int Idx) { @@ -10715,8 +10717,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (P.value() == PoisonMaskElem) return Mask[P.index()] == PoisonMaskElem; auto *EI = cast( - InVectors.front().get()->getOrdered( - P.index())); + cast(InVectors.front()) + ->getOrdered(P.index())); return EI->getVectorOperand() == V1 || EI->getVectorOperand() == V2; }) && @@ -10734,7 +10736,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (ForExtracts) { // No need to add vectors here, already handled them in adjustExtracts. 
assert( - InVectors.size() == 1 && InVectors.front().is() && + InVectors.size() == 1 && isa(InVectors.front()) && !CommonMask.empty() && all_of(enumerate(CommonMask), [&](auto P) { @@ -10764,7 +10766,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { VF = std::max(VF, InTE->getVectorFactor()); } else { VF = std::max( - VF, cast(InVectors.front().get()->getType()) + VF, cast(cast(InVectors.front())->getType()) ->getNumElements()); } InVectors.push_back(V1); @@ -10834,7 +10836,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { CommonMask[Idx] = Idx; assert(VF > 0 && "Expected vector length for the final value before action."); - Value *V = Vec.get(); + Value *V = cast(Vec); Action(V, CommonMask); InVectors.front() = V; } @@ -20451,6 +20453,8 @@ class HorizontalReduction { case RecurKind::FMulAdd: case RecurKind::IAnyOf: case RecurKind::FAnyOf: + case RecurKind::IFindLastIV: + case RecurKind::FFindLastIV: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for repeated scalar."); } @@ -20548,6 +20552,8 @@ class HorizontalReduction { case RecurKind::FMulAdd: case RecurKind::IAnyOf: case RecurKind::FAnyOf: + case RecurKind::IFindLastIV: + case RecurKind::FFindLastIV: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for reused scalars."); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 81c76bc99fbf7..d3476399dabf1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -170,9 +170,7 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() { } void VPBlockBase::setPlan(VPlan *ParentPlan) { - assert( - (ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) && - "Can only set plan on its entry or preheader block."); + assert(ParentPlan->getEntry() == this && "Can only set plan on its entry."); Plan = ParentPlan; } @@ -310,9 +308,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) 
{ if (!hasScalarValue(Def, LastLane)) { // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and // VPExpandSCEVRecipes can also be uniform. - assert((isa(Def->getDefiningRecipe()) || - isa(Def->getDefiningRecipe()) || - isa(Def->getDefiningRecipe())) && + assert((isa(Def->getDefiningRecipe())) && "unexpected recipe found to be invariant"); IsUniform = true; LastLane = 0; @@ -361,7 +358,7 @@ void VPTransformState::addNewMetadata(Instruction *To, const Instruction *Orig) { // If the loop was versioned with memchecks, add the corresponding no-alias // metadata. - if (LVer && (isa(Orig) || isa(Orig))) + if (LVer && isa(Orig)) LVer->annotateInstWithNoAlias(To, Orig); } @@ -824,6 +821,18 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif +VPlan::VPlan(VPBasicBlock *OriginalPreheader, VPValue *TC, + VPBasicBlock *EntryVectorPreHeader, VPIRBasicBlock *ScalarHeader) + : VPlan(OriginalPreheader, TC, ScalarHeader) { + VPBlockUtils::connectBlocks(OriginalPreheader, EntryVectorPreHeader); +} + +VPlan::VPlan(VPBasicBlock *OriginalPreheader, + VPBasicBlock *EntryVectorPreHeader, VPIRBasicBlock *ScalarHeader) + : VPlan(OriginalPreheader, ScalarHeader) { + VPBlockUtils::connectBlocks(OriginalPreheader, EntryVectorPreHeader); +} + VPlan::~VPlan() { if (Entry) { VPValue DummyValue; @@ -831,9 +840,6 @@ VPlan::~VPlan() { Block->dropAllReferences(&DummyValue); VPBlockBase::deleteCFG(Entry); - - Preheader->dropAllReferences(&DummyValue); - delete Preheader; } for (VPValue *VPV : VPLiveInsToFree) delete VPV; @@ -856,9 +862,16 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, VPIRBasicBlock *Entry = VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader()); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); + // Connect entry only to vector preheader initially. Entry will also be + // connected to the scalar preheader later, during skeleton creation when + // runtime guards are added as needed. 
Note that when executing the VPlan for + // an epilogue vector loop, the original entry block here will be replaced by + // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after + // generating code for the main vector loop. + VPBlockUtils::connectBlocks(Entry, VecPreheader); VPIRBasicBlock *ScalarHeader = VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader()); - auto Plan = std::make_unique(Entry, VecPreheader, ScalarHeader); + auto Plan = std::make_unique(Entry, ScalarHeader); // Create SCEV and VPValue for the trip count. // We use the symbolic max backedge-taken-count, which works also when @@ -982,15 +995,21 @@ void VPlan::execute(VPTransformState *State) { State->CFG.DTU.applyUpdates( {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}}); - // Replace regular VPBB's for the middle and scalar preheader blocks with - // VPIRBasicBlocks wrapping their IR blocks. The IR blocks are created during - // skeleton creation, so we can only create the VPIRBasicBlocks now during - // VPlan execution rather than earlier during VPlan construction. + // Replace regular VPBB's for the vector preheader, middle and scalar + // preheader blocks with VPIRBasicBlocks wrapping their IR blocks. The IR + // blocks are created during skeleton creation, so we can only create the + // VPIRBasicBlocks now during VPlan execution rather than earlier during VPlan + // construction. 
BasicBlock *MiddleBB = State->CFG.ExitBB; - VPBasicBlock *MiddleVPBB = getMiddleBlock(); BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); + replaceVPBBWithIRVPBB(getVectorPreheader(), VectorPreHeader); + replaceVPBBWithIRVPBB(getMiddleBlock(), MiddleBB); replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh); - replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); + + LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF + << ", UF=" << getUF() << '\n'); + setName("Final VPlan"); + LLVM_DEBUG(dump()); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF << ", UF=" << getUF() << '\n'); @@ -1028,8 +1047,7 @@ void VPlan::execute(VPTransformState *State) { if (isa(&R)) continue; - if (isa(&R) || - isa(&R)) { + if (isa(&R)) { PHINode *Phi = nullptr; if (isa(&R)) { Phi = cast(State->get(R.getVPSingleValue())); @@ -1064,9 +1082,6 @@ void VPlan::execute(VPTransformState *State) { } State->CFG.DTU.flush(); - assert(State->CFG.DTU.getDomTree().verify( - DominatorTree::VerificationLevel::Fast) && - "DT not preserved correctly"); } InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { @@ -1075,6 +1090,21 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { return getVectorLoopRegion()->cost(VF, Ctx); } +VPRegionBlock *VPlan::getVectorLoopRegion() { + // TODO: Cache if possible. 
+ for (VPBlockBase *B : vp_depth_first_shallow(getEntry())) + if (auto *R = dyn_cast(B)) + return R; + return nullptr; +} + +const VPRegionBlock *VPlan::getVectorLoopRegion() const { + for (const VPBlockBase *B : vp_depth_first_shallow(getEntry())) + if (auto *R = dyn_cast(B)) + return R; + return nullptr; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPlan::printLiveIns(raw_ostream &O) const { VPSlotTracker SlotTracker(this); @@ -1119,11 +1149,6 @@ void VPlan::print(raw_ostream &O) const { printLiveIns(O); - if (!getPreheader()->empty()) { - O << "\n"; - getPreheader()->print(O, "", SlotTracker); - } - ReversePostOrderTraversal> RPOT(getEntry()); for (const VPBlockBase *Block : RPOT) { @@ -1207,7 +1232,6 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry, VPlan *VPlan::duplicate() { // Clone blocks. - VPBasicBlock *NewPreheader = Preheader->clone(); const auto &[NewEntry, __] = cloneFrom(Entry); BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock(); @@ -1217,8 +1241,7 @@ VPlan *VPlan::duplicate() { return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB; })); // Create VPlan, clone live-ins and remap operands in the cloned blocks. - auto *NewPlan = - new VPlan(NewPreheader, cast(NewEntry), NewScalarHeader); + auto *NewPlan = new VPlan(cast(NewEntry), NewScalarHeader); DenseMap Old2NewVPValues; for (VPValue *OldLiveIn : VPLiveInsToFree) { Old2NewVPValues[OldLiveIn] = @@ -1238,7 +1261,6 @@ VPlan *VPlan::duplicate() { // else NewTripCount will be created and inserted into Old2NewVPValues when // TripCount is cloned. In any case NewPlan->TripCount is updated below. - remapOperands(Preheader, NewPreheader, Old2NewVPValues); remapOperands(Entry, NewEntry, Old2NewVPValues); // Initialize remaining fields of cloned VPlan. 
@@ -1290,8 +1312,6 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - dumpBlock(Plan.getPreheader()); - for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry())) dumpBlock(Block); @@ -1552,7 +1572,6 @@ void VPSlotTracker::assignNames(const VPlan &Plan) { assignName(Plan.BackedgeTakenCount); for (VPValue *LI : Plan.VPLiveInsToFree) assignName(LI); - assignNames(Plan.getPreheader()); ReversePostOrderTraversal> RPOT(VPBlockDeepTraversalWrapper(Plan.getEntry())); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 7440a3a386fd2..ae68e1fc63a13 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3768,14 +3768,12 @@ class VPlan { friend class VPlanPrinter; friend class VPSlotTracker; - /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the - /// preheader of the vector loop. - VPBasicBlock *Entry; - /// VPBasicBlock corresponding to the original preheader. Used to place /// VPExpandSCEV recipes for expressions used during skeleton creation and the /// rest of VPlan execution. - VPBasicBlock *Preheader; + /// When this VPlan is used for the epilogue vector loop, the entry will be + /// replaced by a new entry block created during skeleton creation. + VPBasicBlock *Entry; /// VPIRBasicBlock wrapping the header of the original scalar loop. VPIRBasicBlock *ScalarHeader; @@ -3821,45 +3819,47 @@ class VPlan { DenseMap SCEVToExpansion; public: - /// Construct a VPlan with original preheader \p Preheader, trip count \p TC, - /// \p Entry to the plan and with \p ScalarHeader wrapping the original header - /// of the scalar loop. At the moment, \p Preheader and \p Entry need to be - /// disconnected, as the bypass blocks between them are not yet modeled in - /// VPlan. 
- VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry, - VPIRBasicBlock *ScalarHeader) - : VPlan(Preheader, Entry, ScalarHeader) { + /// Construct a VPlan with \p Entry entering the plan, trip count \p TC and + /// with \p ScalarHeader wrapping the original header of the scalar loop. + VPlan(VPBasicBlock *Entry, VPValue *TC, VPIRBasicBlock *ScalarHeader) + : VPlan(Entry, ScalarHeader) { TripCount = TC; } - /// Construct a VPlan with original preheader \p Preheader, \p Entry to - /// the plan and with \p ScalarHeader wrapping the original header of the - /// scalar loop. At the moment, \p Preheader and \p Entry need to be - /// disconnected, as the bypass blocks between them are not yet modeled in - /// VPlan. - VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry, - VPIRBasicBlock *ScalarHeader) - : Entry(Entry), Preheader(Preheader), ScalarHeader(ScalarHeader) { + /// Constructor variants that take disconnected preheader and entry blocks, + /// connecting them as part of construction. + /// FIXME: Only used to reduce the need of code changes during transition. + VPlan(VPBasicBlock *OriginalPreheader, VPValue *TC, + VPBasicBlock *EntryVectorPreHeader, VPIRBasicBlock *ScalarHeader); + VPlan(VPBasicBlock *OriginalPreheader, VPBasicBlock *EntryVectorPreHeader, + VPIRBasicBlock *ScalarHeader); + + /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader + /// wrapping the original header of the scalar loop. 
+ VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) + : Entry(Entry), ScalarHeader(ScalarHeader) { Entry->setPlan(this); - Preheader->setPlan(this); - assert(Preheader->getNumSuccessors() == 0 && - Preheader->getNumPredecessors() == 0 && - "preheader must be disconnected"); assert(ScalarHeader->getNumSuccessors() == 0 && "scalar header must be a leaf node"); } ~VPlan(); + void setEntry(VPBasicBlock *VPBB) { + Entry = VPBB; + VPBB->setPlan(this); + } + /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping - /// original scalar pre-header ) which contains SCEV expansions that need - /// to happen before the CFG is modified; a VPBasicBlock for the vector - /// pre-header, followed by a region for the vector loop, followed by the - /// middle VPBasicBlock. If a check is needed to guard executing the scalar - /// epilogue loop, it will be added to the middle block, together with - /// VPBasicBlocks for the scalar preheader and exit blocks. - /// \p InductionTy is the type of the canonical induction and used for related - /// values, like the trip count expression. + /// original scalar pre-header) which contains SCEV expansions that need + /// to happen before the CFG is modified (when executing a VPlan for the + /// epilogue vector loop, the original entry needs to be replaced by a new + /// one); a VPBasicBlock for the vector pre-header, followed by a region for + /// the vector loop, followed by the middle VPBasicBlock. If a check is needed + /// to guard executing the scalar epilogue loop, it will be added to the + /// middle block, together with VPBasicBlocks for the scalar preheader and + /// exit blocks. \p InductionTy is the type of the canonical induction and + /// used for related values, like the trip count expression. static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, @@ -3884,26 +3884,22 @@ class VPlan { } /// Returns the VPRegionBlock of the vector loop. 
- VPRegionBlock *getVectorLoopRegion() { - return cast(getEntry()->getSingleSuccessor()); - } - const VPRegionBlock *getVectorLoopRegion() const { - return cast(getEntry()->getSingleSuccessor()); - } + VPRegionBlock *getVectorLoopRegion(); + const VPRegionBlock *getVectorLoopRegion() const; /// Returns the 'middle' block of the plan, that is the block that selects /// whether to execute the scalar tail loop or the exit block from the loop /// latch. const VPBasicBlock *getMiddleBlock() const { - return cast(getScalarPreheader()->getSinglePredecessor()); + return cast(getScalarPreheader()->getPredecessors().front()); } VPBasicBlock *getMiddleBlock() { - return cast(getScalarPreheader()->getSinglePredecessor()); + return cast(getScalarPreheader()->getPredecessors().front()); } /// Return the VPBasicBlock for the preheader of the scalar loop. VPBasicBlock *getScalarPreheader() const { - return cast(ScalarHeader->getSinglePredecessor()); + return cast(getScalarHeader()->getSinglePredecessor()); } /// Return the VPIRBasicBlock wrapping the header of the scalar loop. @@ -4039,8 +4035,10 @@ class VPlan { } /// \return The block corresponding to the original preheader. - VPBasicBlock *getPreheader() { return Preheader; } - const VPBasicBlock *getPreheader() const { return Preheader; } + /// FIXME: There's no separate preheader any longer and Entry now serves the + /// same purpose as the original preheader. Remove after transition. + VPBasicBlock *getPreheader() { return Entry; } + const VPBasicBlock *getPreheader() const { return Entry; } /// Clone the current VPlan, update all VPValues of the new VPlan and cloned /// recipes to refer to the clones, and return it. 
@@ -4190,8 +4188,6 @@ class VPBlockUtils { "Can't connect two block with different parents"); assert((SuccIdx != -1u || From->getNumSuccessors() < 2) && "Blocks can't have more than two successors."); - assert((PredIdx != -1u || To->getNumPredecessors() < 2) && - "Blocks can't have more than two predecessors."); if (SuccIdx == -1u) From->appendSuccessor(To); else diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 969d07b229e46..0d981ff5826ed 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -127,7 +127,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R) || isa(R)) && + assert((isa(R)) && "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8fea2ca946104..02774d8e5c5fe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -567,6 +567,9 @@ Value *VPInstruction::generate(VPTransformState &State) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); + else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) + ReducedPartRdx = + createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart); else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } @@ -575,7 +578,8 @@ Value *VPInstruction::generate(VPTransformState &State) { // Create the reduction after the loop. Note that inloop reductions create // the target reduction in the loop using a Reduction recipe. 
if ((State.VF.isVector() || - RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && !PhiR->isInLoop()) { ReducedPartRdx = createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); @@ -629,14 +633,14 @@ Value *VPInstruction::generate(VPTransformState &State) { Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name); BasicBlock *VPlanPred = State.CFG - .VPBB2IRBB[cast(getParent()->getSinglePredecessor())]; + .VPBB2IRBB[cast(getParent()->getPredecessors()[0])]; NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred); // TODO: Predecessors are temporarily reversed to reduce test changes. // Remove it and update remaining tests after functional change landed. auto Predecessors = to_vector(predecessors(Builder.GetInsertBlock())); for (auto *OtherPred : reverse(Predecessors)) { - assert(OtherPred != VPlanPred && - "VPlan predecessors should not be connected yet"); + if (OtherPred == VPlanPred) + continue; NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred); } return NewPhi; @@ -3253,13 +3257,22 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, void VPExpandSCEVRecipe::execute(VPTransformState &State) { assert(!State.Lane && "cannot be used in per-lane"); + if (State.ExpandedSCEVs.contains(Expr)) { + // SCEV Expr has already been expanded, result must already be set. At the + // moment we have to execute the entry block twice (once before skeleton + // creation to get expanded SCEVs used by the skeleton and once during + // regular VPlan execution). 
+ State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]); + assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] && + "Results must match"); + return; + } + const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); SCEVExpander Exp(SE, DL, "induction"); Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), &*State.Builder.GetInsertPoint()); - assert(!State.ExpandedSCEVs.contains(Expr) && - "Same SCEV expanded multiple times"); State.ExpandedSCEVs[Expr] = Res; State.set(this, Res, VPLane(0)); } @@ -3398,6 +3411,20 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Builder.SetInsertPoint(VectorPH->getTerminator()); StartV = Iden = State.get(StartVPV); } + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + // [I|F]FindLastIV will use a sentinel value to initialize the reduction + // phi. In the exit block, ComputeReductionResult will generate checks to + // verify if the reduction result is the sentinel value. If the result is + // the sentinel value, it will be corrected back to the start value. + // TODO: The sentinel value is not always necessary. When the start value is + // a constant, and smaller than the start value of the induction variable, + // the start value can be directly used to initialize the reduction phi. 
+ StartV = Iden = RdxDesc.getSentinelValue(); + if (!ScalarPHI) { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden); + } } else { Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e27c1bfba9352..f7b9a676ae808 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -377,7 +377,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { continue; auto *PredVPBB = dyn_cast_or_null(VPBB->getSinglePredecessor()); - if (!PredVPBB || PredVPBB->getNumSuccessors() != 1) + if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 || + isa(PredVPBB)) continue; WorkList.push_back(VPBB); } @@ -1687,8 +1688,8 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // instruction. Widen memory instructions involved in address computation // will lead to gather/scatter instructions, which don't need to be // handled. - if (isa(CurRec) || isa(CurRec) || - isa(CurRec) || isa(CurRec)) + if (isa(CurRec)) continue; // This recipe contributes to the address computation of a widen diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index ff6c9295ee205..89e372d6b46cf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -412,8 +412,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { UnrollState Unroller(Plan, UF, Ctx); - Unroller.unrollBlock(Plan.getPreheader()); - // Iterate over all blocks in the plan starting from Entry, and unroll // recipes inside them. This includes the vector preheader and middle blocks, // which may set up or post-process per-part values. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4621c28b05129..e40af3e2e3d30 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -34,7 +34,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, Expanded = Plan.getOrAddLiveIn(E->getValue()); else { Expanded = new VPExpandSCEVRecipe(Expr, SE); - Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe()); + Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); } Plan.addSCEVExpansion(Expr, Expanded); return Expanded; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 71c7d547ac7d9..be420a873bef5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -185,7 +185,7 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { RecipeNumbering[&R] = Cnt++; for (const VPRecipeBase &R : *VPBB) { - if (isa(&R) ^ isa(VPBB)) { + if (isa(&R) && !isa(VPBB)) { errs() << "VPIRInstructions "; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) R.dump(); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 9003642f1f93b..0c324cbab88bc 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -115,6 +115,7 @@ class VectorCombine { bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + bool foldConcatOfBoolMasks(Instruction &I); bool foldPermuteOfBinops(Instruction &I); bool foldShuffleOfBinops(Instruction &I); bool foldShuffleOfCastops(Instruction &I); @@ -1423,6 +1424,113 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } +/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))" +/// to "(bitcast 
(concat X, Y))" +/// where X/Y are bitcasted from i1 mask vectors. +bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) { + Type *Ty = I.getType(); + if (!Ty->isIntegerTy()) + return false; + + // TODO: Add big endian test coverage + if (DL->isBigEndian()) + return false; + + // Restrict to disjoint cases so the mask vectors aren't overlapping. + Instruction *X, *Y; + if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y)))) + return false; + + // Allow both sources to contain shl, to handle more generic pattern: + // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))" + Value *SrcX; + uint64_t ShAmtX = 0; + if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) && + !match(X, m_OneUse( + m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))), + m_ConstantInt(ShAmtX))))) + return false; + + Value *SrcY; + uint64_t ShAmtY = 0; + if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) && + !match(Y, m_OneUse( + m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))), + m_ConstantInt(ShAmtY))))) + return false; + + // Canonicalize larger shift to the RHS. + if (ShAmtX > ShAmtY) { + std::swap(X, Y); + std::swap(SrcX, SrcY); + std::swap(ShAmtX, ShAmtY); + } + + // Ensure both sources are matching vXi1 bool mask types, and that the shift + // difference is the mask width so they can be easily concatenated together. 
+ uint64_t ShAmtDiff = ShAmtY - ShAmtX; + unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0); + unsigned BitWidth = Ty->getPrimitiveSizeInBits(); + auto *MaskTy = dyn_cast(SrcX->getType()); + if (!MaskTy || SrcX->getType() != SrcY->getType() || + !MaskTy->getElementType()->isIntegerTy(1) || + MaskTy->getNumElements() != ShAmtDiff || + MaskTy->getNumElements() > (BitWidth / 2)) + return false; + + auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy); + auto *ConcatIntTy = + Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements()); + auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff); + + SmallVector ConcatMask(ConcatTy->getNumElements()); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + + // TODO: Is it worth supporting multi use cases? + InstructionCost OldCost = 0; + OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind); + OldCost += + NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); + OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy, + TTI::CastContextHint::None, CostKind); + OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy, + TTI::CastContextHint::None, CostKind); + + InstructionCost NewCost = 0; + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy, + ConcatMask, CostKind); + NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy, + TTI::CastContextHint::None, CostKind); + if (Ty != ConcatIntTy) + NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy, + TTI::CastContextHint::None, CostKind); + if (ShAmtX > 0) + NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); + + if (NewCost > OldCost) + return false; + + // Build bool mask concatenation, bitcast back to scalar integer, and perform + // any residual zero-extension or shifting. 
+ Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask); + Worklist.pushValue(Concat); + + Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy); + + if (Ty != ConcatIntTy) { + Worklist.pushValue(Result); + Result = Builder.CreateZExt(Result, Ty); + } + + if (ShAmtX > 0) { + Worklist.pushValue(Result); + Result = Builder.CreateShl(Result, ShAmtX); + } + + replaceValue(I, *Result); + return true; +} + /// Try to convert "shuffle (binop (shuffle, shuffle)), undef" /// --> "binop (shuffle), (shuffle)". bool VectorCombine::foldPermuteOfBinops(Instruction &I) { @@ -2873,6 +2981,7 @@ bool VectorCombine::run() { bool MadeChange = false; auto FoldInst = [this, &MadeChange](Instruction &I) { Builder.SetInsertPoint(&I); + bool IsVectorType = isa(I.getType()); bool IsFixedVectorType = isa(I.getType()); auto Opcode = I.getOpcode(); @@ -2895,7 +3004,7 @@ bool VectorCombine::run() { // This transform works with scalable and fixed vectors // TODO: Identify and allow other scalable transforms - if (isa(I.getType())) { + if (IsVectorType) { MadeChange |= scalarizeBinopOrCmp(I); MadeChange |= scalarizeLoadExtract(I); MadeChange |= scalarizeVPIntrinsic(I); @@ -2944,6 +3053,9 @@ bool VectorCombine::run() { case Instruction::FCmp: MadeChange |= foldExtractExtract(I); break; + case Instruction::Or: + MadeChange |= foldConcatOfBoolMasks(I); + [[fallthrough]]; default: if (Instruction::isBinaryOp(Opcode)) { MadeChange |= foldExtractExtract(I); diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll index 88061756d8fee..d93ef6f8b2869 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll @@ -5755,8 +5755,8 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5785,8 +5785,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5815,8 +5815,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5845,8 +5845,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5875,8 +5875,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6230,8 +6230,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_monotonic: -; 
-O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6252,8 +6252,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6274,8 +6274,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6296,8 +6296,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6318,8 +6318,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6725,8 +6725,8 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 
; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6755,8 +6755,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6785,8 +6785,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6815,8 +6815,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6845,8 +6845,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7200,8 +7200,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, 
x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7222,8 +7222,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7244,8 +7244,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7266,8 +7266,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7288,8 +7288,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7690,8 +7690,8 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: 
subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7720,8 +7720,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7750,8 +7750,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7780,8 +7780,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7810,8 +7810,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8160,8 +8160,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel 
w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8182,8 +8182,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8204,8 +8204,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8226,8 +8226,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8248,8 +8248,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8650,8 +8650,8 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, 
w10, eq ; -O0: ands w13, w10, #0x1 @@ -8680,8 +8680,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8710,8 +8710,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8740,8 +8740,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8770,8 +8770,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -9120,8 +9120,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, 
#0x1 @@ -9142,8 +9142,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9164,8 +9164,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9186,8 +9186,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9208,8 +9208,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll index a1712a5ec7a27..1fad4a6b54f6b 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll @@ -3930,8 +3930,8 @@ define dso_local i64 
@atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -3958,8 +3958,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -3986,8 +3986,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4014,8 +4014,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4042,8 +4042,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4325,8 +4325,8 @@ define dso_local i64 
@atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4347,8 +4347,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4369,8 +4369,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4391,8 +4391,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4413,8 +4413,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4575,8 +4575,8 @@ define dso_local i64 
@atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4603,8 +4603,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4631,8 +4631,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4659,8 +4659,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4687,8 +4687,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4970,8 +4970,8 @@ define dso_local i64 
@atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4992,8 +4992,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5014,8 +5014,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5036,8 +5036,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5058,8 +5058,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5220,8 +5220,8 @@ define dso_local i64 
@atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5248,8 +5248,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5276,8 +5276,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5304,8 +5304,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5332,8 +5332,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5610,8 +5610,8 @@ define dso_local i64 
@atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5632,8 +5632,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5654,8 +5654,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5676,8 +5676,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5698,8 +5698,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5860,8 +5860,8 @@ define dso_local i64 
@atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5888,8 +5888,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5916,8 +5916,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5944,8 +5944,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5972,8 +5972,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -6250,8 +6250,8 @@ define dso_local i64 
@atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6272,8 +6272,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6294,8 +6294,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6316,8 +6316,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6338,8 +6338,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 diff --git 
a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll index e9b096e8c6c44..4605bdd2f6073 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll @@ -4390,8 +4390,8 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -4416,8 +4416,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -4442,8 +4442,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -4468,8 +4468,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -4494,8 +4494,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 
%value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -4835,8 +4835,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4857,8 +4857,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4879,8 +4879,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4901,8 +4901,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4923,8 +4923,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define 
dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5290,8 +5290,8 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -5316,8 +5316,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -5342,8 +5342,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -5368,8 +5368,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -5394,8 +5394,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 
@atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -5735,8 +5735,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5757,8 +5757,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5779,8 +5779,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5801,8 +5801,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5823,8 +5823,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 
@atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6185,8 +6185,8 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -6211,8 +6211,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -6237,8 +6237,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -6263,8 +6263,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -6289,8 +6289,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 
@atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -6625,8 +6625,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6647,8 +6647,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6669,8 +6669,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6691,8 +6691,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6713,8 +6713,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 
@atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7075,8 +7075,8 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -7101,8 +7101,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -7127,8 +7127,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -7153,8 +7153,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -7179,8 +7179,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 
@atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x11 +; -O0: subs x10, x10, x9 ; -O0: subs x12, x12, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w12, w10, #0x1 @@ -7515,8 +7515,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7537,8 +7537,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7559,8 +7559,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7581,8 +7581,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7603,8 +7603,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 
@atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll index 4f9e520997a22..912d87dcd2b9b 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll @@ -5755,8 +5755,8 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5785,8 +5785,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5815,8 +5815,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5845,8 +5845,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: 
subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5875,8 +5875,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6230,8 +6230,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6252,8 +6252,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6274,8 +6274,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6296,8 +6296,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: 
subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6318,8 +6318,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6725,8 +6725,8 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6755,8 +6755,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6785,8 +6785,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6815,8 +6815,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, 
x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6845,8 +6845,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7200,8 +7200,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7222,8 +7222,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7244,8 +7244,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7266,8 +7266,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, 
x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7288,8 +7288,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7690,8 +7690,8 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7720,8 +7720,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7750,8 +7750,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7780,8 +7780,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, 
w10, eq ; -O0: ands w13, w10, #0x1 @@ -7810,8 +7810,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8160,8 +8160,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8182,8 +8182,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8204,8 +8204,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8226,8 +8226,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; 
-O0: ands w12, w11, #0x1 @@ -8248,8 +8248,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8650,8 +8650,8 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8680,8 +8680,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8710,8 +8710,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8740,8 +8740,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ 
-8770,8 +8770,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -9120,8 +9120,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9142,8 +9142,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9164,8 +9164,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9186,8 +9186,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9208,8 
+9208,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll index 3437ccc8be40d..725558f2dcf72 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll @@ -5755,8 +5755,8 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5785,8 +5785,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5815,8 +5815,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5845,8 +5845,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 
@atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5875,8 +5875,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6230,8 +6230,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6252,8 +6252,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6274,8 +6274,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6296,8 +6296,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 
@atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6318,8 +6318,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6725,8 +6725,8 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6755,8 +6755,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6785,8 +6785,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6815,8 +6815,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 
@atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6845,8 +6845,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7200,8 +7200,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7222,8 +7222,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7244,8 +7244,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7266,8 +7266,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 
@atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7288,8 +7288,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7690,8 +7690,8 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7720,8 +7720,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7750,8 +7750,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7780,8 +7780,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 
@atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7810,8 +7810,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8160,8 +8160,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8182,8 +8182,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8204,8 +8204,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8226,8 +8226,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 
@atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8248,8 +8248,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8650,8 +8650,8 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8680,8 +8680,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8710,8 +8710,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8740,8 +8740,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 
@atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8770,8 +8770,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -9120,8 +9120,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9142,8 +9142,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9164,8 +9164,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9186,8 +9186,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 
@atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9208,8 +9208,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll index ee5fbe39b4492..51933261313ea 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll @@ -4055,8 +4055,8 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4083,8 +4083,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4111,8 +4111,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_max_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4139,8 +4139,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4167,8 +4167,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4450,8 +4450,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4472,8 +4472,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4494,8 +4494,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_max_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4516,8 +4516,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4538,8 +4538,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -4700,8 +4700,8 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4728,8 +4728,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4756,8 +4756,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_min_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4784,8 +4784,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -4812,8 +4812,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5095,8 +5095,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5117,8 +5117,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5139,8 +5139,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_min_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5161,8 +5161,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5183,8 +5183,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5345,8 +5345,8 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5373,8 +5373,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5401,8 +5401,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_umax_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5429,8 +5429,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5457,8 +5457,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -5735,8 +5735,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5757,8 +5757,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5779,8 +5779,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_umax_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5801,8 +5801,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5823,8 +5823,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -5985,8 +5985,8 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -6013,8 +6013,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -6041,8 +6041,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_umin_i128_aligned_release: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -6069,8 +6069,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -6097,8 +6097,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x9, x9, x10 ; -O0: subs x9, x9, x12 +; -O0: subs x11, x11, x10 ; -O0: subs x13, x13, x10 ; -O0: csel w11, w9, w11, eq ; -O0: ands w13, w11, #0x1 @@ -6375,8 +6375,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6397,8 +6397,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6419,8 +6419,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_umin_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6441,8 +6441,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6463,8 +6463,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll index 2473147509dc8..004e433d9408a 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll @@ -5755,8 +5755,8 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5785,8 +5785,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 
; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5815,8 +5815,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5845,8 +5845,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -5875,8 +5875,8 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6230,8 +6230,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6252,8 +6252,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: 
csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6274,8 +6274,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6296,8 +6296,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6318,8 +6318,8 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -6725,8 +6725,8 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6755,8 +6755,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: 
ands w13, w10, #0x1 @@ -6785,8 +6785,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6815,8 +6815,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -6845,8 +6845,8 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7200,8 +7200,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7222,8 +7222,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7244,8 
+7244,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7266,8 +7266,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7288,8 +7288,8 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -7690,8 +7690,8 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7720,8 +7720,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7750,8 +7750,8 @@ define 
dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7780,8 +7780,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -7810,8 +7810,8 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8160,8 +8160,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8182,8 +8182,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8204,8 +8204,8 @@ define dso_local i128 
@atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8226,8 +8226,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8248,8 +8248,8 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -8650,8 +8650,8 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) { define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8680,8 +8680,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8710,8 +8710,8 @@ define dso_local i128 
@atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8740,8 +8740,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -8770,8 +8770,8 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x8, x8, x9 ; -O0: subs x8, x8, x12 +; -O0: subs x10, x10, x9 ; -O0: subs x13, x13, x9 ; -O0: csel w10, w8, w10, eq ; -O0: ands w13, w10, #0x1 @@ -9120,8 +9120,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value) define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_monotonic: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9142,8 +9142,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acquire: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9164,8 +9164,8 @@ define dso_local i128 
@atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_release: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9186,8 +9186,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_acq_rel: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 @@ -9208,8 +9208,8 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_unaligned_seq_cst: -; -O0: subs x8, x8, x10 ; -O0: subs x8, x8, x9 +; -O0: subs x11, x11, x10 ; -O0: subs x12, x12, x10 ; -O0: csel w11, w8, w11, eq ; -O0: ands w12, w11, #0x1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir index bd80a892e239e..d96f6fbc12c7c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir @@ -47,10 +47,10 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967296 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[C1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[C1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[C]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: 
[[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[C]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[C1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[C1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: G_BRCOND [[SELECT]](s32), %bb.1 ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir index e69f79bdd187a..18c4f3c31efa5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir @@ -121,17 +121,17 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[DEF]](s64), [[DEF]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[DEF]](s64), [[DEF]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[C1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[DEF]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]] - ; CHECK-NEXT: 
[[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] - ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s32), [[ICMP5]], [[ICMP3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[DEF]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP3]], [[ICMP4]] ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[SELECT2]](s32), [[C2]], [[SELECT1]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SELECT3]], 2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll index f7aa57a068a4c..8a503bb65c079 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -784,12 +784,11 @@ define i128 @sminv_v2i128(<2 x i128> %a) { ; ; CHECK-GI-LABEL: sminv_v2i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w8, lt ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: cset w8, lo ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w8, w9, w8, eq +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: csel w8, w8, w9, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: csel x0, x0, x2, ne ; CHECK-GI-NEXT: csel x1, x1, x3, ne @@ -1145,12 +1144,11 @@ define i128 @smaxv_v2i128(<2 x i128> %a) { ; ; CHECK-GI-LABEL: smaxv_v2i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w8, gt ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w8, w9, w8, eq +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: csel w8, w8, w9, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: csel x0, x0, x2, ne ; CHECK-GI-NEXT: 
csel x1, x1, x3, ne @@ -1504,12 +1502,11 @@ define i128 @uminv_v2i128(<2 x i128> %a) { ; ; CHECK-GI-LABEL: uminv_v2i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w8, lo ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: cset w8, lo ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w8, w9, w8, eq +; CHECK-GI-NEXT: cset w9, lo +; CHECK-GI-NEXT: csel w8, w8, w9, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: csel x0, x0, x2, ne ; CHECK-GI-NEXT: csel x1, x1, x3, ne @@ -1861,12 +1858,11 @@ define i128 @umaxv_v2i128(<2 x i128> %a) { ; ; CHECK-GI-LABEL: umaxv_v2i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w8, w9, w8, eq +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: csel w8, w8, w9, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: csel x0, x0, x2, ne ; CHECK-GI-NEXT: csel x1, x1, x3, ne diff --git a/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir b/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir index 2a8961649b26c..bff6d1d71b7c4 100644 --- a/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir +++ b/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir @@ -1,12 +1,12 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-topdown=true -sched-print-cycles=true \ +# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ # RUN: 2>&1 | FileCheck %s --check-prefix=TOP --strict-whitespace # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-bottomup=true -sched-print-cycles=true \ +# RUN: -misched-prera-direction=bottomup 
-sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ # RUN: 2>&1 | FileCheck %s --check-prefix=BOTTOM --strict-whitespace diff --git a/llvm/test/CodeGen/AArch64/force-enable-intervals.mir b/llvm/test/CodeGen/AArch64/force-enable-intervals.mir index 98bee7a579c05..a53d4e7480307 100644 --- a/llvm/test/CodeGen/AArch64/force-enable-intervals.mir +++ b/llvm/test/CodeGen/AArch64/force-enable-intervals.mir @@ -1,12 +1,12 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -misched-dump-reserved-cycles=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler \ -# RUN: -o - %s 2>&1 -misched-topdown| FileCheck %s +# RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -misched-dump-reserved-cycles=true -sched-model-force-enable-intervals=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler \ -# RUN: -o - %s 2>&1 -misched-topdown| FileCheck %s --check-prefix=FORCE +# RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s --check-prefix=FORCE # REQUIRES: asserts, aarch64-registered-target --- diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 61964060ca2c8..e284795760c5c 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1403,28 +1403,26 @@ define <2 x i128> @v2i128_i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %d, <2 x ; ; CHECK-GI-LABEL: v2i128_i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: cmp x1, x5 -; CHECK-GI-NEXT: ldp x8, x9, [sp] -; CHECK-GI-NEXT: cset w10, lt ; CHECK-GI-NEXT: cmp x0, x4 -; CHECK-GI-NEXT: cset w13, lo +; CHECK-GI-NEXT: ldp x9, x10, [sp] +; CHECK-GI-NEXT: cset w8, lo ; CHECK-GI-NEXT: cmp x1, x5 -; CHECK-GI-NEXT: csel w10, w13, w10, eq -; CHECK-GI-NEXT: cmp x3, x7 -; CHECK-GI-NEXT: ldp x13, x14, [sp, #32] -; CHECK-GI-NEXT: cset w15, lt +; CHECK-GI-NEXT: cset w11, lt 
+; CHECK-GI-NEXT: ldp x14, x15, [sp, #32] +; CHECK-GI-NEXT: csel w8, w8, w11, eq ; CHECK-GI-NEXT: cmp x2, x6 -; CHECK-GI-NEXT: ldp x11, x12, [sp, #16] -; CHECK-GI-NEXT: cset w16, lo +; CHECK-GI-NEXT: cset w11, lo ; CHECK-GI-NEXT: cmp x3, x7 +; CHECK-GI-NEXT: ldp x12, x13, [sp, #16] +; CHECK-GI-NEXT: cset w16, lt ; CHECK-GI-NEXT: ldp x17, x18, [sp, #48] -; CHECK-GI-NEXT: csel w15, w16, w15, eq -; CHECK-GI-NEXT: tst w10, #0x1 -; CHECK-GI-NEXT: csel x0, x8, x13, ne -; CHECK-GI-NEXT: csel x1, x9, x14, ne -; CHECK-GI-NEXT: tst w15, #0x1 -; CHECK-GI-NEXT: csel x2, x11, x17, ne -; CHECK-GI-NEXT: csel x3, x12, x18, ne +; CHECK-GI-NEXT: csel w11, w11, w16, eq +; CHECK-GI-NEXT: tst w8, #0x1 +; CHECK-GI-NEXT: csel x0, x9, x14, ne +; CHECK-GI-NEXT: csel x1, x10, x15, ne +; CHECK-GI-NEXT: tst w11, #0x1 +; CHECK-GI-NEXT: csel x2, x12, x17, ne +; CHECK-GI-NEXT: csel x3, x13, x18, ne ; CHECK-GI-NEXT: ret entry: %c = icmp slt <2 x i128> %a, %b diff --git a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir index 6fb8ba2dfc839..ea40f9e52dcd6 100644 --- a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir +++ b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -mcpu=cortex-a55 %s -o - 2>&1 \ # RUN: -misched-dump-reserved-cycles=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler \ -# RUN: -misched-bottomup=true -sched-print-cycles=true \ +# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ # RUN: -misched-detail-resource-booking=true \ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ # RUN: | FileCheck %s diff --git a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir index 9c9b6e281b15d..9be91b8a01e86 100644 --- 
a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir +++ b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir @@ -1,6 +1,6 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-bottomup=true -sched-print-cycles=true \ +# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ # RUN: -misched-dump-reserved-cycles=true -misched-detail-resource-booking=true\ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ # RUN: 2>&1 | FileCheck %s diff --git a/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir b/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir index 4b77444ec60d2..b04fd89b796ba 100644 --- a/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir +++ b/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir @@ -1,11 +1,11 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-topdown=true -sched-print-cycles=true \ +# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=true 2>&1 | FileCheck --check-prefix=SORTED %s # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-topdown=true -sched-print-cycles=true \ +# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=false 2>&1 | FileCheck --check-prefix=UNSORTED %s # REQUIRES: asserts, aarch64-registered-target diff --git a/llvm/test/CodeGen/AArch64/scmp.ll b/llvm/test/CodeGen/AArch64/scmp.ll index 4aff5a836e1a1..be167fde7dbe6 100644 --- a/llvm/test/CodeGen/AArch64/scmp.ll +++ 
b/llvm/test/CodeGen/AArch64/scmp.ll @@ -79,20 +79,18 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; ; CHECK-GI-LABEL: scmp.8.128: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w8, gt ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w8, w9, w8, eq +; CHECK-GI-NEXT: cset w9, gt +; CHECK-GI-NEXT: csel w8, w8, w9, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: cset w8, ne -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w9, lt ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w10, lo +; CHECK-GI-NEXT: cset w9, lo ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w9, w10, w9, eq +; CHECK-GI-NEXT: cset w10, lt +; CHECK-GI-NEXT: csel w9, w9, w10, eq ; CHECK-GI-NEXT: tst w9, #0x1 ; CHECK-GI-NEXT: csinv w0, w8, wzr, eq ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/selectopt-cast.ll b/llvm/test/CodeGen/AArch64/selectopt-cast.ll index 4afb6af6e757c..48d3301185f72 100644 --- a/llvm/test/CodeGen/AArch64/selectopt-cast.ll +++ b/llvm/test/CodeGen/AArch64/selectopt-cast.ll @@ -729,3 +729,127 @@ loop: exit: ret void } + +define void @test_add_lshr_add_regular_select(ptr %dst, ptr %src, i64 %i.start, i64 %j.start) { +; CHECK-LABEL: @test_add_lshr_add_regular_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 100000, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[SELECT_END:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[I_NEXT:%.*]], [[SELECT_END]] ] +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_START:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[SELECT_END]] ] +; CHECK-NEXT: [[GEP_I:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GEP_I]], align 8 +; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 [[J]] +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[GEP_J]], align 8 +; 
CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP1]], -1 +; CHECK-NEXT: [[SHIFT:%.*]] = lshr i64 [[TMP1]], 63 +; CHECK-NEXT: [[CMP_FROZEN:%.*]] = freeze i1 [[CMP]] +; CHECK-NEXT: br i1 [[CMP_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[I]], 1 +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.false.sink: +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[J]], 1 +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[J_NEXT]] = phi i64 [ [[J]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: [[I_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[I]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[J]], [[SELECT_TRUE_SINK]] ], [ [[I]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: [[INC:%.*]] = zext i1 [[CMP]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[IV]] +; CHECK-NEXT: store i64 [[COND]], ptr [[GEP_DST]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 100000, %entry ], [ %iv.next, %loop ] + %i = phi i64 [ %i.start, %entry ], [ %i.next, %loop ] + %j = phi i64 [ %j.start, %entry ], [ %j.next, %loop ] + %gep.i = getelementptr inbounds ptr, ptr %src, i64 %i + %0 = load ptr, ptr %gep.i, align 8 + %gep.j = getelementptr inbounds i64, ptr %0, i64 %j + %1 = load i64, ptr %gep.j, align 8 + %cmp = icmp sgt i64 %1, -1 + %shift = lshr i64 %1, 63 + %j.next = add nsw i64 %j, %shift + %inc = zext i1 %cmp to i64 + %i.next = add nsw i64 %i, %inc + %cond = select i1 %cmp, i64 %j, i64 %i + %gep.dst = getelementptr i64, ptr %dst, i64 %iv + store i64 %cond, ptr %gep.dst, align 8 + %iv.next = add nsw i64 %iv, -1 + %ec = icmp eq i64 %iv.next, 0 + br i1 %ec, label 
%exit, label %loop + +exit: + ret void +} + +define void @test_add_ashr_add_regular_select(ptr %dst, ptr %src, i64 %i.start, i64 %j.start) { +; CHECK-LABEL: @test_add_ashr_add_regular_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 100000, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[SELECT_END:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[I_NEXT:%.*]], [[SELECT_END]] ] +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_START:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[SELECT_END]] ] +; CHECK-NEXT: [[GEP_I:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GEP_I]], align 8 +; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 [[J]] +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[GEP_J]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP1]], -1 +; CHECK-NEXT: [[SHIFT:%.*]] = ashr i64 [[TMP1]], 63 +; CHECK-NEXT: [[CMP_FROZEN:%.*]] = freeze i1 [[CMP]] +; CHECK-NEXT: br i1 [[CMP_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[I]], 1 +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.false.sink: +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[J]], -1 +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[J_NEXT]] = phi i64 [ [[J]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: [[I_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[I]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[J]], [[SELECT_TRUE_SINK]] ], [ [[I]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: [[INC:%.*]] = zext i1 [[CMP]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[IV]] +; CHECK-NEXT: store i64 [[COND]], ptr [[GEP_DST]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 
[[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 100000, %entry ], [ %iv.next, %loop ] + %i = phi i64 [ %i.start, %entry ], [ %i.next, %loop ] + %j = phi i64 [ %j.start, %entry ], [ %j.next, %loop ] + %gep.i = getelementptr inbounds ptr, ptr %src, i64 %i + %0 = load ptr, ptr %gep.i, align 8 + %gep.j = getelementptr inbounds i64, ptr %0, i64 %j + %1 = load i64, ptr %gep.j, align 8 + %cmp = icmp sgt i64 %1, -1 + %shift = ashr i64 %1, 63 + %j.next = add nsw i64 %j, %shift + %inc = zext i1 %cmp to i64 + %i.next = add nsw i64 %i, %inc + %cond = select i1 %cmp, i64 %j, i64 %i + %gep.dst = getelementptr i64, ptr %dst, i64 %iv + store i64 %cond, ptr %gep.dst, align 8 + %iv.next = add nsw i64 %iv, -1 + %ec = icmp eq i64 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 1e835c92ba9e4..ef569e480ea3d 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s target triple="aarch64-linux-gnu" @@ -26,18 +26,18 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: 
mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -68,18 +68,18 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } -; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } +; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -110,18 +110,18 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; 
CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #1 { call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -152,18 +152,18 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, %unused, < define void @usdot_multi_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } -; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } +; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void 
@llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -197,18 +197,18 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -239,18 +239,18 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } -; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov 
z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } +; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #0 { call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -281,18 +281,18 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z26.d, z7.d -; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z25.d, z6.d -; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z24.d, z5.d -; CHECK-NEXT: mov z29.d, z2.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] -; CHECK-NEXT: mov z28.d, z1.d -; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } -; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } +; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } ; CHECK-NEXT: ret %zn4, %zn5, %zn6, %zn7) #1 { call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, %zn0, %zn1, %zn2, %zn3, @@ -309,9 +309,7 @@ define void @sdot_multi_za64_u16_vg1x4(i32 %slice, %unused, < define void @udot_single_za32_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: udot_single_za32_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ 
-324,11 +322,7 @@ define void @udot_single_za32_u16_vg1x2(i32 %slice, %unused, define void @udot_single_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_single_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -341,9 +335,7 @@ define void @udot_single_za32_u16_vg1x4(i32 %slice, %unused, define void @udot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: udot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -356,11 +348,7 @@ define void @udot_single_za32_u8_vg1x2(i32 %slice, %unused, < define void @udot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -373,9 +361,7 @@ define void 
@udot_single_za32_u8_vg1x4(i32 %slice, %unused, < define void @udot_single_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: udot_single_za64_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ -388,11 +374,7 @@ define void @udot_single_za64_u16_vg1x2(i32 %slice, %unused, define void @udot_single_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #1 { ; CHECK-LABEL: udot_single_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -405,9 +387,7 @@ define void @udot_single_za64_u16_vg1x4(i32 %slice, %unused, define void @usdot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: usdot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -420,11 +400,7 @@ define void @usdot_single_za32_u8_vg1x2(i32 %slice, %unused, define void @usdot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: usdot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // 
kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -440,9 +416,7 @@ define void @usdot_single_za32_u8_vg1x4(i32 %slice, %unused, define void @sdot_single_za32_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: sdot_single_za32_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ -455,11 +429,7 @@ define void @sdot_single_za32_u16_vg1x2(i32 %slice, %unused, define void @sdot_single_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: sdot_single_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -472,9 +442,7 @@ define void @sdot_single_za32_u16_vg1x4(i32 %slice, %unused, define void @sdot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: sdot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def 
$z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -487,11 +455,7 @@ define void @sdot_single_za32_u8_vg1x2(i32 %slice, %unused, < define void @sdot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: sdot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -504,9 +468,7 @@ define void @sdot_single_za32_u8_vg1x4(i32 %slice, %unused, < define void @sdot_single_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: sdot_single_za64_u16_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h ; CHECK-NEXT: ret @@ -519,11 +481,7 @@ define void @sdot_single_za64_u16_vg1x2(i32 %slice, %unused, define void @sdot_single_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #1 { ; CHECK-LABEL: sdot_single_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; 
CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h ; CHECK-NEXT: ret @@ -536,9 +494,7 @@ define void @sdot_single_za64_u16_vg1x4(i32 %slice, %unused, define void @sudot_single_za32_u8_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { ; CHECK-LABEL: sudot_single_za32_u8_vg1x2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b ; CHECK-NEXT: ret @@ -551,11 +507,7 @@ define void @sudot_single_za32_u8_vg1x2(i32 %slice, %unused, define void @sudot_single_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: sudot_single_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b ; CHECK-NEXT: ret @@ -571,8 +523,8 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_lane_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; 
CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3] ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3] ; CHECK-NEXT: ret @@ -605,8 +553,8 @@ define void @udot_lane_za32_u8_vg1x2(i32 %slice, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: udot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 
%stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + define void @udot_lane_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: udot_lane_za64_u16_vg1x2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: ret @@ -654,8 +676,8 @@ define void @udot_lane_za64_u16_vg1x4(i32 %slice, %unused, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { 
, } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: usdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr 
%ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} ; == Multi, indexed (signed) == @@ -710,8 +805,8 @@ define void @sdot_lane_za32_u16_vg1x2(i32 %slice, %unused, %unused, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: sdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr 
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void 
@llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + define void @sdot_lane_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: sdot_lane_za64_u16_vg1x2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1] ; CHECK-NEXT: ret @@ -793,8 +962,8 @@ define void @sdot_lane_za64_u16_vg1x4(i32 %slice, %unused, %unused, %unused, %unused, , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: sudot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, 
[x0, x10] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void 
@llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + attributes #0 = { nounwind "target-features"="+sme2" } attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" } - ; == Multi, multi (unsigned) declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, , , , ) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index a0d8c18f55c3a..49106e12378be 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -1,15 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s - +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s ; == FVDOT == define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -25,9 +22,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -43,9 +38,7 @@ define void 
@test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -58,11 +51,7 @@ define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, % define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -75,11 +64,7 @@ define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, % define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: ret @@ -89,15 +74,87 @@ define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, % ret void 
} +define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: svdot_form_2x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9] +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: svdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") 
@llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} ; == UVDOT == define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, %zn1, %zn2, %zm) { ; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: mov w8, w0 -; 
CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3] ; CHECK-NEXT: ret @@ -110,11 +167,7 @@ define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, % define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -127,11 +180,7 @@ define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, % define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1] ; CHECK-NEXT: ret @@ -141,17 +190,87 @@ define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, % ret void } +define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: uvdot_form_2x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: mov w8, wzr +; 
CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + ret void +} + +define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: uvdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = 
extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} ; == SUVDOT == define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 
killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -161,17 +280,62 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, ret void } +define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: suvdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = 
extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} ; == USVDOT == define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { ; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 -; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: usvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3] ; CHECK-NEXT: ret @@ -181,6 +345,58 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, ret void } +define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { +; CHECK-LABEL: usvdot_form_4x_tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, 
[x0, x1] +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, 
%3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + +attributes #0 = { nounwind "target-features"="+sme2" } +attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" } ; == FVDOT == declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, , , , i32) diff --git a/llvm/test/CodeGen/AArch64/ucmp.ll b/llvm/test/CodeGen/AArch64/ucmp.ll index 125ac7f61a41e..0e4da89fcaebc 100644 --- a/llvm/test/CodeGen/AArch64/ucmp.ll +++ b/llvm/test/CodeGen/AArch64/ucmp.ll @@ -79,20 +79,18 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; ; CHECK-GI-LABEL: ucmp.8.128: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w8, w9, w8, eq +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: csel w8, w8, w9, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: cset w8, ne -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w9, lo ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w10, lo +; CHECK-GI-NEXT: cset w9, lo ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w9, w10, w9, eq +; CHECK-GI-NEXT: cset w10, lo +; CHECK-GI-NEXT: csel w9, w9, w10, eq ; CHECK-GI-NEXT: tst w9, #0x1 ; CHECK-GI-NEXT: csinv w0, w8, wzr, eq ; CHECK-GI-NEXT: ret @@ -151,20 +149,18 @@ define <1 x i64> @ucmp.1.64.65(<1 x i65> %x, <1 x i65> %y) { ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: and x8, x1, #0x1 ; CHECK-GI-NEXT: and x9, x3, #0x1 -; CHECK-GI-NEXT: cmp x8, x9 -; CHECK-GI-NEXT: cset w10, hi ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w11, hi +; CHECK-GI-NEXT: cset w10, hi ; CHECK-GI-NEXT: cmp x8, x9 -; CHECK-GI-NEXT: csel w10, w11, w10, eq +; CHECK-GI-NEXT: cset w11, hi +; CHECK-GI-NEXT: csel w10, w10, w11, eq ; CHECK-GI-NEXT: 
tst w10, #0x1 ; CHECK-GI-NEXT: cset x10, ne -; CHECK-GI-NEXT: cmp x8, x9 -; CHECK-GI-NEXT: cset w11, lo ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w12, lo +; CHECK-GI-NEXT: cset w11, lo ; CHECK-GI-NEXT: cmp x8, x9 -; CHECK-GI-NEXT: csel w8, w12, w11, eq +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: csel w8, w11, w8, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: csinv x8, x10, xzr, eq ; CHECK-GI-NEXT: fmov d0, x8 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll index 809a6d6556a7b..0806f7da5c89c 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -280,12 +280,11 @@ define i128 @test_v2i128(<2 x i128> %a) nounwind { ; ; CHECK-GI-LABEL: test_v2i128: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x0, x2 -; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: cmp x1, x3 -; CHECK-GI-NEXT: csel w8, w9, w8, eq +; CHECK-GI-NEXT: cset w9, hi +; CHECK-GI-NEXT: csel w8, w8, w9, eq ; CHECK-GI-NEXT: tst w8, #0x1 ; CHECK-GI-NEXT: csel x0, x0, x2, ne ; CHECK-GI-NEXT: csel x1, x1, x3, ne diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 168e6dfa5f147..e289ee759da15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5105,30 +5105,30 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_addc_u32 s8, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 
v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: s_addc_u32 s9, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -5147,27 +5147,27 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s4 ; GFX8-NEXT: s_addc_u32 s5, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s8, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s9, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] -; GFX8-NEXT: 
s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s9, 31 @@ -5194,27 +5194,27 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s4 ; GFX9-NEXT: s_addc_u32 s5, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s8, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s9, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_cmp_eq_u64 
s[6:7], 0 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s9, 31 @@ -5895,30 +5895,30 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_saddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_addc_u32 s16, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_addc_u32 s17, s3, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: 
s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc @@ -5928,30 +5928,30 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX6-NEXT: s_add_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s5, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -5974,27 +5974,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s8 ; GFX8-NEXT: s_addc_u32 s9, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s16, s2, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s17, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 @@ -6013,27 +6013,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX8-NEXT: s_addc_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 @@ -6064,27 +6064,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s8 ; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s16, s2, s10 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s17, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: 
s_cmp_eq_u64 s[16:17], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 @@ -6103,27 +6103,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX9-NEXT: s_addc_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; 
GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 7214f4ab581d5..43ebe156eb2a2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5111,22 +5111,23 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s10, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_subb_u32 s11, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, 
vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -5136,7 +5137,6 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -5155,26 +5155,26 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s10, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 
; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -5204,26 +5204,26 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s10, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -5949,22 +5949,23 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_ssubsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 
v0, s0 ; GFX6-NEXT: s_subb_u32 s17, s1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s18, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] ; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -5974,7 +5975,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc @@ -5984,22 +5984,23 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_subb_u32 s1, s5, s13 -; GFX6-NEXT: v_mov_b32_e32 
v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_subb_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_subb_u32 s3, s7, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -6009,7 +6010,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -6032,26 +6032,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s16, s0, s8 ; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s18, s2, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6073,26 +6073,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: 
s_and_b32 s4, 1, s6 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6126,26 +6126,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s16, s0, s8 ; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s18, s2, s10 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX9-NEXT: v_cndmask_b32_e64 
v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6167,26 +6167,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_subb_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6300,15 +6300,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; ; GFX11-LABEL: 
s_ssubsat_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_sub_u32 s16, s0, s8 -; GFX11-NEXT: s_subb_u32 s17, s1, s9 -; GFX11-NEXT: s_subb_u32 s18, s2, s10 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] -; GFX11-NEXT: s_subb_u32 s19, s3, s11 -; GFX11-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] +; GFX11-NEXT: s_sub_u32 s18, s0, s8 +; GFX11-NEXT: s_subb_u32 s19, s1, s9 +; GFX11-NEXT: s_subb_u32 s16, s2, s10 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] +; GFX11-NEXT: s_subb_u32 s17, s3, s11 +; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] ; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s20 @@ -6317,7 +6317,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_ashr_i32 s8, s19, 31 +; GFX11-NEXT: s_ashr_i32 s8, s17, 31 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 @@ -6351,12 +6351,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s18 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v4, s17 -; GFX11-NEXT: v_mov_b32_e32 v2, s19 +; GFX11-NEXT: v_mov_b32_e32 v4, s19 +; GFX11-NEXT: v_mov_b32_e32 v2, s17 ; GFX11-NEXT: v_cndmask_b32_e64 
v3, v3, s8, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index b7436aeb1d530..4f04c15b3d44a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -9999,3 +9999,100 @@ define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) { %result = udiv exact <2 x i64> %num, ret <2 x i64> %result } + +define i64 @udiv_i64_gt_smax(i8 %size) { +; GFX6-LABEL: udiv_i64_gt_smax: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v1 +; GFX6-NEXT: v_not_b32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xcccccccd +; GFX6-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, s4 +; GFX6-NEXT: s_mov_b32 s6, 0xcccccccc +; GFX6-NEXT: v_mul_hi_u32 v5, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: udiv_i64_gt_smax: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 31 +; GFX9-NEXT: v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: s_mov_b32 s4, 0xcccccccd +; GFX9-NEXT: v_ashrrev_i32_sdwa v1, v1, sext(v0) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mul_hi_u32 v0, v4, s4 +; GFX9-NEXT: v_not_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] +; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %esize = sext i8 %size to i64 + %minus = sub nuw nsw i64 -1, %esize + %div = udiv i64 %minus, 10 + ret i64 %div +} + +define i64 @udiv_i64_9divbits(i8 %size) { +; GFX6-LABEL: udiv_i64_9divbits: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s4, 0x41200000 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX6-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 0x1ff, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: udiv_i64_9divbits: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x41200000 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX9-NEXT: v_mad_f32 v0, -v1, 
s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %zextend = zext i8 %size to i64 + %num = add nuw nsw i64 1, %zextend + %div = udiv i64 %num, 10 + ret i64 %div +} diff --git a/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir b/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir index 460f6d24b9b18..dfe4b8a33f396 100644 --- a/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir +++ b/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir @@ -3,7 +3,6 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - -run-pass prologepilog,machine-cp -verify-machineinstrs | FileCheck -check-prefix=GFX908-PEI-MACHINECP %s # When VGPRs are available for spilling, prologepilog marks the tuple implicit-def as well as implicit in the first spill instruction. -# As a consequence, machine-cp would NOT delete agpr2 copy here. --- name: agpr-spill-to-vgpr-machine-cp @@ -11,6 +10,7 @@ tracksRegLiveness: true stack: - { id: 0, name: '', type: spill-slot, offset: 0, size: 128, alignment: 4 } machineFunctionInfo: + isEntryFunction: true scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: '$sgpr32' hasSpilledVGPRs: true @@ -43,8 +43,8 @@ body: | S_ENDPGM 0 ... -# When VGPRs are NOT available for spilling (stack is used), prologepilog marks the tuple implicit-def only and NOT implicit. -# As a consequence, machine-cp would delete agpr2 copy here. Presently, this is incorrect behavior. +# When VGPRs are NOT available for spilling (stack is used), prologepilog should also mark the tuple implicit-def and implicit (similar to above usecase). +# As a consequence, machine-cp would not delete agpr2 copy here. 
--- name: agpr-spill-to-vgpr-to-stack-machine-cp @@ -52,6 +52,7 @@ tracksRegLiveness: true stack: - { id: 0, name: '', type: spill-slot, offset: 0, size: 128, alignment: 4 } machineFunctionInfo: + isEntryFunction: true scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: '$sgpr32' hasSpilledVGPRs: true @@ -60,29 +61,34 @@ body: | successors: liveins: $vgpr0, $vgpr1 ; GFX908-PEI-LABEL: name: agpr-spill-to-vgpr-to-stack-machine-cp - ; GFX908-PEI: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55 + ; GFX908-PEI: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-NEXT: {{ $}} + ; GFX908-PEI-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-PEI-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-NEXT: renamable $agpr0 = COPY renamable $vgpr0, implicit $exec ; GFX908-PEI-NEXT: renamable $agpr2 = COPY renamable $vgpr1, implicit $exec ; GFX908-PEI-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; GFX908-PEI-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF - ; GFX908-PEI-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-PEI-NEXT: 
$vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) ; GFX908-PEI-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) ; GFX908-PEI-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-PEI-NEXT: S_ENDPGM 0 ; ; GFX908-PEI-MACHINECP-LABEL: name: agpr-spill-to-vgpr-to-stack-machine-cp - ; GFX908-PEI-MACHINECP: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55 + ; GFX908-PEI-MACHINECP: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-MACHINECP-NEXT: {{ $}} + ; GFX908-PEI-MACHINECP-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-PEI-MACHINECP-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-MACHINECP-NEXT: renamable $agpr0 = COPY renamable $vgpr0, implicit $exec + ; GFX908-PEI-MACHINECP-NEXT: renamable 
$agpr2 = COPY renamable $vgpr1, implicit $exec ; GFX908-PEI-MACHINECP-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; GFX908-PEI-MACHINECP-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF - ; GFX908-PEI-MACHINECP-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-PEI-MACHINECP-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) ; GFX908-PEI-MACHINECP-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) ; GFX908-PEI-MACHINECP-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-PEI-MACHINECP-NEXT: S_ENDPGM 0 renamable $agpr0 = COPY renamable $vgpr0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 5bbea7ecf3f2d..5dde193528aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1021,8 +1021,116 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-LABEL: sdiv64_known32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NEXT: 
v_ashrrev_i32_e32 v0, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, 0, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc +; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10 +; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 +; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13 +; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 +; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0 +; 
GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0 +; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1 +; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0 +; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6 +; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: 
v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: .LBB10_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, 0, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1033,14 +1141,17 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX9-NEXT: .LBB10_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 
s[30:31] %a.ext = ashr i64 %a, 32 %b.ext = ashr i64 %b, 32 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index b2f178c6c1041..d9182d7ace8bf 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1455,7 +1455,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0x7f ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr3_vgpr4 killed $exec ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 @@ -1544,77 +1544,77 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[10:11] -; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[8:9], v[10:11] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[10:11] +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[8:9], v[10:11] ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 ; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v7 -; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: s_mov_b32 s12, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s12 ; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v4, v7 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v3, v3 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 ; GFX9-G-O0-NEXT: 
v_min_u32_e64 v3, v3, v6 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[10:11] ; GFX9-G-O0-NEXT: s_mov_b32 s16, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[6:7], v[8:9] ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v1 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 ; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s12 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v4, v6 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v2 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 ; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v7 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[10:11] ; GFX9-G-O0-NEXT: s_mov_b32 s15, 0 -; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s13, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 -; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v3, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s12, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[10:11], v3, v4 ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[8:9], v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s15 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 
v8, s[10:11], v4, v7, s[10:11] ; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[10:11], v4, v7, s[10:11] ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s13 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[12:13] @@ -3688,20 +3688,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, 
s[8:9], v4, v7, s[8:9] ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s13 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index a77e3c226ad26..db7d816386a70 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -716,199 +716,66 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s0, 0xff000000 -; GCN-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NEXT: s_and_b32 s6, s6, 0xff000000 -; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 24 -; GCN-NEXT: v_mac_f32_e32 v1, 0, v2 -; GCN-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-NEXT: s_sub_u32 s8, 0, s0 -; GCN-NEXT: s_subb_u32 s9, 0, s1 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 +; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 +; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 -; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: 
v_mul_lo_u32 v4, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s8, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24 -; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 -; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v3, v2 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_addc_u32_e32 
v2, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v1 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 -; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0 -; GCN-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v6, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_i48: -; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; 
GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: s_and_b32 s0, s0, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 -; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 -; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[0:1] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[2:3], s[6:7] -; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9] -; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[6:7], s[14:15] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[14:15], exec -; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9 -; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s14 -; GCN-IR-NEXT: s_add_u32 s14, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s2, s16 -; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 -; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while -; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s2, s14, s12 -; GCN-IR-NEXT: s_subb_u32 s2, s15, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 -; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s2, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 -; GCN-IR-NEXT: .LBB7_4: ; %Flow4 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] -; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s6 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-IR-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_alignbit_b32 v0, s5, v0, 24 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 +; GCN-IR-NEXT: v_alignbit_b32 v1, s3, v1, 24 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: 
v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index b4f977db80439..a794d139063d5 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -665,54 +665,47 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s13, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s6, s15, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: s_lshr_b32 s6, s13, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-NEXT: s_lshr_b32 s0, s15, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GCN-NEXT: s_lshr_b32 s7, s11, 9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: s_sub_i32 s1, 0, s6 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, s9, 1 -; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_mul_i32 s2, s2, s0 -; GCN-NEXT: s_sub_i32 s1, s1, s2 -; GCN-NEXT: s_sub_i32 s2, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 
s1, s2, s1 -; GCN-NEXT: s_sub_i32 s2, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s8, s2, s1 -; GCN-NEXT: s_sub_i32 s0, 0, s6 -; GCN-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 +; GCN-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GCN-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v5 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s7, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mul_i32 s4, s4, s6 -; GCN-NEXT: s_sub_i32 s4, s7, s4 +; GCN-NEXT: s_sub_i32 s4, s8, s4 ; GCN-NEXT: s_sub_i32 s5, s4, s6 ; GCN-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-NEXT: s_cselect_b32 s4, s5, s4 ; GCN-NEXT: s_sub_i32 s5, s4, s6 ; GCN-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -720,54 +713,47 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN-IR-NEXT: s_lshr_b32 s0, s13, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s6, s15, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: s_lshr_b32 s6, s13, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-IR-NEXT: s_lshr_b32 s0, s15, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GCN-IR-NEXT: s_lshr_b32 s7, s11, 9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-IR-NEXT: s_sub_i32 s1, 0, s6 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1 -; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s1, s2 -; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 -; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-IR-NEXT: s_cselect_b32 s8, s2, s1 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s6 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 +; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v5, s1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GCN-IR-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-IR-NEXT: v_mul_hi_u32 v5, v0, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0 ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; 
GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-IR-NEXT: s_mul_i32 s4, s4, s6 -; GCN-IR-NEXT: s_sub_i32 s4, s7, s4 +; GCN-IR-NEXT: s_sub_i32 s4, s8, s4 ; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 ; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 ; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 ; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/ARM/single-issue-r52.mir b/llvm/test/CodeGen/ARM/single-issue-r52.mir index 084afb6f666c1..d01ef82617f23 100644 --- a/llvm/test/CodeGen/ARM/single-issue-r52.mir +++ b/llvm/test/CodeGen/ARM/single-issue-r52.mir @@ -1,7 +1,7 @@ -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52plus -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52plus 
-run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-prera-direction=topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-prera-direction=bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52plus -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-prera-direction=topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52plus -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-prera-direction=bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP # REQUIRES: asserts --- | ; ModuleID = 'foo.ll' diff --git a/llvm/test/CodeGen/M68k/Atomics/load-store.ll b/llvm/test/CodeGen/M68k/Atomics/load-store.ll index 23fdfad05cab5..c00a1faf2634b 100644 --- a/llvm/test/CodeGen/M68k/Atomics/load-store.ll +++ b/llvm/test/CodeGen/M68k/Atomics/load-store.ll @@ -604,3 +604,51 @@ define void @atomic_store_i64_seq_cst(ptr %a, i64 %val) nounwind { store atomic i64 %val, ptr %a seq_cst, align 8 ret void } + +define void @store_arid(ptr nonnull align 4 %a) { +; NO-ATOMIC-LABEL: store_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: moveq #1, %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l %d0, (32,%a0) +; NO-ATOMIC-NEXT: rts +; +; ATOMIC-LABEL: store_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: moveq #1, %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; 
ATOMIC-NEXT: move.l %d0, (32,%a0) +; ATOMIC-NEXT: rts +start: + %1 = getelementptr inbounds i32, ptr %a, i32 8 + store atomic i32 1, ptr %1 seq_cst, align 4 + br label %exit + +exit: ; preds = %start + ret void +} + +define i32 @load_arid(ptr nonnull align 4 %a) { +; NO-ATOMIC-LABEL: load_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (32,%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; ATOMIC-LABEL: load_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l (32,%a0), %d0 +; ATOMIC-NEXT: rts +start: + %1 = getelementptr inbounds i32, ptr %a, i32 8 + %2 = load atomic i32, ptr %1 seq_cst, align 4 + br label %exit + +exit: ; preds = %start + ret i32 %2 +} diff --git a/llvm/test/CodeGen/M68k/Atomics/rmw.ll b/llvm/test/CodeGen/M68k/Atomics/rmw.ll index ce456f0960eec..a277b8fe72ae4 100644 --- a/llvm/test/CodeGen/M68k/Atomics/rmw.ll +++ b/llvm/test/CodeGen/M68k/Atomics/rmw.ll @@ -588,3 +588,144 @@ entry: %old = atomicrmw xchg ptr %ptr, i32 %val monotonic ret i32 %old } + +define i8 @atomicrmw_sub_i8_arid(ptr align 2 %self) { +; NO-ATOMIC-LABEL: atomicrmw_sub_i8_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_sub_1 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_sub_i8_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: suba.l #4, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %a0 +; ATOMIC-NEXT: move.b (4,%a0), %d1 
+; ATOMIC-NEXT: move.b %d1, %d0 +; ATOMIC-NEXT: .LBB12_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.b %d1, %d2 +; ATOMIC-NEXT: add.b #-1, %d2 +; ATOMIC-NEXT: cas.b %d0, %d2, (4,%a0) +; ATOMIC-NEXT: move.b %d0, %d2 +; ATOMIC-NEXT: sub.b %d1, %d2 +; ATOMIC-NEXT: seq %d1 +; ATOMIC-NEXT: sub.b #1, %d1 +; ATOMIC-NEXT: move.b %d0, %d1 +; ATOMIC-NEXT: bne .LBB12_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-NEXT: adda.l #4, %sp +; ATOMIC-NEXT: rts +start: + %self1 = load ptr, ptr %self, align 2 + %_18.i.i = getelementptr inbounds i8, ptr %self1, i32 4 + %6 = atomicrmw sub ptr %_18.i.i, i8 1 release, align 4 + ret i8 %6 +} + +define i16 @atomicrmw_sub_i16_arid(ptr align 2 %self) { +; NO-ATOMIC-LABEL: atomicrmw_sub_i16_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_sub_2 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_sub_i16_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: suba.l #4, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %a0 +; ATOMIC-NEXT: move.w (4,%a0), %d1 +; ATOMIC-NEXT: move.w %d1, %d0 +; ATOMIC-NEXT: .LBB13_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.w %d1, %d2 +; ATOMIC-NEXT: add.w #-1, %d2 +; ATOMIC-NEXT: cas.w %d0, %d2, (4,%a0) +; ATOMIC-NEXT: move.w %d0, %d2 +; ATOMIC-NEXT: sub.w %d1, %d2 +; ATOMIC-NEXT: seq %d1 +; ATOMIC-NEXT: sub.b #1, %d1 +; ATOMIC-NEXT: move.w %d0, %d1 +; 
ATOMIC-NEXT: bne .LBB13_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-NEXT: adda.l #4, %sp +; ATOMIC-NEXT: rts +start: + %self1 = load ptr, ptr %self, align 2 + %_18.i.i = getelementptr inbounds i8, ptr %self1, i32 4 + %6 = atomicrmw sub ptr %_18.i.i, i16 1 release, align 4 + ret i16 %6 +} + +define i32 @atomicrmw_sub_i32_arid(ptr align 2 %self) { +; NO-ATOMIC-LABEL: atomicrmw_sub_i32_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_sub_4 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_sub_i32_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: suba.l #4, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %a0 +; ATOMIC-NEXT: move.l (4,%a0), %d1 +; ATOMIC-NEXT: move.l %d1, %d0 +; ATOMIC-NEXT: .LBB14_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.l %d1, %d2 +; ATOMIC-NEXT: add.l #-1, %d2 +; ATOMIC-NEXT: cas.l %d0, %d2, (4,%a0) +; ATOMIC-NEXT: move.l %d0, %d2 +; ATOMIC-NEXT: sub.l %d1, %d2 +; ATOMIC-NEXT: seq %d1 +; ATOMIC-NEXT: sub.b #1, %d1 +; ATOMIC-NEXT: move.l %d0, %d1 +; ATOMIC-NEXT: bne .LBB14_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-NEXT: adda.l #4, %sp +; ATOMIC-NEXT: rts +start: + %self1 = load ptr, ptr %self, align 2 + %_18.i.i = getelementptr inbounds i8, ptr %self1, i32 4 + %6 = atomicrmw sub ptr %_18.i.i, i32 1 release, align 4 + ret i32 %6 +} diff --git 
a/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/cmpxchg.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/cmpxchg.ll new file mode 100644 index 0000000000000..36bd4654d3e54 --- /dev/null +++ b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/cmpxchg.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68000 --code-model=large | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68010 --code-model=large | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68000 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=NO-ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68010 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=NO-ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68020 --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68030 --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68040 --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68020 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68030 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68040 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC + +@thread_id = internal global <{ [5 x i8] }> <{ [5 x i8] zeroinitializer}>, align 4 + +define { i32, i1 } @std_thread_new() { +; NO-ATOMIC-LABEL: std_thread_new: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l #1, (8,%sp) +; NO-ATOMIC-NEXT: move.l #0, (4,%sp) +; NO-ATOMIC-NEXT: move.l #thread_id, (%sp) +; NO-ATOMIC-NEXT: jsr 
__sync_val_compare_and_swap_4 +; NO-ATOMIC-NEXT: cmpi.l #0, %d0 +; NO-ATOMIC-NEXT: seq %d1 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: std_thread_new: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: lea (_GLOBAL_OFFSET_TABLE_@GOTPCREL,%pc), %a0 +; NO-ATOMIC-PIC-NEXT: adda.l #thread_id@GOTOFF, %a0 +; NO-ATOMIC-PIC-NEXT: move.l %a0, (%sp) +; NO-ATOMIC-PIC-NEXT: move.l #1, (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l #0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_val_compare_and_swap_4@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: cmpi.l #0, %d0 +; NO-ATOMIC-PIC-NEXT: seq %d1 +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: std_thread_new: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: move.l #thread_id, %a0 +; ATOMIC-NEXT: moveq #1, %d1 +; ATOMIC-NEXT: moveq #0, %d0 +; ATOMIC-NEXT: cas.l %d0, %d1, (%a0) +; ATOMIC-NEXT: cmpi.l #0, %d0 +; ATOMIC-NEXT: seq %d1 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: std_thread_new: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; ATOMIC-PIC-NEXT: suba.l #4, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-PIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-PIC-NEXT: lea (_GLOBAL_OFFSET_TABLE_@GOTPCREL,%pc), %a0 +; ATOMIC-PIC-NEXT: move.l #thread_id@GOTOFF, %d1 +; ATOMIC-PIC-NEXT: moveq #1, %d2 +; ATOMIC-PIC-NEXT: moveq #0, %d0 +; ATOMIC-PIC-NEXT: cas.l %d0, %d2, (0,%a0,%d1) +; ATOMIC-PIC-NEXT: cmpi.l #0, %d0 +; ATOMIC-PIC-NEXT: seq %d1 +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #4, %sp +; ATOMIC-PIC-NEXT: rts +start: + %1 = cmpxchg ptr @thread_id, i32 0, i32 1 acquire monotonic, align 4 + ret { i32, i1 } %1 +} + +define i1 @cmpxchg_i8_monotonic_monotonic(i8 %cmp, i8 %new, ptr %mem) nounwind { +; NO-ATOMIC-LABEL: 
cmpxchg_i8_monotonic_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-NEXT: movem.l %d2, (16,%sp) ; 8-byte Folded Spill +; NO-ATOMIC-NEXT: move.b (31,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (8,%sp) +; NO-ATOMIC-NEXT: move.b (27,%sp), %d2 +; NO-ATOMIC-NEXT: move.l %d2, %d0 +; NO-ATOMIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (32,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_val_compare_and_swap_1 +; NO-ATOMIC-NEXT: sub.b %d2, %d0 +; NO-ATOMIC-NEXT: seq %d0 +; NO-ATOMIC-NEXT: movem.l (16,%sp), %d2 ; 8-byte Folded Reload +; NO-ATOMIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: cmpxchg_i8_monotonic_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-PIC-NEXT: movem.l %d2, (16,%sp) ; 8-byte Folded Spill +; NO-ATOMIC-PIC-NEXT: move.b (31,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.b (27,%sp), %d2 +; NO-ATOMIC-PIC-NEXT: move.l %d2, %d0 +; NO-ATOMIC-PIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (32,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_val_compare_and_swap_1@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: sub.b %d2, %d0 +; NO-ATOMIC-PIC-NEXT: seq %d0 +; NO-ATOMIC-PIC-NEXT: movem.l (16,%sp), %d2 ; 8-byte Folded Reload +; NO-ATOMIC-PIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: cmpxchg_i8_monotonic_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #4, %sp +; ATOMIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.b (15,%sp), %d0 +; ATOMIC-NEXT: move.b (11,%sp), %d1 +; ATOMIC-NEXT: move.b %d1, %d2 +; ATOMIC-NEXT: cas.b %d2, %d0, (%a0) +; ATOMIC-NEXT: sub.b %d1, %d2 +; ATOMIC-NEXT: seq %d0 +; ATOMIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-NEXT: adda.l #4, %sp +; ATOMIC-NEXT: 
rts +; +; ATOMIC-PIC-LABEL: cmpxchg_i8_monotonic_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #4, %sp +; ATOMIC-PIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (15,%sp), %d0 +; ATOMIC-PIC-NEXT: move.b (11,%sp), %d1 +; ATOMIC-PIC-NEXT: move.b %d1, %d2 +; ATOMIC-PIC-NEXT: cas.b %d2, %d0, (%a0) +; ATOMIC-PIC-NEXT: sub.b %d1, %d2 +; ATOMIC-PIC-NEXT: seq %d0 +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #4, %sp +; ATOMIC-PIC-NEXT: rts + %res = cmpxchg ptr %mem, i8 %cmp, i8 %new monotonic monotonic + %val = extractvalue {i8, i1} %res, 1 + ret i1 %val +} + +define i16 @cmpxchg_i16_release_monotonic(i16 %cmp, i16 %new, ptr %mem) nounwind { +; NO-ATOMIC-LABEL: cmpxchg_i16_release_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: move.w (22,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (8,%sp) +; NO-ATOMIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_val_compare_and_swap_2 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: cmpxchg_i16_release_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: move.w (22,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_val_compare_and_swap_2@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: cmpxchg_i16_release_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (12,%sp), %a0 +; ATOMIC-NEXT: move.w (10,%sp), %d1 +; ATOMIC-NEXT: move.w 
(6,%sp), %d0 +; ATOMIC-NEXT: cas.w %d0, %d1, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: cmpxchg_i16_release_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (12,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (10,%sp), %d1 +; ATOMIC-PIC-NEXT: move.w (6,%sp), %d0 +; ATOMIC-PIC-NEXT: cas.w %d0, %d1, (%a0) +; ATOMIC-PIC-NEXT: rts + %res = cmpxchg ptr %mem, i16 %cmp, i16 %new release monotonic + %val = extractvalue {i16, i1} %res, 0 + ret i16 %val +} + +define i32 @cmpxchg_i32_release_acquire(i32 %cmp, i32 %new, ptr %mem) nounwind { +; NO-ATOMIC-LABEL: cmpxchg_i32_release_acquire: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: move.l (20,%sp), (8,%sp) +; NO-ATOMIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_val_compare_and_swap_4 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: cmpxchg_i32_release_acquire: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_val_compare_and_swap_4@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: cmpxchg_i32_release_acquire: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (12,%sp), %a0 +; ATOMIC-NEXT: move.l (8,%sp), %d1 +; ATOMIC-NEXT: move.l (4,%sp), %d0 +; ATOMIC-NEXT: cas.l %d0, %d1, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: cmpxchg_i32_release_acquire: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (12,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (8,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %d0 +; ATOMIC-PIC-NEXT: cas.l %d0, %d1, (%a0) +; ATOMIC-PIC-NEXT: rts + %res = cmpxchg ptr %mem, i32 %cmp, i32 %new release acquire + %val = extractvalue {i32, i1} %res, 0 + ret i32 %val +} + +define i64 @cmpxchg_i64_seqcst_seqcst(i64 %cmp, i64 %new, ptr %mem) nounwind { +; 
NO-ATOMIC-LABEL: cmpxchg_i64_seqcst_seqcst: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #36, %sp +; NO-ATOMIC-NEXT: move.l (44,%sp), (28,%sp) +; NO-ATOMIC-NEXT: move.l (40,%sp), (24,%sp) +; NO-ATOMIC-NEXT: lea (24,%sp), %a0 +; NO-ATOMIC-NEXT: move.l %a0, (4,%sp) +; NO-ATOMIC-NEXT: move.l #5, (20,%sp) +; NO-ATOMIC-NEXT: move.l #5, (16,%sp) +; NO-ATOMIC-NEXT: move.l (52,%sp), (12,%sp) +; NO-ATOMIC-NEXT: move.l (48,%sp), (8,%sp) +; NO-ATOMIC-NEXT: move.l (56,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_compare_exchange_8 +; NO-ATOMIC-NEXT: move.l (28,%sp), %d1 +; NO-ATOMIC-NEXT: move.l (24,%sp), %d0 +; NO-ATOMIC-NEXT: adda.l #36, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: cmpxchg_i64_seqcst_seqcst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #36, %sp +; NO-ATOMIC-PIC-NEXT: move.l (44,%sp), (28,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (40,%sp), (24,%sp) +; NO-ATOMIC-PIC-NEXT: lea (24,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l %a0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l #5, (20,%sp) +; NO-ATOMIC-PIC-NEXT: move.l #5, (16,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (52,%sp), (12,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (48,%sp), (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (56,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_compare_exchange_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: move.l (28,%sp), %d1 +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: adda.l #36, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: cmpxchg_i64_seqcst_seqcst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #36, %sp +; ATOMIC-NEXT: move.l (44,%sp), (28,%sp) +; ATOMIC-NEXT: move.l (40,%sp), (24,%sp) +; ATOMIC-NEXT: lea (24,%sp), %a0 +; ATOMIC-NEXT: move.l %a0, (4,%sp) +; ATOMIC-NEXT: move.l #5, (20,%sp) +; ATOMIC-NEXT: move.l #5, (16,%sp) +; ATOMIC-NEXT: move.l (52,%sp), (12,%sp) +; ATOMIC-NEXT: move.l (48,%sp), (8,%sp) +; ATOMIC-NEXT: move.l (56,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_compare_exchange_8 +; ATOMIC-NEXT: move.l (28,%sp), %d1 +; ATOMIC-NEXT: move.l (24,%sp), %d0 +; 
ATOMIC-NEXT: adda.l #36, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: cmpxchg_i64_seqcst_seqcst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #36, %sp +; ATOMIC-PIC-NEXT: move.l (44,%sp), (28,%sp) +; ATOMIC-PIC-NEXT: move.l (40,%sp), (24,%sp) +; ATOMIC-PIC-NEXT: lea (24,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l %a0, (4,%sp) +; ATOMIC-PIC-NEXT: move.l #5, (20,%sp) +; ATOMIC-PIC-NEXT: move.l #5, (16,%sp) +; ATOMIC-PIC-NEXT: move.l (52,%sp), (12,%sp) +; ATOMIC-PIC-NEXT: move.l (48,%sp), (8,%sp) +; ATOMIC-PIC-NEXT: move.l (56,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_compare_exchange_8@PLT,%pc) +; ATOMIC-PIC-NEXT: move.l (28,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (24,%sp), %d0 +; ATOMIC-PIC-NEXT: adda.l #36, %sp +; ATOMIC-PIC-NEXT: rts + %res = cmpxchg ptr %mem, i64 %cmp, i64 %new seq_cst seq_cst + %val = extractvalue {i64, i1} %res, 0 + ret i64 %val +} diff --git a/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/fence.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/fence.ll new file mode 100644 index 0000000000000..727c4d1192b87 --- /dev/null +++ b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/fence.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=m68k-linux-gnu --code-model=large < %s | FileCheck %s +; RUN: llc -mtriple=m68k-linux-gnu --code-model=large --relocation-model=pic < %s | FileCheck %s --check-prefix=PIC + +; M68k's libgcc does NOT have __sync_synchronize so we shouldn't +; lower to that. 
+ +define void @atomic_fence() { +; CHECK-LABEL: atomic_fence: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: ;APP +; CHECK-NEXT: ;NO_APP +; CHECK-NEXT: rts +; +; PIC-LABEL: atomic_fence: +; PIC: .cfi_startproc +; PIC-NEXT: ; %bb.0: ; %entry +; PIC-NEXT: ;APP +; PIC-NEXT: ;NO_APP +; PIC-NEXT: ;APP +; PIC-NEXT: ;NO_APP +; PIC-NEXT: ;APP +; PIC-NEXT: ;NO_APP +; PIC-NEXT: ;APP +; PIC-NEXT: ;NO_APP +; PIC-NEXT: rts +entry: + fence acquire + fence release + fence acq_rel + fence seq_cst + ret void +} + diff --git a/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/load-store.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/load-store.ll new file mode 100644 index 0000000000000..a59a40d8e9fd2 --- /dev/null +++ b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/load-store.ll @@ -0,0 +1,1161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68000 --code-model=large | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68010 --code-model=large | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68000 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=NO-ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68010 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=NO-ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68020 --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68030 --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68040 --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68020 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68030 
--code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68040 --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC + +define i8 @atomic_load_i8_unordered(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i8_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i8_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i8_unordered: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i8_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i8, ptr %a unordered, align 1 + ret i8 %1 +} + +define i8 @atomic_load_i8_monotonic(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i8_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i8_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i8_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i8_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i8, ptr %a monotonic, align 1 + ret i8 %1 +} + +define i8 @atomic_load_i8_acquire(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i8_acquire: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), 
%a0 +; NO-ATOMIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i8_acquire: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i8_acquire: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i8_acquire: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i8, ptr %a acquire, align 1 + ret i8 %1 +} + +define i8 @atomic_load_i8_seq_cst(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i8_seq_cst: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i8_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i8_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i8_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i8, ptr %a seq_cst, align 1 + ret i8 %1 +} + +define i16 @atomic_load_i16_unordered(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i16_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i16_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i16_unordered: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w (%a0), 
%d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i16_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i16, ptr %a unordered, align 2 + ret i16 %1 +} + +define i16 @atomic_load_i16_monotonic(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i16_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i16_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i16_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i16_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i16, ptr %a monotonic, align 2 + ret i16 %1 +} + +define i16 @atomic_load_i16_acquire(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i16_acquire: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i16_acquire: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i16_acquire: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i16_acquire: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i16, ptr %a acquire, align 2 + ret i16 %1 +} + +define i16 @atomic_load_i16_seq_cst(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i16_seq_cst: 
+; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i16_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i16_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i16_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i16, ptr %a seq_cst, align 2 + ret i16 %1 +} + +define i32 @atomic_load_i32_unordered(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i32_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i32_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i32_unordered: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i32_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i32, ptr %a unordered, align 4 + ret i32 %1 +} + +define i32 @atomic_load_i32_monotonic(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i32_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i32_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i32_monotonic: +; 
ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i32_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i32, ptr %a monotonic, align 4 + ret i32 %1 +} + +define i32 @atomic_load_i32_acquire(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i32_acquire: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i32_acquire: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i32_acquire: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i32_acquire: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i32, ptr %a acquire, align 4 + ret i32 %1 +} + +define i32 @atomic_load_i32_seq_cst(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i32_seq_cst: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i32_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i32_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i32_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i32, ptr %a seq_cst, align 4 + ret i32 %1 +} + +define i64 
@atomic_load_i64_unordered(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i64_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: move.l #0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_load_8 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i64_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: move.l #0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i64_unordered: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #12, %sp +; ATOMIC-NEXT: move.l #0, (4,%sp) +; ATOMIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_load_8 +; ATOMIC-NEXT: adda.l #12, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i64_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #12, %sp +; ATOMIC-PIC-NEXT: move.l #0, (4,%sp) +; ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #12, %sp +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i64, ptr %a unordered, align 8 + ret i64 %1 +} + +define i64 @atomic_load_i64_monotonic(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i64_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: move.l #0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_load_8 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i64_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: move.l #0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: 
atomic_load_i64_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #12, %sp +; ATOMIC-NEXT: move.l #0, (4,%sp) +; ATOMIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_load_8 +; ATOMIC-NEXT: adda.l #12, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i64_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #12, %sp +; ATOMIC-PIC-NEXT: move.l #0, (4,%sp) +; ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #12, %sp +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i64, ptr %a monotonic, align 8 + ret i64 %1 +} + +define i64 @atomic_load_i64_acquire(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i64_acquire: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: move.l #2, (4,%sp) +; NO-ATOMIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_load_8 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i64_acquire: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: move.l #2, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i64_acquire: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #12, %sp +; ATOMIC-NEXT: move.l #2, (4,%sp) +; ATOMIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_load_8 +; ATOMIC-NEXT: adda.l #12, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i64_acquire: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #12, %sp +; ATOMIC-PIC-NEXT: move.l #2, (4,%sp) +; ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #12, %sp +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i64, ptr %a acquire, align 8 + ret i64 %1 +} + +define i64 @atomic_load_i64_seq_cst(ptr %a) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i64_seq_cst: 
+; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: move.l #5, (4,%sp) +; NO-ATOMIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_load_8 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_load_i64_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: move.l #5, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i64_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #12, %sp +; ATOMIC-NEXT: move.l #5, (4,%sp) +; ATOMIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_load_8 +; ATOMIC-NEXT: adda.l #12, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_load_i64_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #12, %sp +; ATOMIC-PIC-NEXT: move.l #5, (4,%sp) +; ATOMIC-PIC-NEXT: move.l (16,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_load_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #12, %sp +; ATOMIC-PIC-NEXT: rts + %1 = load atomic i64, ptr %a seq_cst, align 8 + ret i64 %1 +} + +define void @atomic_store_i8_unordered(ptr %a, i8 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i8_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i8_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i8_unordered: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.b (11,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i8_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.b 
(11,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i8 %val, ptr %a unordered, align 1 + ret void +} + +define void @atomic_store_i8_monotonic(ptr %a, i8 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i8_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i8_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i8_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.b (11,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i8_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.b (11,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i8 %val, ptr %a monotonic, align 1 + ret void +} + +define void @atomic_store_i8_release(ptr %a, i8 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i8_release: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i8_release: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i8_release: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.b (11,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i8_release: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.b (11,%sp), %d0 +; 
ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i8 %val, ptr %a release, align 1 + ret void +} + +define void @atomic_store_i8_seq_cst(ptr %a, i8 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i8_seq_cst: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i8_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.b (11,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i8_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.b (11,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i8_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.b (11,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i8 %val, ptr %a seq_cst, align 1 + ret void +} + +define void @atomic_store_i16_unordered(ptr %a, i16 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i16_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i16_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i16_unordered: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i16_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l 
(4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i16 %val, ptr %a unordered, align 2 + ret void +} + +define void @atomic_store_i16_monotonic(ptr %a, i16 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i16_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i16_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i16_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i16_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i16 %val, ptr %a monotonic, align 2 + ret void +} + +define void @atomic_store_i16_release(ptr %a, i16 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i16_release: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i16_release: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i16_release: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i16_release: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 
+; ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i16 %val, ptr %a release, align 2 + ret void +} + +define void @atomic_store_i16_seq_cst(ptr %a, i16 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i16_seq_cst: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i16_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i16_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.w %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i16_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.w (10,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i16 %val, ptr %a seq_cst, align 2 + ret void +} + +define void @atomic_store_i32_unordered(ptr %a, i32 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i32_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i32_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i32_unordered: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i32_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: 
move.l %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i32 %val, ptr %a unordered, align 4 + ret void +} + +define void @atomic_store_i32_monotonic(ptr %a, i32 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i32_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i32_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i32_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i32_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i32 %val, ptr %a monotonic, align 4 + ret void +} + +define void @atomic_store_i32_release(ptr %a, i32 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i32_release: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i32_release: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i32_release: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i32_release: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l %d0, (%a0) +; 
ATOMIC-PIC-NEXT: rts + store atomic i32 %val, ptr %a release, align 4 + ret void +} + +define void @atomic_store_i32_seq_cst(ptr %a, i32 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i32_seq_cst: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i32_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i32_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l %d0, (%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i32_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l %d0, (%a0) +; ATOMIC-PIC-NEXT: rts + store atomic i32 %val, ptr %a seq_cst, align 4 + ret void +} + +define void @atomic_store_i64_unordered(ptr %a, i64 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i64_unordered: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-NEXT: move.l #0, (12,%sp) +; NO-ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_store_8 +; NO-ATOMIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i64_unordered: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-PIC-NEXT: move.l #0, (12,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i64_unordered: +; ATOMIC: ; 
%bb.0: +; ATOMIC-NEXT: suba.l #20, %sp +; ATOMIC-NEXT: move.l #0, (12,%sp) +; ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_store_8 +; ATOMIC-NEXT: adda.l #20, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i64_unordered: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #20, %sp +; ATOMIC-PIC-NEXT: move.l #0, (12,%sp) +; ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #20, %sp +; ATOMIC-PIC-NEXT: rts + store atomic i64 %val, ptr %a unordered, align 8 + ret void +} + +define void @atomic_store_i64_monotonic(ptr %a, i64 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i64_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-NEXT: move.l #0, (12,%sp) +; NO-ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_store_8 +; NO-ATOMIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i64_monotonic: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-PIC-NEXT: move.l #0, (12,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i64_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #20, %sp +; ATOMIC-NEXT: move.l #0, (12,%sp) +; ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_store_8 +; ATOMIC-NEXT: adda.l #20, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: 
atomic_store_i64_monotonic: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #20, %sp +; ATOMIC-PIC-NEXT: move.l #0, (12,%sp) +; ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #20, %sp +; ATOMIC-PIC-NEXT: rts + store atomic i64 %val, ptr %a monotonic, align 8 + ret void +} + +define void @atomic_store_i64_release(ptr %a, i64 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i64_release: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-NEXT: move.l #3, (12,%sp) +; NO-ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_store_8 +; NO-ATOMIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i64_release: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-PIC-NEXT: move.l #3, (12,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i64_release: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #20, %sp +; ATOMIC-NEXT: move.l #3, (12,%sp) +; ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_store_8 +; ATOMIC-NEXT: adda.l #20, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i64_release: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #20, %sp +; ATOMIC-PIC-NEXT: move.l #3, (12,%sp) +; ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; ATOMIC-PIC-NEXT: 
adda.l #20, %sp +; ATOMIC-PIC-NEXT: rts + store atomic i64 %val, ptr %a release, align 8 + ret void +} + +define void @atomic_store_i64_seq_cst(ptr %a, i64 %val) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i64_seq_cst: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-NEXT: move.l #5, (12,%sp) +; NO-ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_store_8 +; NO-ATOMIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomic_store_i64_seq_cst: +; NO-ATOMIC-PIC: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-PIC-NEXT: move.l #5, (12,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i64_seq_cst: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: suba.l #20, %sp +; ATOMIC-NEXT: move.l #5, (12,%sp) +; ATOMIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_store_8 +; ATOMIC-NEXT: adda.l #20, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomic_store_i64_seq_cst: +; ATOMIC-PIC: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #20, %sp +; ATOMIC-PIC-NEXT: move.l #5, (12,%sp) +; ATOMIC-PIC-NEXT: move.l (32,%sp), (8,%sp) +; ATOMIC-PIC-NEXT: move.l (28,%sp), (4,%sp) +; ATOMIC-PIC-NEXT: move.l (24,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_store_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #20, %sp +; ATOMIC-PIC-NEXT: rts + store atomic i64 %val, ptr %a seq_cst, align 8 + ret void +} + +define void @store_arid(ptr nonnull align 4 %a) { +; NO-ATOMIC-LABEL: store_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: moveq #1, %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; 
NO-ATOMIC-NEXT: move.l %d0, (32,%a0) +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: store_arid: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-PIC-NEXT: moveq #1, %d0 +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (32,%a0) +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: store_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: moveq #1, %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l %d0, (32,%a0) +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: store_arid: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; ATOMIC-PIC-NEXT: moveq #1, %d0 +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l %d0, (32,%a0) +; ATOMIC-PIC-NEXT: rts +start: + %1 = getelementptr inbounds i32, ptr %a, i32 8 + store atomic i32 1, ptr %1 seq_cst, align 4 + br label %exit + +exit: ; preds = %start + ret void +} + +define i32 @load_arid(ptr nonnull align 4 %a) { +; NO-ATOMIC-LABEL: load_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (32,%a0), %d0 +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: load_arid: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (32,%a0), %d0 +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: load_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.l (32,%a0), %d0 +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: load_arid: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; ATOMIC-PIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (32,%a0), %d0 +; ATOMIC-PIC-NEXT: rts +start: + %1 = getelementptr inbounds i32, ptr %a, i32 8 + %2 = load atomic i32, ptr %1 seq_cst, align 4 + br label %exit + +exit: ; preds = %start + ret i32 %2 +} diff --git 
a/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/rmw.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/rmw.ll new file mode 100644 index 0000000000000..b4c2bb1d223c9 --- /dev/null +++ b/llvm/test/CodeGen/M68k/CodeModel/Large/Atomics/rmw.ll @@ -0,0 +1,1390 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68000 -verify-machineinstrs --code-model=large | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68010 -verify-machineinstrs --code-model=large | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68000 -verify-machineinstrs --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=NO-ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68010 -verify-machineinstrs --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=NO-ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68020 -verify-machineinstrs --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68030 -verify-machineinstrs --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68040 -verify-machineinstrs --code-model=large | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68020 -verify-machineinstrs --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68030 -verify-machineinstrs --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68040 -verify-machineinstrs --code-model=large --relocation-model=pic | FileCheck %s --check-prefix=ATOMIC-PIC + +define i8 @atomicrmw_add_i8(i8 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_add_i8: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.b (19,%sp), 
%d0 +; NO-ATOMIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_add_1 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_add_i8: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.b (19,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_add_1@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_add_i8: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.b (15,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.b (%a0), %d2 +; ATOMIC-NEXT: move.b %d2, %d0 +; ATOMIC-NEXT: .LBB0_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.b %d2, %d3 +; ATOMIC-NEXT: add.b %d1, %d3 +; ATOMIC-NEXT: cas.b %d0, %d3, (%a0) +; ATOMIC-NEXT: move.b %d0, %d3 +; ATOMIC-NEXT: sub.b %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.b %d0, %d2 +; ATOMIC-NEXT: bne .LBB0_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_add_i8: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.b (15,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (%a0), %d2 +; 
ATOMIC-PIC-NEXT: move.b %d2, %d0 +; ATOMIC-PIC-NEXT: .LBB0_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.b %d2, %d3 +; ATOMIC-PIC-NEXT: add.b %d1, %d3 +; ATOMIC-PIC-NEXT: cas.b %d0, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.b %d0, %d3 +; ATOMIC-PIC-NEXT: sub.b %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.b %d0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB0_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw add ptr %ptr, i8 %val monotonic + ret i8 %old +} + +define i16 @atomicrmw_sub_i16(i16 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_sub_i16: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_sub_2 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_sub_i16: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_sub_2@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_sub_i16: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.w (14,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: 
move.w (%a0), %d2 +; ATOMIC-NEXT: move.w %d2, %d0 +; ATOMIC-NEXT: .LBB1_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.w %d2, %d3 +; ATOMIC-NEXT: sub.w %d1, %d3 +; ATOMIC-NEXT: cas.w %d0, %d3, (%a0) +; ATOMIC-NEXT: move.w %d0, %d3 +; ATOMIC-NEXT: sub.w %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.w %d0, %d2 +; ATOMIC-NEXT: bne .LBB1_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_sub_i16: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.w (14,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d2 +; ATOMIC-PIC-NEXT: move.w %d2, %d0 +; ATOMIC-PIC-NEXT: .LBB1_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.w %d2, %d3 +; ATOMIC-PIC-NEXT: sub.w %d1, %d3 +; ATOMIC-PIC-NEXT: cas.w %d0, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.w %d0, %d3 +; ATOMIC-PIC-NEXT: sub.w %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.w %d0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB1_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw sub ptr %ptr, i16 %val acquire + ret i16 %old +} + +define i32 @atomicrmw_and_i32(i32 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_and_i32: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr 
__sync_fetch_and_and_4 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_and_i32: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_and_4@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_and_i32: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.l (12,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %d2 +; ATOMIC-NEXT: move.l %d2, %d0 +; ATOMIC-NEXT: .LBB2_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.l %d2, %d3 +; ATOMIC-NEXT: and.l %d1, %d3 +; ATOMIC-NEXT: cas.l %d0, %d3, (%a0) +; ATOMIC-NEXT: move.l %d0, %d3 +; ATOMIC-NEXT: sub.l %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.l %d0, %d2 +; ATOMIC-NEXT: bne .LBB2_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_and_i32: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (12,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %d2 +; ATOMIC-PIC-NEXT: move.l %d2, %d0 +; ATOMIC-PIC-NEXT: .LBB2_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.l %d2, %d3 +; ATOMIC-PIC-NEXT: and.l %d1, %d3 +; ATOMIC-PIC-NEXT: 
cas.l %d0, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.l %d0, %d3 +; ATOMIC-PIC-NEXT: sub.l %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.l %d0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB2_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw and ptr %ptr, i32 %val seq_cst + ret i32 %old +} + +define i64 @atomicrmw_xor_i64(i64 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_xor_i64: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -24 +; NO-ATOMIC-NEXT: move.l #3, (12,%sp) +; NO-ATOMIC-NEXT: move.l (28,%sp), (8,%sp) +; NO-ATOMIC-NEXT: move.l (24,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (32,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __atomic_fetch_xor_8 +; NO-ATOMIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_xor_i64: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #20, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -24 +; NO-ATOMIC-PIC-NEXT: move.l #3, (12,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (28,%sp), (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (32,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_fetch_xor_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #20, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_xor_i64: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #20, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -24 +; ATOMIC-NEXT: move.l #3, (12,%sp) +; ATOMIC-NEXT: move.l (28,%sp), (8,%sp) +; ATOMIC-NEXT: move.l (24,%sp), (4,%sp) +; ATOMIC-NEXT: move.l (32,%sp), (%sp) +; ATOMIC-NEXT: jsr __atomic_fetch_xor_8 +; ATOMIC-NEXT: adda.l #20, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_xor_i64: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; 
ATOMIC-PIC-NEXT: suba.l #20, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -24 +; ATOMIC-PIC-NEXT: move.l #3, (12,%sp) +; ATOMIC-PIC-NEXT: move.l (28,%sp), (8,%sp) +; ATOMIC-PIC-NEXT: move.l (24,%sp), (4,%sp) +; ATOMIC-PIC-NEXT: move.l (32,%sp), (%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_fetch_xor_8@PLT,%pc) +; ATOMIC-PIC-NEXT: adda.l #20, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw xor ptr %ptr, i64 %val release + ret i64 %old +} + +define i8 @atomicrmw_or_i8(i8 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_or_i8: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.b (19,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_or_1 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_or_i8: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.b (19,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_or_1@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_or_i8: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.b (15,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.b (%a0), %d2 +; ATOMIC-NEXT: move.b %d2, %d0 +; ATOMIC-NEXT: .LBB4_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.b %d2, %d3 +; ATOMIC-NEXT: or.b %d1, %d3 +; ATOMIC-NEXT: cas.b %d0, %d3, (%a0) +; ATOMIC-NEXT: move.b %d0, %d3 
+; ATOMIC-NEXT: sub.b %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.b %d0, %d2 +; ATOMIC-NEXT: bne .LBB4_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_or_i8: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.b (15,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (%a0), %d2 +; ATOMIC-PIC-NEXT: move.b %d2, %d0 +; ATOMIC-PIC-NEXT: .LBB4_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.b %d2, %d3 +; ATOMIC-PIC-NEXT: or.b %d1, %d3 +; ATOMIC-PIC-NEXT: cas.b %d0, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.b %d0, %d3 +; ATOMIC-PIC-NEXT: sub.b %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.b %d0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB4_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw or ptr %ptr, i8 %val monotonic + ret i8 %old +} + +define i16 @atmoicrmw_nand_i16(i16 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atmoicrmw_nand_i16: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: movem.l %d2, (8,%sp) ; 8-byte Folded Spill +; NO-ATOMIC-NEXT: move.w (18,%sp), %d2 +; NO-ATOMIC-NEXT: move.l %d2, %d0 +; NO-ATOMIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_nand_2 +; NO-ATOMIC-NEXT: move.w %d2, %d0 +; NO-ATOMIC-NEXT: movem.l (8,%sp), %d2 ; 8-byte Folded Reload +; 
NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atmoicrmw_nand_i16: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: movem.l %d2, (8,%sp) ; 8-byte Folded Spill +; NO-ATOMIC-PIC-NEXT: move.w (18,%sp), %d2 +; NO-ATOMIC-PIC-NEXT: move.l %d2, %d0 +; NO-ATOMIC-PIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_nand_2@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: move.w %d2, %d0 +; NO-ATOMIC-PIC-NEXT: movem.l (8,%sp), %d2 ; 8-byte Folded Reload +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atmoicrmw_nand_i16: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.w (14,%sp), %d0 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.w (%a0), %d2 +; ATOMIC-NEXT: move.w %d2, %d1 +; ATOMIC-NEXT: .LBB5_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.w %d2, %d3 +; ATOMIC-NEXT: and.w %d0, %d3 +; ATOMIC-NEXT: not.w %d3 +; ATOMIC-NEXT: cas.w %d1, %d3, (%a0) +; ATOMIC-NEXT: move.w %d1, %d3 +; ATOMIC-NEXT: sub.w %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.w %d1, %d2 +; ATOMIC-NEXT: bne .LBB5_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atmoicrmw_nand_i16: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.w (14,%sp), %d0 +; ATOMIC-PIC-NEXT: 
move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d2 +; ATOMIC-PIC-NEXT: move.w %d2, %d1 +; ATOMIC-PIC-NEXT: .LBB5_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.w %d2, %d3 +; ATOMIC-PIC-NEXT: and.w %d0, %d3 +; ATOMIC-PIC-NEXT: not.w %d3 +; ATOMIC-PIC-NEXT: cas.w %d1, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.w %d1, %d3 +; ATOMIC-PIC-NEXT: sub.w %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.w %d1, %d2 +; ATOMIC-PIC-NEXT: bne .LBB5_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw nand ptr %ptr, i16 %val seq_cst + ret i16 %val +} + +define i32 @atomicrmw_min_i32(i32 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_min_i32: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_min_4 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_min_i32: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_min_4@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_min_i32: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.l (12,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %d2 +; ATOMIC-NEXT: bra .LBB6_1 +; ATOMIC-NEXT: 
.LBB6_3: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; ATOMIC-NEXT: move.l %d2, %d0 +; ATOMIC-NEXT: cas.l %d0, %d3, (%a0) +; ATOMIC-NEXT: move.l %d0, %d3 +; ATOMIC-NEXT: sub.l %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.l %d0, %d2 +; ATOMIC-NEXT: beq .LBB6_4 +; ATOMIC-NEXT: .LBB6_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.l %d2, %d0 +; ATOMIC-NEXT: sub.l %d1, %d0 +; ATOMIC-NEXT: move.l %d2, %d3 +; ATOMIC-NEXT: ble .LBB6_3 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; ATOMIC-NEXT: move.l %d1, %d3 +; ATOMIC-NEXT: bra .LBB6_3 +; ATOMIC-NEXT: .LBB6_4: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_min_i32: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (12,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %d2 +; ATOMIC-PIC-NEXT: bra .LBB6_1 +; ATOMIC-PIC-NEXT: .LBB6_3: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; ATOMIC-PIC-NEXT: move.l %d2, %d0 +; ATOMIC-PIC-NEXT: cas.l %d0, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.l %d0, %d3 +; ATOMIC-PIC-NEXT: sub.l %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.l %d0, %d2 +; ATOMIC-PIC-NEXT: beq .LBB6_4 +; ATOMIC-PIC-NEXT: .LBB6_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.l %d2, %d0 +; ATOMIC-PIC-NEXT: sub.l %d1, %d0 +; ATOMIC-PIC-NEXT: move.l %d2, %d3 +; ATOMIC-PIC-NEXT: ble .LBB6_3 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; ATOMIC-PIC-NEXT: 
move.l %d1, %d3 +; ATOMIC-PIC-NEXT: bra .LBB6_3 +; ATOMIC-PIC-NEXT: .LBB6_4: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw min ptr %ptr, i32 %val acquire + ret i32 %old +} + +define i64 @atomicrmw_max_i64(i64 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_max_i64: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #52, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -56 +; NO-ATOMIC-NEXT: movem.l %d2-%d4/%a2-%a3, (32,%sp) ; 24-byte Folded Spill +; NO-ATOMIC-NEXT: move.l (60,%sp), %d3 +; NO-ATOMIC-NEXT: move.l (56,%sp), %d4 +; NO-ATOMIC-NEXT: move.l (64,%sp), %a2 +; NO-ATOMIC-NEXT: move.l (4,%a2), %d1 +; NO-ATOMIC-NEXT: move.l (%a2), %d0 +; NO-ATOMIC-NEXT: lea (24,%sp), %a3 +; NO-ATOMIC-NEXT: bra .LBB7_1 +; NO-ATOMIC-NEXT: .LBB7_3: ; %atomicrmw.start +; NO-ATOMIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; NO-ATOMIC-NEXT: move.l %d1, (12,%sp) +; NO-ATOMIC-NEXT: move.l %d0, (8,%sp) +; NO-ATOMIC-NEXT: move.l #5, (20,%sp) +; NO-ATOMIC-NEXT: move.l #5, (16,%sp) +; NO-ATOMIC-NEXT: jsr __atomic_compare_exchange_8 +; NO-ATOMIC-NEXT: move.b %d0, %d2 +; NO-ATOMIC-NEXT: move.l (28,%sp), %d1 +; NO-ATOMIC-NEXT: move.l (24,%sp), %d0 +; NO-ATOMIC-NEXT: cmpi.b #0, %d2 +; NO-ATOMIC-NEXT: bne .LBB7_4 +; NO-ATOMIC-NEXT: .LBB7_1: ; %atomicrmw.start +; NO-ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; NO-ATOMIC-NEXT: move.l %d0, (24,%sp) +; NO-ATOMIC-NEXT: move.l %d1, (28,%sp) +; NO-ATOMIC-NEXT: move.l %a2, (%sp) +; NO-ATOMIC-NEXT: move.l %a3, (4,%sp) +; NO-ATOMIC-NEXT: move.l %d3, %d2 +; NO-ATOMIC-NEXT: sub.l %d1, %d2 +; NO-ATOMIC-NEXT: move.l %d4, %d2 +; NO-ATOMIC-NEXT: subx.l %d0, %d2 +; NO-ATOMIC-NEXT: slt %d2 +; NO-ATOMIC-NEXT: cmpi.b #0, %d2 +; NO-ATOMIC-NEXT: bne .LBB7_3 +; NO-ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.start +; NO-ATOMIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; NO-ATOMIC-NEXT: move.l %d3, %d1 +; NO-ATOMIC-NEXT: move.l %d4, 
%d0 +; NO-ATOMIC-NEXT: bra .LBB7_3 +; NO-ATOMIC-NEXT: .LBB7_4: ; %atomicrmw.end +; NO-ATOMIC-NEXT: movem.l (32,%sp), %d2-%d4/%a2-%a3 ; 24-byte Folded Reload +; NO-ATOMIC-NEXT: adda.l #52, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_max_i64: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #52, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -56 +; NO-ATOMIC-PIC-NEXT: movem.l %d2-%d4/%a2-%a3, (32,%sp) ; 24-byte Folded Spill +; NO-ATOMIC-PIC-NEXT: move.l (60,%sp), %d3 +; NO-ATOMIC-PIC-NEXT: move.l (56,%sp), %d4 +; NO-ATOMIC-PIC-NEXT: move.l (64,%sp), %a2 +; NO-ATOMIC-PIC-NEXT: move.l (4,%a2), %d1 +; NO-ATOMIC-PIC-NEXT: move.l (%a2), %d0 +; NO-ATOMIC-PIC-NEXT: lea (24,%sp), %a3 +; NO-ATOMIC-PIC-NEXT: bra .LBB7_1 +; NO-ATOMIC-PIC-NEXT: .LBB7_3: ; %atomicrmw.start +; NO-ATOMIC-PIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; NO-ATOMIC-PIC-NEXT: move.l %d1, (12,%sp) +; NO-ATOMIC-PIC-NEXT: move.l %d0, (8,%sp) +; NO-ATOMIC-PIC-NEXT: move.l #5, (20,%sp) +; NO-ATOMIC-PIC-NEXT: move.l #5, (16,%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__atomic_compare_exchange_8@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: move.b %d0, %d2 +; NO-ATOMIC-PIC-NEXT: move.l (28,%sp), %d1 +; NO-ATOMIC-PIC-NEXT: move.l (24,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: cmpi.b #0, %d2 +; NO-ATOMIC-PIC-NEXT: bne .LBB7_4 +; NO-ATOMIC-PIC-NEXT: .LBB7_1: ; %atomicrmw.start +; NO-ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (24,%sp) +; NO-ATOMIC-PIC-NEXT: move.l %d1, (28,%sp) +; NO-ATOMIC-PIC-NEXT: move.l %a2, (%sp) +; NO-ATOMIC-PIC-NEXT: move.l %a3, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l %d3, %d2 +; NO-ATOMIC-PIC-NEXT: sub.l %d1, %d2 +; NO-ATOMIC-PIC-NEXT: move.l %d4, %d2 +; NO-ATOMIC-PIC-NEXT: subx.l %d0, %d2 +; NO-ATOMIC-PIC-NEXT: slt %d2 +; NO-ATOMIC-PIC-NEXT: cmpi.b #0, %d2 +; NO-ATOMIC-PIC-NEXT: bne .LBB7_3 +; NO-ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.start +; NO-ATOMIC-PIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; NO-ATOMIC-PIC-NEXT: 
move.l %d3, %d1 +; NO-ATOMIC-PIC-NEXT: move.l %d4, %d0 +; NO-ATOMIC-PIC-NEXT: bra .LBB7_3 +; NO-ATOMIC-PIC-NEXT: .LBB7_4: ; %atomicrmw.end +; NO-ATOMIC-PIC-NEXT: movem.l (32,%sp), %d2-%d4/%a2-%a3 ; 24-byte Folded Reload +; NO-ATOMIC-PIC-NEXT: adda.l #52, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_max_i64: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #52, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -56 +; ATOMIC-NEXT: movem.l %d2-%d4/%a2-%a3, (32,%sp) ; 24-byte Folded Spill +; ATOMIC-NEXT: move.l (60,%sp), %d3 +; ATOMIC-NEXT: move.l (56,%sp), %d4 +; ATOMIC-NEXT: move.l (64,%sp), %a2 +; ATOMIC-NEXT: move.l (4,%a2), %d1 +; ATOMIC-NEXT: move.l (%a2), %d0 +; ATOMIC-NEXT: lea (24,%sp), %a3 +; ATOMIC-NEXT: bra .LBB7_1 +; ATOMIC-NEXT: .LBB7_3: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; ATOMIC-NEXT: move.l %d1, (12,%sp) +; ATOMIC-NEXT: move.l %d0, (8,%sp) +; ATOMIC-NEXT: move.l #5, (20,%sp) +; ATOMIC-NEXT: move.l #5, (16,%sp) +; ATOMIC-NEXT: jsr __atomic_compare_exchange_8 +; ATOMIC-NEXT: move.b %d0, %d2 +; ATOMIC-NEXT: move.l (28,%sp), %d1 +; ATOMIC-NEXT: move.l (24,%sp), %d0 +; ATOMIC-NEXT: cmpi.b #0, %d2 +; ATOMIC-NEXT: bne .LBB7_4 +; ATOMIC-NEXT: .LBB7_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.l %d0, (24,%sp) +; ATOMIC-NEXT: move.l %d1, (28,%sp) +; ATOMIC-NEXT: move.l %a2, (%sp) +; ATOMIC-NEXT: move.l %a3, (4,%sp) +; ATOMIC-NEXT: move.l %d3, %d2 +; ATOMIC-NEXT: sub.l %d1, %d2 +; ATOMIC-NEXT: move.l %d4, %d2 +; ATOMIC-NEXT: subx.l %d0, %d2 +; ATOMIC-NEXT: slt %d2 +; ATOMIC-NEXT: cmpi.b #0, %d2 +; ATOMIC-NEXT: bne .LBB7_3 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; ATOMIC-NEXT: move.l %d3, %d1 +; ATOMIC-NEXT: move.l %d4, %d0 +; ATOMIC-NEXT: bra .LBB7_3 +; ATOMIC-NEXT: .LBB7_4: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (32,%sp), %d2-%d4/%a2-%a3 ; 24-byte Folded Reload +; ATOMIC-NEXT: adda.l #52, %sp +; 
ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_max_i64: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #52, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -56 +; ATOMIC-PIC-NEXT: movem.l %d2-%d4/%a2-%a3, (32,%sp) ; 24-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (60,%sp), %d3 +; ATOMIC-PIC-NEXT: move.l (56,%sp), %d4 +; ATOMIC-PIC-NEXT: move.l (64,%sp), %a2 +; ATOMIC-PIC-NEXT: move.l (4,%a2), %d1 +; ATOMIC-PIC-NEXT: move.l (%a2), %d0 +; ATOMIC-PIC-NEXT: lea (24,%sp), %a3 +; ATOMIC-PIC-NEXT: bra .LBB7_1 +; ATOMIC-PIC-NEXT: .LBB7_3: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; ATOMIC-PIC-NEXT: move.l %d1, (12,%sp) +; ATOMIC-PIC-NEXT: move.l %d0, (8,%sp) +; ATOMIC-PIC-NEXT: move.l #5, (20,%sp) +; ATOMIC-PIC-NEXT: move.l #5, (16,%sp) +; ATOMIC-PIC-NEXT: jsr (__atomic_compare_exchange_8@PLT,%pc) +; ATOMIC-PIC-NEXT: move.b %d0, %d2 +; ATOMIC-PIC-NEXT: move.l (28,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (24,%sp), %d0 +; ATOMIC-PIC-NEXT: cmpi.b #0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB7_4 +; ATOMIC-PIC-NEXT: .LBB7_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.l %d0, (24,%sp) +; ATOMIC-PIC-NEXT: move.l %d1, (28,%sp) +; ATOMIC-PIC-NEXT: move.l %a2, (%sp) +; ATOMIC-PIC-NEXT: move.l %a3, (4,%sp) +; ATOMIC-PIC-NEXT: move.l %d3, %d2 +; ATOMIC-PIC-NEXT: sub.l %d1, %d2 +; ATOMIC-PIC-NEXT: move.l %d4, %d2 +; ATOMIC-PIC-NEXT: subx.l %d0, %d2 +; ATOMIC-PIC-NEXT: slt %d2 +; ATOMIC-PIC-NEXT: cmpi.b #0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB7_3 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB7_1 Depth=1 +; ATOMIC-PIC-NEXT: move.l %d3, %d1 +; ATOMIC-PIC-NEXT: move.l %d4, %d0 +; ATOMIC-PIC-NEXT: bra .LBB7_3 +; ATOMIC-PIC-NEXT: .LBB7_4: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (32,%sp), %d2-%d4/%a2-%a3 ; 24-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #52, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw max ptr %ptr, i64 %val seq_cst + 
ret i64 %old +} + +define i8 @atomicrmw_i8_umin(i8 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_i8_umin: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.b (19,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_umin_1 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_i8_umin: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.b (19,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #255, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_umin_1@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_i8_umin: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.b (15,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.b (%a0), %d2 +; ATOMIC-NEXT: bra .LBB8_1 +; ATOMIC-NEXT: .LBB8_3: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: Header=BB8_1 Depth=1 +; ATOMIC-NEXT: move.b %d2, %d0 +; ATOMIC-NEXT: cas.b %d0, %d3, (%a0) +; ATOMIC-NEXT: move.b %d0, %d3 +; ATOMIC-NEXT: sub.b %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.b %d0, %d2 +; ATOMIC-NEXT: beq .LBB8_4 +; ATOMIC-NEXT: .LBB8_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.b %d2, %d0 +; ATOMIC-NEXT: sub.b %d1, %d0 +; ATOMIC-NEXT: move.b %d2, %d3 +; ATOMIC-NEXT: bls .LBB8_3 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: 
Header=BB8_1 Depth=1 +; ATOMIC-NEXT: move.b %d1, %d3 +; ATOMIC-NEXT: bra .LBB8_3 +; ATOMIC-NEXT: .LBB8_4: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_i8_umin: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.b (15,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.b (%a0), %d2 +; ATOMIC-PIC-NEXT: bra .LBB8_1 +; ATOMIC-PIC-NEXT: .LBB8_3: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB8_1 Depth=1 +; ATOMIC-PIC-NEXT: move.b %d2, %d0 +; ATOMIC-PIC-NEXT: cas.b %d0, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.b %d0, %d3 +; ATOMIC-PIC-NEXT: sub.b %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.b %d0, %d2 +; ATOMIC-PIC-NEXT: beq .LBB8_4 +; ATOMIC-PIC-NEXT: .LBB8_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.b %d2, %d0 +; ATOMIC-PIC-NEXT: sub.b %d1, %d0 +; ATOMIC-PIC-NEXT: move.b %d2, %d3 +; ATOMIC-PIC-NEXT: bls .LBB8_3 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB8_1 Depth=1 +; ATOMIC-PIC-NEXT: move.b %d1, %d3 +; ATOMIC-PIC-NEXT: bra .LBB8_3 +; ATOMIC-PIC-NEXT: .LBB8_4: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw umin ptr %ptr, i8 %val release + ret i8 %old +} + +define i16 @atomicrmw_umax_i16(i16 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_umax_i16: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-NEXT: 
move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_umax_2 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_umax_i16: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_umax_2@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_umax_i16: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.w (14,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.w (%a0), %d2 +; ATOMIC-NEXT: bra .LBB9_1 +; ATOMIC-NEXT: .LBB9_3: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: Header=BB9_1 Depth=1 +; ATOMIC-NEXT: move.w %d2, %d0 +; ATOMIC-NEXT: cas.w %d0, %d3, (%a0) +; ATOMIC-NEXT: move.w %d0, %d3 +; ATOMIC-NEXT: sub.w %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.w %d0, %d2 +; ATOMIC-NEXT: beq .LBB9_4 +; ATOMIC-NEXT: .LBB9_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.w %d2, %d0 +; ATOMIC-NEXT: sub.w %d1, %d0 +; ATOMIC-NEXT: move.w %d2, %d3 +; ATOMIC-NEXT: bhi .LBB9_3 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-NEXT: ; in Loop: Header=BB9_1 Depth=1 +; ATOMIC-NEXT: move.w %d1, %d3 +; ATOMIC-NEXT: bra .LBB9_3 +; ATOMIC-NEXT: .LBB9_4: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_umax_i16: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: 
+; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.w (14,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d2 +; ATOMIC-PIC-NEXT: bra .LBB9_1 +; ATOMIC-PIC-NEXT: .LBB9_3: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB9_1 Depth=1 +; ATOMIC-PIC-NEXT: move.w %d2, %d0 +; ATOMIC-PIC-NEXT: cas.w %d0, %d3, (%a0) +; ATOMIC-PIC-NEXT: move.w %d0, %d3 +; ATOMIC-PIC-NEXT: sub.w %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.w %d0, %d2 +; ATOMIC-PIC-NEXT: beq .LBB9_4 +; ATOMIC-PIC-NEXT: .LBB9_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.w %d2, %d0 +; ATOMIC-PIC-NEXT: sub.w %d1, %d0 +; ATOMIC-PIC-NEXT: move.w %d2, %d3 +; ATOMIC-PIC-NEXT: bhi .LBB9_3 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; in Loop: Header=BB9_1 Depth=1 +; ATOMIC-PIC-NEXT: move.w %d1, %d3 +; ATOMIC-PIC-NEXT: bra .LBB9_3 +; ATOMIC-PIC-NEXT: .LBB9_4: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts + %old = atomicrmw umax ptr %ptr, i16 %val seq_cst + ret i16 %old +} + +define i16 @atomicrmw_xchg_i16(i16 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_xchg_i16: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %entry +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_lock_test_and_set_2 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_xchg_i16: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %entry +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp 
+; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.w (18,%sp), %d0 +; NO-ATOMIC-PIC-NEXT: and.l #65535, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_lock_test_and_set_2@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_xchg_i16: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %entry +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.w (14,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.w (%a0), %d2 +; ATOMIC-NEXT: move.w %d2, %d0 +; ATOMIC-NEXT: .LBB10_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: cas.w %d0, %d1, (%a0) +; ATOMIC-NEXT: move.w %d0, %d3 +; ATOMIC-NEXT: sub.w %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.w %d0, %d2 +; ATOMIC-NEXT: bne .LBB10_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_xchg_i16: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %entry +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.w (14,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.w (%a0), %d2 +; ATOMIC-PIC-NEXT: move.w %d2, %d0 +; ATOMIC-PIC-NEXT: .LBB10_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: cas.w %d0, %d1, (%a0) +; ATOMIC-PIC-NEXT: move.w %d0, %d3 +; ATOMIC-PIC-NEXT: sub.w %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.w %d0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB10_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; 
%atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts +entry: + %old = atomicrmw xchg ptr %ptr, i16 %val monotonic + ret i16 %old +} + +define i32 @atomicrmw_xchg_i32(i32 %val, ptr %ptr) { +; NO-ATOMIC-LABEL: atomicrmw_xchg_i32: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %entry +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-NEXT: jsr __sync_lock_test_and_set_4 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_xchg_i32: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %entry +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), (4,%sp) +; NO-ATOMIC-PIC-NEXT: move.l (20,%sp), (%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_lock_test_and_set_4@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_xchg_i32: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %entry +; ATOMIC-NEXT: suba.l #8, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-NEXT: move.l (12,%sp), %d1 +; ATOMIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %d2 +; ATOMIC-NEXT: move.l %d2, %d0 +; ATOMIC-NEXT: .LBB11_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: cas.l %d0, %d1, (%a0) +; ATOMIC-NEXT: move.l %d0, %d3 +; ATOMIC-NEXT: sub.l %d2, %d3 +; ATOMIC-NEXT: seq %d2 +; ATOMIC-NEXT: sub.b #1, %d2 +; ATOMIC-NEXT: move.l %d0, %d2 +; ATOMIC-NEXT: bne .LBB11_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-NEXT: adda.l #8, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_xchg_i32: +; ATOMIC-PIC: 
.cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %entry +; ATOMIC-PIC-NEXT: suba.l #8, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -12 +; ATOMIC-PIC-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (12,%sp), %d1 +; ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %d2 +; ATOMIC-PIC-NEXT: move.l %d2, %d0 +; ATOMIC-PIC-NEXT: .LBB11_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: cas.l %d0, %d1, (%a0) +; ATOMIC-PIC-NEXT: move.l %d0, %d3 +; ATOMIC-PIC-NEXT: sub.l %d2, %d3 +; ATOMIC-PIC-NEXT: seq %d2 +; ATOMIC-PIC-NEXT: sub.b #1, %d2 +; ATOMIC-PIC-NEXT: move.l %d0, %d2 +; ATOMIC-PIC-NEXT: bne .LBB11_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #8, %sp +; ATOMIC-PIC-NEXT: rts +entry: + %old = atomicrmw xchg ptr %ptr, i32 %val monotonic + ret i32 %old +} + +define i8 @atomicrmw_sub_i8_arid(ptr align 2 %self) { +; NO-ATOMIC-LABEL: atomicrmw_sub_i8_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_sub_1 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_sub_i8_arid: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-PIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_sub_1@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l 
#12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_sub_i8_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: suba.l #4, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %a0 +; ATOMIC-NEXT: move.b (4,%a0), %d1 +; ATOMIC-NEXT: move.b %d1, %d0 +; ATOMIC-NEXT: .LBB12_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.b %d1, %d2 +; ATOMIC-NEXT: add.b #-1, %d2 +; ATOMIC-NEXT: cas.b %d0, %d2, (4,%a0) +; ATOMIC-NEXT: move.b %d0, %d2 +; ATOMIC-NEXT: sub.b %d1, %d2 +; ATOMIC-NEXT: seq %d1 +; ATOMIC-NEXT: sub.b #1, %d1 +; ATOMIC-NEXT: move.b %d0, %d1 +; ATOMIC-NEXT: bne .LBB12_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-NEXT: adda.l #4, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_sub_i8_arid: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; ATOMIC-PIC-NEXT: suba.l #4, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-PIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %a0 +; ATOMIC-PIC-NEXT: move.b (4,%a0), %d1 +; ATOMIC-PIC-NEXT: move.b %d1, %d0 +; ATOMIC-PIC-NEXT: .LBB12_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.b %d1, %d2 +; ATOMIC-PIC-NEXT: add.b #-1, %d2 +; ATOMIC-PIC-NEXT: cas.b %d0, %d2, (4,%a0) +; ATOMIC-PIC-NEXT: move.b %d0, %d2 +; ATOMIC-PIC-NEXT: sub.b %d1, %d2 +; ATOMIC-PIC-NEXT: seq %d1 +; ATOMIC-PIC-NEXT: sub.b #1, %d1 +; ATOMIC-PIC-NEXT: move.b %d0, %d1 +; ATOMIC-PIC-NEXT: bne .LBB12_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #4, %sp +; ATOMIC-PIC-NEXT: rts +start: + %self1 = load ptr, ptr %self, align 2 + %_18.i.i = 
getelementptr inbounds i8, ptr %self1, i32 4 + %6 = atomicrmw sub ptr %_18.i.i, i8 1 release, align 4 + ret i8 %6 +} + +define i16 @atomicrmw_sub_i16_arid(ptr align 2 %self) { +; NO-ATOMIC-LABEL: atomicrmw_sub_i16_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_sub_2 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_sub_i16_arid: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-PIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_sub_2@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_sub_i16_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: suba.l #4, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %a0 +; ATOMIC-NEXT: move.w (4,%a0), %d1 +; ATOMIC-NEXT: move.w %d1, %d0 +; ATOMIC-NEXT: .LBB13_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.w %d1, %d2 +; ATOMIC-NEXT: add.w #-1, %d2 +; ATOMIC-NEXT: cas.w %d0, %d2, (4,%a0) +; ATOMIC-NEXT: move.w %d0, %d2 +; ATOMIC-NEXT: sub.w %d1, %d2 +; ATOMIC-NEXT: seq %d1 +; ATOMIC-NEXT: sub.b #1, %d1 +; ATOMIC-NEXT: move.w %d0, %d1 +; ATOMIC-NEXT: bne .LBB13_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; 
ATOMIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-NEXT: adda.l #4, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_sub_i16_arid: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; ATOMIC-PIC-NEXT: suba.l #4, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-PIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %a0 +; ATOMIC-PIC-NEXT: move.w (4,%a0), %d1 +; ATOMIC-PIC-NEXT: move.w %d1, %d0 +; ATOMIC-PIC-NEXT: .LBB13_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.w %d1, %d2 +; ATOMIC-PIC-NEXT: add.w #-1, %d2 +; ATOMIC-PIC-NEXT: cas.w %d0, %d2, (4,%a0) +; ATOMIC-PIC-NEXT: move.w %d0, %d2 +; ATOMIC-PIC-NEXT: sub.w %d1, %d2 +; ATOMIC-PIC-NEXT: seq %d1 +; ATOMIC-PIC-NEXT: sub.b #1, %d1 +; ATOMIC-PIC-NEXT: move.w %d0, %d1 +; ATOMIC-PIC-NEXT: bne .LBB13_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #4, %sp +; ATOMIC-PIC-NEXT: rts +start: + %self1 = load ptr, ptr %self, align 2 + %_18.i.i = getelementptr inbounds i8, ptr %self1, i32 4 + %6 = atomicrmw sub ptr %_18.i.i, i16 1 release, align 4 + ret i16 %6 +} + +define i32 @atomicrmw_sub_i32_arid(ptr align 2 %self) { +; NO-ATOMIC-LABEL: atomicrmw_sub_i32_arid: +; NO-ATOMIC: .cfi_startproc +; NO-ATOMIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-NEXT: suba.l #12, %sp +; NO-ATOMIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-NEXT: jsr __sync_fetch_and_sub_4 +; NO-ATOMIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-NEXT: rts +; +; NO-ATOMIC-PIC-LABEL: atomicrmw_sub_i32_arid: +; NO-ATOMIC-PIC: .cfi_startproc +; NO-ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; NO-ATOMIC-PIC-NEXT: suba.l #12, %sp +; 
NO-ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -16 +; NO-ATOMIC-PIC-NEXT: move.l (16,%sp), %a0 +; NO-ATOMIC-PIC-NEXT: move.l (%a0), %d0 +; NO-ATOMIC-PIC-NEXT: add.l #4, %d0 +; NO-ATOMIC-PIC-NEXT: move.l %d0, (%sp) +; NO-ATOMIC-PIC-NEXT: move.l #1, (4,%sp) +; NO-ATOMIC-PIC-NEXT: jsr (__sync_fetch_and_sub_4@PLT,%pc) +; NO-ATOMIC-PIC-NEXT: adda.l #12, %sp +; NO-ATOMIC-PIC-NEXT: rts +; +; ATOMIC-LABEL: atomicrmw_sub_i32_arid: +; ATOMIC: .cfi_startproc +; ATOMIC-NEXT: ; %bb.0: ; %start +; ATOMIC-NEXT: suba.l #4, %sp +; ATOMIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-NEXT: move.l (%a0), %a0 +; ATOMIC-NEXT: move.l (4,%a0), %d1 +; ATOMIC-NEXT: move.l %d1, %d0 +; ATOMIC-NEXT: .LBB14_1: ; %atomicrmw.start +; ATOMIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-NEXT: move.l %d1, %d2 +; ATOMIC-NEXT: add.l #-1, %d2 +; ATOMIC-NEXT: cas.l %d0, %d2, (4,%a0) +; ATOMIC-NEXT: move.l %d0, %d2 +; ATOMIC-NEXT: sub.l %d1, %d2 +; ATOMIC-NEXT: seq %d1 +; ATOMIC-NEXT: sub.b #1, %d1 +; ATOMIC-NEXT: move.l %d0, %d1 +; ATOMIC-NEXT: bne .LBB14_1 +; ATOMIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-NEXT: adda.l #4, %sp +; ATOMIC-NEXT: rts +; +; ATOMIC-PIC-LABEL: atomicrmw_sub_i32_arid: +; ATOMIC-PIC: .cfi_startproc +; ATOMIC-PIC-NEXT: ; %bb.0: ; %start +; ATOMIC-PIC-NEXT: suba.l #4, %sp +; ATOMIC-PIC-NEXT: .cfi_def_cfa_offset -8 +; ATOMIC-PIC-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill +; ATOMIC-PIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-PIC-NEXT: move.l (%a0), %a0 +; ATOMIC-PIC-NEXT: move.l (4,%a0), %d1 +; ATOMIC-PIC-NEXT: move.l %d1, %d0 +; ATOMIC-PIC-NEXT: .LBB14_1: ; %atomicrmw.start +; ATOMIC-PIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; ATOMIC-PIC-NEXT: move.l %d1, %d2 +; ATOMIC-PIC-NEXT: add.l #-1, %d2 +; ATOMIC-PIC-NEXT: cas.l %d0, %d2, (4,%a0) +; ATOMIC-PIC-NEXT: move.l %d0, %d2 +; ATOMIC-PIC-NEXT: sub.l %d1, %d2 +; ATOMIC-PIC-NEXT: 
seq %d1 +; ATOMIC-PIC-NEXT: sub.b #1, %d1 +; ATOMIC-PIC-NEXT: move.l %d0, %d1 +; ATOMIC-PIC-NEXT: bne .LBB14_1 +; ATOMIC-PIC-NEXT: ; %bb.2: ; %atomicrmw.end +; ATOMIC-PIC-NEXT: movem.l (0,%sp), %d2 ; 8-byte Folded Reload +; ATOMIC-PIC-NEXT: adda.l #4, %sp +; ATOMIC-PIC-NEXT: rts +start: + %self1 = load ptr, ptr %self, align 2 + %_18.i.i = getelementptr inbounds i8, ptr %self1, i32 4 + %6 = atomicrmw sub ptr %_18.i.i, i32 1 release, align 4 + ret i32 %6 +} diff --git a/llvm/test/CodeGen/M68k/CodeModel/large-pic.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/large-pic.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/large-pic.ll rename to llvm/test/CodeGen/M68k/CodeModel/Large/large-pic.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/large-pie-global-access.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/large-pie-global-access.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/large-pie-global-access.ll rename to llvm/test/CodeGen/M68k/CodeModel/Large/large-pie-global-access.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/large-pie.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/large-pie.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/large-pie.ll rename to llvm/test/CodeGen/M68k/CodeModel/Large/large-pie.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/large-static.ll b/llvm/test/CodeGen/M68k/CodeModel/Large/large-static.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/large-static.ll rename to llvm/test/CodeGen/M68k/CodeModel/Large/large-static.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/medium-pic.ll b/llvm/test/CodeGen/M68k/CodeModel/Medium/medium-pic.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/medium-pic.ll rename to llvm/test/CodeGen/M68k/CodeModel/Medium/medium-pic.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll b/llvm/test/CodeGen/M68k/CodeModel/Medium/medium-pie-global-access.ll similarity index 100% rename from 
llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll rename to llvm/test/CodeGen/M68k/CodeModel/Medium/medium-pie-global-access.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/medium-pie.ll b/llvm/test/CodeGen/M68k/CodeModel/Medium/medium-pie.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/medium-pie.ll rename to llvm/test/CodeGen/M68k/CodeModel/Medium/medium-pie.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/medium-static.ll b/llvm/test/CodeGen/M68k/CodeModel/Medium/medium-static.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/medium-static.ll rename to llvm/test/CodeGen/M68k/CodeModel/Medium/medium-static.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/small-pic.ll b/llvm/test/CodeGen/M68k/CodeModel/Small/small-pic.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/small-pic.ll rename to llvm/test/CodeGen/M68k/CodeModel/Small/small-pic.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll b/llvm/test/CodeGen/M68k/CodeModel/Small/small-pie-global-access.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll rename to llvm/test/CodeGen/M68k/CodeModel/Small/small-pie-global-access.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/small-pie.ll b/llvm/test/CodeGen/M68k/CodeModel/Small/small-pie.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/small-pie.ll rename to llvm/test/CodeGen/M68k/CodeModel/Small/small-pie.ll diff --git a/llvm/test/CodeGen/M68k/CodeModel/small-static.ll b/llvm/test/CodeGen/M68k/CodeModel/Small/small-static.ll similarity index 100% rename from llvm/test/CodeGen/M68k/CodeModel/small-static.ll rename to llvm/test/CodeGen/M68k/CodeModel/Small/small-static.ll diff --git a/llvm/test/CodeGen/M68k/TLS/tls-arid.ll b/llvm/test/CodeGen/M68k/TLS/tls-arid.ll new file mode 100644 index 0000000000000..88189f648854b --- /dev/null +++ b/llvm/test/CodeGen/M68k/TLS/tls-arid.ll @@ -0,0 +1,19 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=m68k < %s | FileCheck %s + +@tls = internal thread_local global <{ [5 x i8], [1 x i8] }> <{ [5 x i8] zeroinitializer, [1 x i8] undef }>, align 4 + +define i8 @tls_arid(ptr noundef nonnull %0) unnamed_addr #2 { +; CHECK-LABEL: tls_arid: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %start +; CHECK-NEXT: suba.l #4, %sp +; CHECK-NEXT: .cfi_def_cfa_offset -8 +; CHECK-NEXT: jsr __m68k_read_tp +; CHECK-NEXT: move.b (tls@TPOFF+4,%a0), %d0 +; CHECK-NEXT: adda.l #4, %sp +; CHECK-NEXT: rts +start: + %1 = load i8, ptr getelementptr inbounds (i8, ptr @tls, i32 4), align 4 + ret i8 %1 +} diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/icmp.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/icmp.mir index 3c81427231a35..c3ccc820fc02b 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/icmp.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/icmp.mir @@ -209,10 +209,10 @@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 @@ -242,10 +242,10 
@@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 @@ -275,10 +275,10 @@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 @@ -308,10 +308,10 
@@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sle), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sle), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 @@ -341,10 +341,10 @@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 @@ -374,10 +374,10 
@@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 @@ -407,10 +407,10 @@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 @@ -440,10 +440,10 
@@ body: | ; MIPS32-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[COPY]](s32), [[COPY2]] - ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[COPY]](s32), [[COPY2]] + ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: $v0 = COPY [[SELECT]](s32) ; MIPS32-NEXT: RetRA implicit $v0 %2:_(s32) = COPY $a0 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/icmp.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/icmp.ll index 49f2a78683b0a..2f8d075bf766a 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/icmp.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/icmp.ll @@ -188,8 +188,8 @@ entry: define i1 @sgt_i64(i64 %a, i64 %b) { ; MIPS32-LABEL: sgt_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: slt $2, $7, $5 ; MIPS32-NEXT: sltu $1, $6, $4 +; MIPS32-NEXT: slt $2, $7, $5 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra @@ -202,10 +202,10 @@ entry: define i1 @sge_i64(i64 %a, i64 %b) { ; MIPS32-LABEL: sge_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: slt $1, $5, $7 -; MIPS32-NEXT: xori $2, $1, 1 ; MIPS32-NEXT: sltu $1, $4, $6 ; MIPS32-NEXT: xori $1, $1, 1 +; MIPS32-NEXT: slt $2, $5, $7 +; MIPS32-NEXT: xori $2, $2, 1 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra @@ -218,8 +218,8 @@ entry: define i1 @slt_i64(i64 %a, i64 %b) { ; 
MIPS32-LABEL: slt_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: slt $2, $5, $7 ; MIPS32-NEXT: sltu $1, $4, $6 +; MIPS32-NEXT: slt $2, $5, $7 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra @@ -232,10 +232,10 @@ entry: define i1 @sle_i64(i64 %a, i64 %b) { ; MIPS32-LABEL: sle_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: slt $1, $7, $5 -; MIPS32-NEXT: xori $2, $1, 1 ; MIPS32-NEXT: sltu $1, $6, $4 ; MIPS32-NEXT: xori $1, $1, 1 +; MIPS32-NEXT: slt $2, $7, $5 +; MIPS32-NEXT: xori $2, $2, 1 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra @@ -248,8 +248,8 @@ entry: define i1 @ugt_i64(i64 %a, i64 %b) { ; MIPS32-LABEL: ugt_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: sltu $2, $7, $5 ; MIPS32-NEXT: sltu $1, $6, $4 +; MIPS32-NEXT: sltu $2, $7, $5 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra @@ -262,10 +262,10 @@ entry: define i1 @uge_i64(i64 %a, i64 %b) { ; MIPS32-LABEL: uge_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: sltu $1, $5, $7 -; MIPS32-NEXT: xori $2, $1, 1 ; MIPS32-NEXT: sltu $1, $4, $6 ; MIPS32-NEXT: xori $1, $1, 1 +; MIPS32-NEXT: sltu $2, $5, $7 +; MIPS32-NEXT: xori $2, $2, 1 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra @@ -278,8 +278,8 @@ entry: define i1 @ult_i64(i64 %a, i64 %b) { ; MIPS32-LABEL: ult_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: sltu $2, $5, $7 ; MIPS32-NEXT: sltu $1, $4, $6 +; MIPS32-NEXT: sltu $2, $5, $7 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra @@ -292,10 +292,10 @@ entry: define i1 @ule_i64(i64 %a, i64 %b) { ; MIPS32-LABEL: ule_i64: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: sltu $1, $7, $5 -; MIPS32-NEXT: xori $2, $1, 1 ; MIPS32-NEXT: sltu $1, $6, $4 ; MIPS32-NEXT: xori $1, $1, 1 +; MIPS32-NEXT: sltu $2, $7, $5 +; MIPS32-NEXT: xori $2, $2, 1 ; MIPS32-NEXT: xor $3, $5, $7 ; MIPS32-NEXT: movz $2, $1, $3 ; MIPS32-NEXT: jr $ra diff --git 
a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll index 00d1c471c2fa7..501227c9072c4 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll @@ -1102,12 +1102,12 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: {{ $}} ; 32BIT-NEXT: renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0) ; 32BIT-NEXT: renamable $r12 = LWZ 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4) - ; 32BIT-NEXT: renamable $r0 = LWZ 0, %fixed-stack.1 :: (load (s32) from %fixed-stack.1, align 8) + ; 32BIT-NEXT: renamable $r0 = LBZ 3, %fixed-stack.1 :: (load (s8) from %fixed-stack.1 + 3, basealign 4) ; 32BIT-NEXT: renamable $r31 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) ; 32BIT-NEXT: renamable $r30 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) ; 32BIT-NEXT: renamable $r29 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5, align 8) - ; 32BIT-NEXT: renamable $r28 = LWZ 0, %fixed-stack.6 :: (load (s32) from %fixed-stack.6) - ; 32BIT-NEXT: renamable $r27 = LWZ 0, %fixed-stack.7 :: (load (s32) from %fixed-stack.7, align 16) + ; 32BIT-NEXT: renamable $r28 = LBZ 3, %fixed-stack.6 :: (load (s8) from %fixed-stack.6 + 3, basealign 4) + ; 32BIT-NEXT: renamable $r27 = LHA 2, %fixed-stack.7 :: (load (s16) from %fixed-stack.7 + 2, basealign 4) ; 32BIT-NEXT: renamable $r26 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) ; 32BIT-NEXT: renamable $r25 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4 @@ -1143,13 +1143,13 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 64BIT: bb.0.entry: ; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 ; 64BIT-NEXT: {{ $}} - ; 64BIT-NEXT: renamable $r11 = LWZ 0, 
%fixed-stack.1, implicit-def $x11 :: (load (s32) from %fixed-stack.1) + ; 64BIT-NEXT: renamable $r11 = LBZ 3, %fixed-stack.1, implicit-def $x11 :: (load (s8) from %fixed-stack.1 + 3, basealign 4) ; 64BIT-NEXT: renamable $x12 = LWZ8 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4) - ; 64BIT-NEXT: renamable $x0 = LWA 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0) - ; 64BIT-NEXT: renamable $x2 = LD 0, %fixed-stack.2 :: (load (s64) from %fixed-stack.2) - ; 64BIT-NEXT: renamable $x31 = LWA 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3) - ; 64BIT-NEXT: renamable $r30 = LWZ 0, %fixed-stack.5, implicit-def $x30 :: (load (s32) from %fixed-stack.5) - ; 64BIT-NEXT: renamable $x29 = LWA 0, %fixed-stack.6 :: (load (s32) from %fixed-stack.6) + ; 64BIT-NEXT: renamable $r0 = LBZ 3, %fixed-stack.5, implicit-def $x0 :: (load (s8) from %fixed-stack.5 + 3, basealign 4) + ; 64BIT-NEXT: renamable $x2 = LWA 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0) + ; 64BIT-NEXT: renamable $x31 = LD 0, %fixed-stack.2 :: (load (s64) from %fixed-stack.2) + ; 64BIT-NEXT: renamable $x30 = LWA 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3) + ; 64BIT-NEXT: renamable $x29 = LHA8 2, %fixed-stack.6 ; 64BIT-NEXT: renamable $x28 = LD 0, %fixed-stack.7 :: (load (s64) from %fixed-stack.7, align 16) ; 64BIT-NEXT: renamable $r3 = nsw ADD4 renamable $r3, renamable $r4, implicit killed $x4, implicit killed $x3 ; 64BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r5, implicit killed $x5 @@ -1161,12 +1161,12 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 64BIT-NEXT: renamable $x3 = EXTSW_32_64 killed renamable $r3 ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x28 ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x29 - ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x30 + ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, 
killed renamable $x0 ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x12 + ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x30 ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x31 - ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x2 ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x11 - ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x0 + ; 64BIT-NEXT: renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x2 ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 entry: %add = add nsw i32 %i1, %i2 @@ -1611,8 +1611,8 @@ define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroex ; 32BIT-NEXT: liveins: $f1, $f2, $f3, $f4 ; 32BIT-NEXT: {{ $}} ; 32BIT-NEXT: renamable $r3 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3) - ; 32BIT-NEXT: renamable $r4 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5) - ; 32BIT-NEXT: renamable $r5 = LWZ 0, %fixed-stack.6 :: (load (s32) from %fixed-stack.6, align 8) + ; 32BIT-NEXT: renamable $r4 = LHA 2, %fixed-stack.5 :: (load (s16) from %fixed-stack.5 + 2, basealign 4) + ; 32BIT-NEXT: renamable $r5 = LBZ 3, %fixed-stack.6 :: (load (s8) from %fixed-stack.6 + 3, basealign 4) ; 32BIT-NEXT: renamable $r6 = LWZ 0, %fixed-stack.2 :: (load (s32) from %fixed-stack.2, align 8) ; 32BIT-NEXT: renamable $r7 = LIS 17200 ; 32BIT-NEXT: STW killed renamable $r7, 0, %stack.1 :: (store (s32) into %stack.1, align 8) diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 433d427344466..79c59e925302a 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1181,78 +1181,95 @@ entry: declare void @test_stackarg_float3(i32, i32, i32, i32, i32, i32, i32, ...) 
-define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i64 %ll9, i16 signext %s10, i8 zeroext %c11, i32 %ui12, i32 %si13, i64 %ll14, i8 zeroext %uc15, i32 %i16) { +define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i64 %ll9, i16 signext %s10, i8 zeroext %c11, i32 %ui12, i32 %si13, i64 %ll14, i8 zeroext %uc15, i32 %i16, i8 signext %si8, i1 zeroext %zi1) { ; ASM32PWR4-LABEL: test_ints_stack: ; ASM32PWR4: # %bb.0: # %entry ; ASM32PWR4-NEXT: add 3, 3, 4 -; ASM32PWR4-NEXT: lwz 11, 92(1) +; ASM32PWR4-NEXT: stw 31, -4(1) # 4-byte Folded Spill ; ASM32PWR4-NEXT: add 3, 3, 5 ; ASM32PWR4-NEXT: add 3, 3, 6 ; ASM32PWR4-NEXT: add 3, 3, 7 -; ASM32PWR4-NEXT: lwz 12, 76(1) +; ASM32PWR4-NEXT: lbz 12, 99(1) ; ASM32PWR4-NEXT: add 3, 3, 8 ; ASM32PWR4-NEXT: add 3, 3, 9 -; ASM32PWR4-NEXT: lwz 6, 60(1) +; ASM32PWR4-NEXT: lwz 0, 92(1) ; ASM32PWR4-NEXT: add 3, 3, 10 -; ASM32PWR4-NEXT: srawi 5, 11, 31 +; ASM32PWR4-NEXT: extsb 4, 12 ; ASM32PWR4-NEXT: srawi 8, 3, 31 -; ASM32PWR4-NEXT: lwz 4, 64(1) +; ASM32PWR4-NEXT: lwz 31, 76(1) +; ASM32PWR4-NEXT: srawi 12, 0, 31 +; ASM32PWR4-NEXT: lwz 6, 60(1) +; ASM32PWR4-NEXT: lha 11, 66(1) ; ASM32PWR4-NEXT: lwz 7, 56(1) -; ASM32PWR4-NEXT: stw 31, -4(1) # 4-byte Folded Spill -; ASM32PWR4-NEXT: srawi 31, 12, 31 +; ASM32PWR4-NEXT: stw 30, -8(1) # 4-byte Folded Spill +; ASM32PWR4-NEXT: srawi 30, 31, 31 ; ASM32PWR4-NEXT: addc 3, 3, 6 ; ASM32PWR4-NEXT: adde 7, 8, 7 -; ASM32PWR4-NEXT: lwz 6, 68(1) -; ASM32PWR4-NEXT: srawi 8, 4, 31 -; ASM32PWR4-NEXT: addc 3, 3, 4 +; ASM32PWR4-NEXT: lbz 6, 71(1) +; ASM32PWR4-NEXT: srawi 8, 11, 31 +; ASM32PWR4-NEXT: addc 3, 3, 11 ; ASM32PWR4-NEXT: adde 7, 7, 8 -; ASM32PWR4-NEXT: lwz 4, 72(1) +; ASM32PWR4-NEXT: lwz 9, 72(1) ; ASM32PWR4-NEXT: addc 3, 3, 6 ; ASM32PWR4-NEXT: addze 6, 7 -; ASM32PWR4-NEXT: addc 3, 3, 4 -; ASM32PWR4-NEXT: lwz 0, 84(1) -; ASM32PWR4-NEXT: addze 4, 6 -; ASM32PWR4-NEXT: addc 3, 3, 12 +; ASM32PWR4-NEXT: addc 3, 3, 9 
+; ASM32PWR4-NEXT: lwz 5, 84(1) +; ASM32PWR4-NEXT: addze 6, 6 +; ASM32PWR4-NEXT: addc 3, 3, 31 ; ASM32PWR4-NEXT: lwz 7, 80(1) -; ASM32PWR4-NEXT: adde 4, 4, 31 +; ASM32PWR4-NEXT: adde 6, 6, 30 +; ASM32PWR4-NEXT: addc 3, 3, 5 +; ASM32PWR4-NEXT: lbz 8, 91(1) +; ASM32PWR4-NEXT: adde 5, 6, 7 +; ASM32PWR4-NEXT: addc 3, 3, 8 +; ASM32PWR4-NEXT: lbz 6, 103(1) +; ASM32PWR4-NEXT: addze 5, 5 ; ASM32PWR4-NEXT: addc 3, 3, 0 -; ASM32PWR4-NEXT: lwz 6, 88(1) -; ASM32PWR4-NEXT: adde 4, 4, 7 -; ASM32PWR4-NEXT: addc 3, 3, 6 +; ASM32PWR4-NEXT: adde 5, 5, 12 ; ASM32PWR4-NEXT: lwz 31, -4(1) # 4-byte Folded Reload -; ASM32PWR4-NEXT: addze 6, 4 -; ASM32PWR4-NEXT: addc 4, 3, 11 -; ASM32PWR4-NEXT: adde 3, 6, 5 +; ASM32PWR4-NEXT: srawi 7, 4, 31 +; ASM32PWR4-NEXT: addc 3, 3, 4 +; ASM32PWR4-NEXT: adde 5, 5, 7 +; ASM32PWR4-NEXT: lwz 30, -8(1) # 4-byte Folded Reload +; ASM32PWR4-NEXT: addc 4, 3, 6 +; ASM32PWR4-NEXT: addze 3, 5 ; ASM32PWR4-NEXT: blr ; ; ASM64PWR4-LABEL: test_ints_stack: ; ASM64PWR4: # %bb.0: # %entry ; ASM64PWR4-NEXT: add 3, 3, 4 -; ASM64PWR4-NEXT: ld 4, 112(1) +; ASM64PWR4-NEXT: std 31, -8(1) # 8-byte Folded Spill ; ASM64PWR4-NEXT: add 3, 3, 5 ; ASM64PWR4-NEXT: add 3, 3, 6 ; ASM64PWR4-NEXT: add 3, 3, 7 -; ASM64PWR4-NEXT: lwa 12, 124(1) +; ASM64PWR4-NEXT: std 2, -16(1) # 8-byte Folded Spill ; ASM64PWR4-NEXT: add 3, 3, 8 ; ASM64PWR4-NEXT: add 3, 3, 9 +; ASM64PWR4-NEXT: ld 6, 112(1) ; ASM64PWR4-NEXT: add 3, 3, 10 ; ASM64PWR4-NEXT: extsw 3, 3 -; ASM64PWR4-NEXT: lwz 5, 132(1) -; ASM64PWR4-NEXT: add 3, 3, 4 +; ASM64PWR4-NEXT: lha 0, 126(1) +; ASM64PWR4-NEXT: add 3, 3, 6 +; ASM64PWR4-NEXT: add 3, 3, 0 +; ASM64PWR4-NEXT: lbz 5, 135(1) +; ASM64PWR4-NEXT: lwz 7, 140(1) +; ASM64PWR4-NEXT: add 3, 3, 5 +; ASM64PWR4-NEXT: lwa 12, 148(1) +; ASM64PWR4-NEXT: add 3, 3, 7 ; ASM64PWR4-NEXT: add 3, 3, 12 -; ASM64PWR4-NEXT: std 2, -8(1) # 8-byte Folded Spill +; ASM64PWR4-NEXT: ld 31, 152(1) +; ASM64PWR4-NEXT: lbz 5, 167(1) +; ASM64PWR4-NEXT: add 3, 3, 31 +; ASM64PWR4-NEXT: lwa 11, 172(1) ; 
ASM64PWR4-NEXT: add 3, 3, 5 -; ASM64PWR4-NEXT: lwz 2, 140(1) -; ASM64PWR4-NEXT: lwa 11, 148(1) -; ASM64PWR4-NEXT: add 3, 3, 2 ; ASM64PWR4-NEXT: add 3, 3, 11 -; ASM64PWR4-NEXT: ld 4, 152(1) -; ASM64PWR4-NEXT: lwz 0, 164(1) +; ASM64PWR4-NEXT: lbz 2, 183(1) +; ASM64PWR4-NEXT: lbz 6, 191(1) +; ASM64PWR4-NEXT: extsb 4, 2 ; ASM64PWR4-NEXT: add 3, 3, 4 -; ASM64PWR4-NEXT: lwa 5, 172(1) -; ASM64PWR4-NEXT: add 3, 3, 0 -; ASM64PWR4-NEXT: add 3, 3, 5 -; ASM64PWR4-NEXT: ld 2, -8(1) # 8-byte Folded Reload +; ASM64PWR4-NEXT: add 3, 3, 6 +; ASM64PWR4-NEXT: ld 2, -16(1) # 8-byte Folded Reload +; ASM64PWR4-NEXT: ld 31, -8(1) # 8-byte Folded Reload ; ASM64PWR4-NEXT: blr entry: %add = add nsw i32 %i1, %i2 @@ -1277,7 +1294,11 @@ entry: %add18 = add nsw i64 %add16, %conv17 %conv19 = sext i32 %i16 to i64 %add20 = add nsw i64 %add18, %conv19 - ret i64 %add20 + %conv21 = sext i8 %si8 to i64 + %add22 = add nsw i64 %add20, %conv21 + %conv23 = zext i1 %zi1 to i64 + %add24 = add nsw i64 %add22, %conv23 + ret i64 %add24 } @ll1 = common global i64 0, align 8 @@ -1720,17 +1741,17 @@ entry: define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroext %c1, i16 signext %s1, i64 %ll1, i32 %i1, i32 %i2, i32 %i3) { ; ASM32PWR4-LABEL: mix_callee: ; ASM32PWR4: # %bb.0: # %entry -; ASM32PWR4-NEXT: lwz 4, 60(1) +; ASM32PWR4-NEXT: lha 3, 62(1) ; ASM32PWR4-NEXT: lis 8, 17200 ; ASM32PWR4-NEXT: fadd 1, 1, 2 ; ASM32PWR4-NEXT: fadd 1, 1, 3 -; ASM32PWR4-NEXT: lwz 5, 56(1) -; ASM32PWR4-NEXT: lwz 3, 68(1) -; ASM32PWR4-NEXT: add 4, 5, 4 -; ASM32PWR4-NEXT: lwz 5, L..C34(2) # %const.0 +; ASM32PWR4-NEXT: lbz 5, 59(1) ; ASM32PWR4-NEXT: fadd 1, 1, 4 +; ASM32PWR4-NEXT: lwz 4, 68(1) +; ASM32PWR4-NEXT: add 3, 5, 3 +; ASM32PWR4-NEXT: lwz 5, L..C34(2) # %const.0 ; ASM32PWR4-NEXT: lwz 6, 72(1) -; ASM32PWR4-NEXT: add 3, 4, 3 +; ASM32PWR4-NEXT: add 3, 3, 4 ; ASM32PWR4-NEXT: lwz 7, 76(1) ; ASM32PWR4-NEXT: add 3, 3, 6 ; ASM32PWR4-NEXT: stw 8, -16(1) diff --git 
a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir index 3a292093aa607..5288bb5e87011 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir @@ -121,14 +121,14 @@ body: | ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY5]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY5]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s32), [[ICMP4]], [[ICMP5]] ; CHECK-NEXT: 
[[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]] ; CHECK-NEXT: $x10 = COPY [[COPY4]](s32) ; CHECK-NEXT: $x11 = COPY [[COPY5]](s32) @@ -266,14 +266,14 @@ body: | ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s32), [[ICMP4]], [[ICMP5]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]] ; CHECK-NEXT: $x10 = COPY [[SUB]](s32) ; CHECK-NEXT: $x11 = COPY [[SUB2]](s32) @@ -401,10 +401,10 @@ body: | ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], 
[[COPY3]] ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY5]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY5]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] ; CHECK-NEXT: $x10 = COPY [[COPY4]](s32) ; CHECK-NEXT: $x11 = COPY [[COPY5]](s32) ; CHECK-NEXT: $x12 = COPY [[SELECT]](s32) @@ -528,10 +528,10 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] ; CHECK-NEXT: $x10 = COPY [[SUB]](s32) ; CHECK-NEXT: $x11 
= COPY [[SUB2]](s32) ; CHECK-NEXT: $x12 = COPY [[SELECT]](s32) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv32.mir index 413dbe275dfde..8081cfbd7edab 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv32.mir @@ -108,10 +108,10 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL %ylo, [[C]](s32) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -137,10 +137,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xhi(s32), %yhi + ; CHECK-NEXT: 
[[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), %xlo(s32), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -262,10 +262,10 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL %ylo, [[C]](s32) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -291,10 +291,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), %xlo(s32), %ylo + ; 
CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -416,10 +416,10 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL %ylo, [[C]](s32) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -445,10 +445,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), %xlo(s32), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo 
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -570,10 +570,10 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL %ylo, [[C]](s32) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sle), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sle), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ASHR]](s32), [[ASHR1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -599,10 +599,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sle), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sle), %xlo(s32), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), 
[[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -719,10 +719,10 @@ body: | ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND %xlo, [[C1]] ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[AND]](s32), [[AND2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[AND]](s32), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -748,10 +748,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), %xlo(s32), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET 
implicit $x10 %xhi:_(s32) = COPY $x10 @@ -868,10 +868,10 @@ body: | ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND %xlo, [[C1]] ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[AND]](s32), [[AND2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[AND]](s32), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -897,10 +897,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sge), %xlo(s32), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -1017,10 +1017,10 @@ body: | ; CHECK-NEXT: 
[[AND1:%[0-9]+]]:_(s32) = G_AND %xlo, [[C1]] ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[AND]](s32), [[AND2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[AND]](s32), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -1046,10 +1046,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(uge), %xlo(s32), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -1166,10 +1166,10 @@ body: | ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND %xlo, [[C1]] ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) 
= G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[AND]](s32), [[AND2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[AND]](s32), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[AND1]](s32), [[AND3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -1195,10 +1195,10 @@ body: | ; CHECK-NEXT: %xlo:_(s32) = COPY $x11 ; CHECK-NEXT: %yhi:_(s32) = COPY $x12 ; CHECK-NEXT: %ylo:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xhi(s32), %yhi - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xhi(s32), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ule), %xlo(s32), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %xlo(s32), %ylo + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 %xhi:_(s32) = COPY $x10 @@ -1533,3 +1533,53 @@ body: | PseudoRET implicit $x10 ... 
+--- +name: cmp_slt_i128 +body: | + bb.1: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: cmp_slt_i128 + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 8) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s32) from unknown-address + 8, align 8) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from unknown-address + 12) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p0) :: (load (s32), align 8) + ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s32) + ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p0) :: (load (s32) from unknown-address + 4) + ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; CHECK-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s32) from unknown-address + 8, align 8) + ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C2]](s32) + ; CHECK-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p0) :: (load (s32) from unknown-address + 12) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[LOAD]](s32), [[LOAD4]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[LOAD1]](s32), [[LOAD5]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[LOAD1]](s32), 
[[LOAD5]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[LOAD2]](s32), [[LOAD6]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[LOAD2]](s32), [[LOAD6]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s32), [[SELECT]], [[ICMP3]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[LOAD3]](s32), [[LOAD7]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[LOAD3]](s32), [[LOAD7]] + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s32), [[SELECT1]], [[ICMP5]] + ; CHECK-NEXT: $x10 = COPY [[SELECT2]](s32) + ; CHECK-NEXT: PseudoRET implicit $x10 + %2:_(p0) = COPY $x10 + %0:_(s128) = G_LOAD %2(p0) :: (load (s128), align 8) + %3:_(p0) = COPY $x11 + %1:_(s128) = G_LOAD %3(p0) :: (load (s128), align 8) + %4:_(s1) = G_ICMP intpred(slt), %0(s128), %1 + %5:_(s32) = G_ANYEXT %4(s1) + $x10 = COPY %5(s32) + PseudoRET implicit $x10 + +... 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv64.mir index 27ac60ae14313..74d745732d59d 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-icmp-rv64.mir @@ -130,12 +130,12 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL %ylo, [[C]](s64) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -162,12 +162,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = 
G_ICMP intpred(ugt), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), %xlo(s64), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -312,12 +312,12 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL %ylo, [[C]](s64) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT 
[[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -344,12 +344,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), %xlo(s64), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -494,12 +494,12 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL %ylo, [[C]](s64) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: 
[[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -526,12 +526,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), %xlo(s64), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -676,12 +676,12 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) ; CHECK-NEXT: 
[[SHL1:%[0-9]+]]:_(s64) = G_SHL %ylo, [[C]](s64) ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sle), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sle), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ASHR]](s64), [[ASHR1]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -708,12 +708,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sle), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sle), %xlo(s64), %ylo + ; CHECK-NEXT: 
[[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -854,12 +854,12 @@ body: | ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND %xlo, [[C1]] ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), [[AND]](s64), [[AND2]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), [[AND]](s64), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -886,12 +886,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), 
%xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), %xlo(s64), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -1032,12 +1032,12 @@ body: | ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND %xlo, [[C1]] ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND]](s64), [[AND2]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND]](s64), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; 
CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -1064,12 +1064,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sge), %xlo(s64), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -1210,12 +1210,12 @@ body: | ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND %xlo, [[C1]] ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), [[AND]](s64), [[AND2]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC 
[[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), [[AND]](s64), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -1242,12 +1242,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(uge), %xlo(s64), %ylo + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -1388,12 +1388,12 @@ body: | ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND %xlo, [[C1]] ; 
CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND %yhi, [[C]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND %ylo, [[C1]] - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), [[AND]](s64), [[AND2]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), [[AND]](s64), [[AND2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[AND1]](s64), [[AND3]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -1420,12 +1420,12 @@ body: | ; CHECK-NEXT: %xlo:_(s64) = COPY $x11 ; CHECK-NEXT: %yhi:_(s64) = COPY $x12 ; CHECK-NEXT: %ylo:_(s64) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xhi(s64), %yhi - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s64), [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xhi(s64), %yhi + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ule), %xlo(s64), %ylo + ; CHECK-NEXT: 
[[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), %xlo(s64), %ylo + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s64), [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: %z0:_(s64) = G_ANYEXT [[SELECT]](s32) ; CHECK-NEXT: $x10 = COPY %z0(s64) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir index 4932b1aebdec6..227a8cd7eb5ba 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir @@ -59,10 +59,10 @@ body: | ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY5]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY5]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[COPY4]] ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[COPY5]] @@ -150,14 +150,14 @@ body: | ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], 
[[ICMP]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY5]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY5]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s32), [[ICMP4]], [[ICMP5]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]] ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY5]], [[C1]](s32) @@ -237,10 +237,10 @@ body: | ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP 
intpred(ult), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[SUB]] ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[SUB2]] ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32) @@ -325,14 +325,14 @@ body: | ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]] + ; CHECK-NEXT: 
[[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s32), [[ICMP1]], [[ICMP2]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s32), [[ICMP4]], [[ICMP5]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]] ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C1]](s32) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir index 6143755422f26..038d8e4b7f6c1 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir @@ -111,10 +111,10 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT 
[[SELECT]](s32), [[COPY]], [[COPY2]] ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir index bce8e25e9ceb1..631b6eb49ef24 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir @@ -111,10 +111,10 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[COPY]], [[COPY2]] ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umax-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umax-rv32.mir index fdb890a112e54..83ba02b90dd5a 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umax-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umax-rv32.mir @@ -107,10 +107,10 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12 ; 
CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[COPY]], [[COPY2]] ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umin-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umin-rv32.mir index 0c5830f5f1cd8..1c92dcf4b401e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umin-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-umin-rv32.mir @@ -107,10 +107,10 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]] + ; 
CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[COPY]], [[COPY2]] ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/and-shl.ll b/llvm/test/CodeGen/RISCV/and-shl.ll new file mode 100644 index 0000000000000..c3cb5d8e2e37d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/and-shl.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I + +define i32 @and_0xfff_shl_2(i32 %x) { +; RV32I-LABEL: and_0xfff_shl_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a0, a0, 18 +; RV32I-NEXT: ret +; +; RV64I-LABEL: and_0xfff_shl_2: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 52 +; RV64I-NEXT: srli a0, a0, 50 +; RV64I-NEXT: ret + %a = and i32 %x, 4095 + %s = shl i32 %a, 2 + ret i32 %s +} + +define i32 @and_0x7ff_shl_2(i32 %x) { +; RV32I-LABEL: and_0x7ff_shl_2: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a0, a0, 2047 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: and_0x7ff_shl_2: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a0, a0, 2047 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: ret + %a = and i32 %x, 2047 + %s = shl i32 %a, 2 + ret i32 %s +} + +define i64 @and_0xffffffff_shl_2(i64 %x) { +; RV32I-LABEL: and_0xffffffff_shl_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: srli a1, a0, 30 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: and_0xffffffff_shl_2: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 30 +; RV64I-NEXT: ret + %a = and i64 %x, 4294967295 + %s = shl i64 %a, 2 + ret i64 %s +} + 
+define i32 @and_0xfff_shl_2_multi_use(i32 %x) { +; RV32I-LABEL: and_0xfff_shl_2_multi_use: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a0, a0, 20 +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: and_0xfff_shl_2_multi_use: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 52 +; RV64I-NEXT: srli a0, a0, 52 +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret + %a = and i32 %x, 4095 + %s = shl i32 %a, 2 + %r = add i32 %a, %s + ret i32 %r +} diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 356dce2979565..cdccc712a4698 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -82,6 +82,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV32XTHEADSYNC %s ; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcia %s -o - | FileCheck --check-prefix=RV32XQCIA %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s @@ -389,6 +390,7 @@ ; RV32XTHEADSYNC: .attribute 5, "rv32i2p1_xtheadsync1p0" ; RV32XWCHC: .attribute 5, "rv32i2p1_xwchc2p2" ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p2" +; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2" ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0" diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll index 
0db45ae71bc8a..41d8abb9b73eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -395,3 +395,76 @@ define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) { %s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> ret <4 x half> %s } + +define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI30_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: lui a0, 11 +; CHECK-NEXT: addi a0, a0, -1366 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v18, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 +; CHECK-NEXT: ret + %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> + ret <16 x float> %out +} + +define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x float> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes_one_identity: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI31_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, -272 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret + %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> + ret <16 x float> %out +} + +define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x float> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI32_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle16.v v20, (a0) +; CHECK-NEXT: lui a0, 15 +; CHECK-NEXT: addi a0, a0, 
240 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vrgather.vi v16, v8, 7 +; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret + %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> + ret <16 x float> %out +} + +define <16 x float> @shuffle_disjoint_lanes_one_splat(float %v, <16 x float> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes_one_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI33_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: lui a0, 15 +; CHECK-NEXT: addi a0, a0, 240 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x float> poison, float %v, i32 0 + %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer + %out = shufflevector <16 x float> %splat, <16 x float> %w, <16 x i32> + ret <16 x float> %out +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll index 7f4483a8f77d9..ddcb3c3121bc3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -124,42 +124,40 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) { ; ZVFH32: # %bb.0: ; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH32-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFH32-NEXT: lui a1, 8 ; ZVFH32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFH32-NEXT: vmv.x.s a2, v9 -; ZVFH32-NEXT: addi a1, a1, -1 +; ZVFH32-NEXT: vmv.x.s a1, v9 ; ZVFH32-NEXT: vslidedown.vi v9, v9, 1 -; ZVFH32-NEXT: vmv.x.s a3, v8 -; ZVFH32-NEXT: and a2, a2, a1 -; ZVFH32-NEXT: vmv.x.s a4, v9 -; ZVFH32-NEXT: and a1, a4, a1 -; ZVFH32-NEXT: slli a4, a3, 17 -; ZVFH32-NEXT: slli a3, a3, 30 -; ZVFH32-NEXT: srli a4, a4, 19 -; ZVFH32-NEXT: slli a1, a1, 15 -; ZVFH32-NEXT: or a2, a2, a3 -; 
ZVFH32-NEXT: or a1, a2, a1 +; ZVFH32-NEXT: vmv.x.s a2, v8 +; ZVFH32-NEXT: slli a1, a1, 17 +; ZVFH32-NEXT: srli a1, a1, 17 +; ZVFH32-NEXT: slli a3, a2, 30 +; ZVFH32-NEXT: or a1, a1, a3 +; ZVFH32-NEXT: vmv.x.s a3, v9 +; ZVFH32-NEXT: slli a2, a2, 17 +; ZVFH32-NEXT: slli a3, a3, 17 +; ZVFH32-NEXT: srli a2, a2, 19 +; ZVFH32-NEXT: srli a3, a3, 2 +; ZVFH32-NEXT: or a1, a1, a3 ; ZVFH32-NEXT: sw a1, 0(a0) -; ZVFH32-NEXT: sh a4, 4(a0) +; ZVFH32-NEXT: sh a2, 4(a0) ; ZVFH32-NEXT: ret ; ; ZVFH64-LABEL: fp2si_v3f32_v3i15: ; ZVFH64: # %bb.0: ; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFH64-NEXT: lui a1, 8 -; ZVFH64-NEXT: vmv.x.s a2, v9 -; ZVFH64-NEXT: addiw a1, a1, -1 +; ZVFH64-NEXT: vmv.x.s a1, v9 ; ZVFH64-NEXT: vslidedown.vi v8, v9, 1 ; ZVFH64-NEXT: vslidedown.vi v9, v9, 2 -; ZVFH64-NEXT: and a2, a2, a1 -; ZVFH64-NEXT: vmv.x.s a3, v8 -; ZVFH64-NEXT: and a1, a3, a1 +; ZVFH64-NEXT: slli a1, a1, 49 +; ZVFH64-NEXT: vmv.x.s a2, v8 ; ZVFH64-NEXT: vmv.x.s a3, v9 +; ZVFH64-NEXT: srli a1, a1, 49 +; ZVFH64-NEXT: slli a2, a2, 49 ; ZVFH64-NEXT: slli a3, a3, 30 -; ZVFH64-NEXT: slli a1, a1, 15 -; ZVFH64-NEXT: or a2, a2, a3 -; ZVFH64-NEXT: or a1, a2, a1 +; ZVFH64-NEXT: srli a2, a2, 34 +; ZVFH64-NEXT: or a1, a1, a3 +; ZVFH64-NEXT: or a1, a1, a2 ; ZVFH64-NEXT: slli a2, a1, 19 ; ZVFH64-NEXT: srli a2, a2, 51 ; ZVFH64-NEXT: sw a1, 0(a0) @@ -170,42 +168,40 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) { ; ZVFHMIN32: # %bb.0: ; ZVFHMIN32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN32-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFHMIN32-NEXT: lui a1, 8 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN32-NEXT: vmv.x.s a2, v9 -; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: vmv.x.s a1, v9 ; ZVFHMIN32-NEXT: vslidedown.vi v9, v9, 1 -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: and a2, a2, a1 -; ZVFHMIN32-NEXT: vmv.x.s a4, v9 -; ZVFHMIN32-NEXT: and a1, a4, a1 -; ZVFHMIN32-NEXT: slli a4, a3, 17 -; ZVFHMIN32-NEXT: slli a3, a3, 30 -; 
ZVFHMIN32-NEXT: srli a4, a4, 19 -; ZVFHMIN32-NEXT: slli a1, a1, 15 -; ZVFHMIN32-NEXT: or a2, a2, a3 -; ZVFHMIN32-NEXT: or a1, a2, a1 +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: slli a1, a1, 17 +; ZVFHMIN32-NEXT: srli a1, a1, 17 +; ZVFHMIN32-NEXT: slli a3, a2, 30 +; ZVFHMIN32-NEXT: or a1, a1, a3 +; ZVFHMIN32-NEXT: vmv.x.s a3, v9 +; ZVFHMIN32-NEXT: slli a2, a2, 17 +; ZVFHMIN32-NEXT: slli a3, a3, 17 +; ZVFHMIN32-NEXT: srli a2, a2, 19 +; ZVFHMIN32-NEXT: srli a3, a3, 2 +; ZVFHMIN32-NEXT: or a1, a1, a3 ; ZVFHMIN32-NEXT: sw a1, 0(a0) -; ZVFHMIN32-NEXT: sh a4, 4(a0) +; ZVFHMIN32-NEXT: sh a2, 4(a0) ; ZVFHMIN32-NEXT: ret ; ; ZVFHMIN64-LABEL: fp2si_v3f32_v3i15: ; ZVFHMIN64: # %bb.0: ; ZVFHMIN64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFHMIN64-NEXT: lui a1, 8 -; ZVFHMIN64-NEXT: vmv.x.s a2, v9 -; ZVFHMIN64-NEXT: addiw a1, a1, -1 +; ZVFHMIN64-NEXT: vmv.x.s a1, v9 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 1 ; ZVFHMIN64-NEXT: vslidedown.vi v9, v9, 2 -; ZVFHMIN64-NEXT: and a2, a2, a1 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: and a1, a3, a1 +; ZVFHMIN64-NEXT: slli a1, a1, 49 +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: vmv.x.s a3, v9 +; ZVFHMIN64-NEXT: srli a1, a1, 49 +; ZVFHMIN64-NEXT: slli a2, a2, 49 ; ZVFHMIN64-NEXT: slli a3, a3, 30 -; ZVFHMIN64-NEXT: slli a1, a1, 15 -; ZVFHMIN64-NEXT: or a2, a2, a3 -; ZVFHMIN64-NEXT: or a1, a2, a1 +; ZVFHMIN64-NEXT: srli a2, a2, 34 +; ZVFHMIN64-NEXT: or a1, a1, a3 +; ZVFHMIN64-NEXT: or a1, a1, a2 ; ZVFHMIN64-NEXT: slli a2, a1, 19 ; ZVFHMIN64-NEXT: srli a2, a2, 51 ; ZVFHMIN64-NEXT: sw a1, 0(a0) @@ -221,42 +217,40 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) { ; ZVFH32: # %bb.0: ; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH32-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFH32-NEXT: lui a1, 16 ; ZVFH32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFH32-NEXT: vmv.x.s a2, v9 -; ZVFH32-NEXT: addi a1, a1, -1 +; ZVFH32-NEXT: vmv.x.s a1, v9 ; ZVFH32-NEXT: vslidedown.vi v9, v9, 1 -; 
ZVFH32-NEXT: vmv.x.s a3, v8 -; ZVFH32-NEXT: and a2, a2, a1 -; ZVFH32-NEXT: vmv.x.s a4, v9 -; ZVFH32-NEXT: and a1, a4, a1 -; ZVFH32-NEXT: slli a4, a3, 17 -; ZVFH32-NEXT: slli a3, a3, 30 -; ZVFH32-NEXT: srli a4, a4, 19 -; ZVFH32-NEXT: slli a1, a1, 15 -; ZVFH32-NEXT: or a2, a2, a3 -; ZVFH32-NEXT: or a1, a2, a1 +; ZVFH32-NEXT: vmv.x.s a2, v8 +; ZVFH32-NEXT: slli a1, a1, 16 +; ZVFH32-NEXT: srli a1, a1, 16 +; ZVFH32-NEXT: slli a3, a2, 30 +; ZVFH32-NEXT: or a1, a1, a3 +; ZVFH32-NEXT: vmv.x.s a3, v9 +; ZVFH32-NEXT: slli a2, a2, 17 +; ZVFH32-NEXT: slli a3, a3, 16 +; ZVFH32-NEXT: srli a2, a2, 19 +; ZVFH32-NEXT: srli a3, a3, 1 +; ZVFH32-NEXT: or a1, a1, a3 ; ZVFH32-NEXT: sw a1, 0(a0) -; ZVFH32-NEXT: sh a4, 4(a0) +; ZVFH32-NEXT: sh a2, 4(a0) ; ZVFH32-NEXT: ret ; ; ZVFH64-LABEL: fp2ui_v3f32_v3i15: ; ZVFH64: # %bb.0: ; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFH64-NEXT: lui a1, 16 -; ZVFH64-NEXT: vmv.x.s a2, v9 -; ZVFH64-NEXT: addiw a1, a1, -1 +; ZVFH64-NEXT: vmv.x.s a1, v9 ; ZVFH64-NEXT: vslidedown.vi v8, v9, 1 ; ZVFH64-NEXT: vslidedown.vi v9, v9, 2 -; ZVFH64-NEXT: and a2, a2, a1 -; ZVFH64-NEXT: vmv.x.s a3, v8 -; ZVFH64-NEXT: and a1, a3, a1 +; ZVFH64-NEXT: slli a1, a1, 48 +; ZVFH64-NEXT: vmv.x.s a2, v8 ; ZVFH64-NEXT: vmv.x.s a3, v9 +; ZVFH64-NEXT: srli a1, a1, 48 +; ZVFH64-NEXT: slli a2, a2, 48 ; ZVFH64-NEXT: slli a3, a3, 30 -; ZVFH64-NEXT: slli a1, a1, 15 -; ZVFH64-NEXT: or a2, a2, a3 -; ZVFH64-NEXT: or a1, a2, a1 +; ZVFH64-NEXT: srli a2, a2, 33 +; ZVFH64-NEXT: or a1, a1, a3 +; ZVFH64-NEXT: or a1, a1, a2 ; ZVFH64-NEXT: slli a2, a1, 19 ; ZVFH64-NEXT: srli a2, a2, 51 ; ZVFH64-NEXT: sw a1, 0(a0) @@ -267,42 +261,40 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) { ; ZVFHMIN32: # %bb.0: ; ZVFHMIN32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN32-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFHMIN32-NEXT: lui a1, 16 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN32-NEXT: vmv.x.s a2, v9 -; ZVFHMIN32-NEXT: addi a1, a1, -1 +; 
ZVFHMIN32-NEXT: vmv.x.s a1, v9 ; ZVFHMIN32-NEXT: vslidedown.vi v9, v9, 1 -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: and a2, a2, a1 -; ZVFHMIN32-NEXT: vmv.x.s a4, v9 -; ZVFHMIN32-NEXT: and a1, a4, a1 -; ZVFHMIN32-NEXT: slli a4, a3, 17 -; ZVFHMIN32-NEXT: slli a3, a3, 30 -; ZVFHMIN32-NEXT: srli a4, a4, 19 -; ZVFHMIN32-NEXT: slli a1, a1, 15 -; ZVFHMIN32-NEXT: or a2, a2, a3 -; ZVFHMIN32-NEXT: or a1, a2, a1 +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: slli a1, a1, 16 +; ZVFHMIN32-NEXT: srli a1, a1, 16 +; ZVFHMIN32-NEXT: slli a3, a2, 30 +; ZVFHMIN32-NEXT: or a1, a1, a3 +; ZVFHMIN32-NEXT: vmv.x.s a3, v9 +; ZVFHMIN32-NEXT: slli a2, a2, 17 +; ZVFHMIN32-NEXT: slli a3, a3, 16 +; ZVFHMIN32-NEXT: srli a2, a2, 19 +; ZVFHMIN32-NEXT: srli a3, a3, 1 +; ZVFHMIN32-NEXT: or a1, a1, a3 ; ZVFHMIN32-NEXT: sw a1, 0(a0) -; ZVFHMIN32-NEXT: sh a4, 4(a0) +; ZVFHMIN32-NEXT: sh a2, 4(a0) ; ZVFHMIN32-NEXT: ret ; ; ZVFHMIN64-LABEL: fp2ui_v3f32_v3i15: ; ZVFHMIN64: # %bb.0: ; ZVFHMIN64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFHMIN64-NEXT: lui a1, 16 -; ZVFHMIN64-NEXT: vmv.x.s a2, v9 -; ZVFHMIN64-NEXT: addiw a1, a1, -1 +; ZVFHMIN64-NEXT: vmv.x.s a1, v9 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 1 ; ZVFHMIN64-NEXT: vslidedown.vi v9, v9, 2 -; ZVFHMIN64-NEXT: and a2, a2, a1 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: and a1, a3, a1 +; ZVFHMIN64-NEXT: slli a1, a1, 48 +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: vmv.x.s a3, v9 +; ZVFHMIN64-NEXT: srli a1, a1, 48 +; ZVFHMIN64-NEXT: slli a2, a2, 48 ; ZVFHMIN64-NEXT: slli a3, a3, 30 -; ZVFHMIN64-NEXT: slli a1, a1, 15 -; ZVFHMIN64-NEXT: or a2, a2, a3 -; ZVFHMIN64-NEXT: or a1, a2, a1 +; ZVFHMIN64-NEXT: srli a2, a2, 33 +; ZVFHMIN64-NEXT: or a1, a1, a3 +; ZVFHMIN64-NEXT: or a1, a1, a2 ; ZVFHMIN64-NEXT: slli a2, a1, 19 ; ZVFHMIN64-NEXT: srli a2, a2, 51 ; ZVFHMIN64-NEXT: sw a1, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index e9fd0a19e3eb6..139f7b4e6a0c8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -3296,11 +3296,11 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) { ; RVA22U64-LABEL: buildvec_v4i16_pack: ; RVA22U64: # %bb.0: ; RVA22U64-NEXT: slli a3, a3, 48 -; RVA22U64-NEXT: zext.h a2, a2 +; RVA22U64-NEXT: slli a2, a2, 48 ; RVA22U64-NEXT: zext.h a0, a0 -; RVA22U64-NEXT: zext.h a1, a1 -; RVA22U64-NEXT: slli a2, a2, 32 -; RVA22U64-NEXT: slli a1, a1, 16 +; RVA22U64-NEXT: slli a1, a1, 48 +; RVA22U64-NEXT: srli a2, a2, 16 +; RVA22U64-NEXT: srli a1, a1, 32 ; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: or a0, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index ebcea741a2e8b..10156141119a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -451,21 +451,14 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6: ; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI26_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 6 -; CHECK-NEXT: vmv.v.i v11, 0 -; CHECK-NEXT: lui a0, 8256 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: li a0, 98 -; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v11, v10, 5 +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: li a0, 20 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, 
v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10 ; CHECK-NEXT: ret %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %shuff @@ -693,12 +686,12 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI46_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: li a0, -22 +; CHECK-NEXT: li a0, 84 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -1073,3 +1066,76 @@ define <16 x i64> @shuffle_zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) { %out = shufflevector <16 x i64> %v1, <16 x i64> %v2, <16 x i32> ret <16 x i64> %out } + +define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI74_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: lui a0, 11 +; CHECK-NEXT: addi a0, a0, -1366 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v18, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 +; CHECK-NEXT: ret + %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> + ret <16 x i32> %out +} + +define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes_one_identity: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI75_0) +; CHECK-NEXT: addi a0, a0, 
%lo(.LCPI75_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: li a0, -272 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret + %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> + ret <16 x i32> %out +} + +define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI76_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI76_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle16.v v20, (a0) +; CHECK-NEXT: lui a0, 15 +; CHECK-NEXT: addi a0, a0, 240 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vrgather.vi v16, v8, 7 +; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret + %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> + ret <16 x i32> %out +} + +define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) { +; CHECK-LABEL: shuffle_disjoint_lanes_one_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI77_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI77_0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: lui a1, 15 +; CHECK-NEXT: addi a1, a1, 240 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i32> poison, i32 %v, i32 0 + %splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer + %out = shufflevector <16 x i32> %splat, <16 x i32> %w, <16 x i32> + ret <16 x i32> %out +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 8833634be1a0e..67d649902b022 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -183,406 +183,499 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 6 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: li a3, 96 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb -; RV32-NEXT: addi a3, a1, 256 -; RV32-NEXT: addi a4, a1, 128 +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb +; RV32-NEXT: addi a3, a1, 128 +; RV32-NEXT: addi a4, a1, 256 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: lui a5, 12291 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a6, 41 -; RV32-NEXT: mul a1, a1, a6 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, 1 -; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: li a5, 768 +; RV32-NEXT: lui a6, 12291 +; RV32-NEXT: lui a7, %hi(.LCPI8_1) +; RV32-NEXT: addi a7, a7, %lo(.LCPI8_1) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 57 -; RV32-NEXT: mul a4, a4, a6 +; RV32-NEXT: li t0, 88 +; RV32-NEXT: mul a4, a4, t0 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, a1, -64 +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 72 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; 
RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a3) -; RV32-NEXT: vmv.s.x v3, a5 -; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a6, a6, 3 +; RV32-NEXT: vle16.v v4, (a7) +; RV32-NEXT: vmv.s.x v3, a6 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 80 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vcompress.vm v8, v24, v3 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v8, v24, v0 +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 3 +; RV32-NEXT: lui a3, 49164 +; RV32-NEXT: lui a4, %hi(.LCPI8_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3) +; RV32-NEXT: slli a1, a1, 10 +; RV32-NEXT: addi a3, a3, 12 +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vle16.v v16, (a4) +; RV32-NEXT: vmv.s.x v20, a3 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 88 ; 
RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t -; RV32-NEXT: lui a1, 12 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 49 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 80 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v12, v16, 4 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a4, a3, 4 -; RV32-NEXT: add a3, a4, a3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v12, v24, 10, v0.t +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v8 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; 
RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, 49164 -; RV32-NEXT: lui a3, %hi(.LCPI8_1) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_1) -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v28, (a3) -; RV32-NEXT: addi a1, a1, 12 -; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 +; RV32-NEXT: vrgatherei16.vv v8, v24, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vcompress.vm v8, v0, v20 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 3 +; RV32-NEXT: lui a3, 196656 +; RV32-NEXT: lui a4, 12 +; RV32-NEXT: lui a5, 786624 +; RV32-NEXT: li a6, 48 +; RV32-NEXT: lui a7, 768 +; RV32-NEXT: li t0, 192 +; RV32-NEXT: addi a1, a1, 3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a4, a4, 12 +; RV32-NEXT: addi a5, a5, 192 +; RV32-NEXT: addi a7, a7, 768 +; RV32-NEXT: vmv.s.x v1, a6 +; RV32-NEXT: vmv.s.x v8, t0 +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs1r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vmv.s.x v14, a3 +; RV32-NEXT: vmv.s.x v7, a4 +; RV32-NEXT: vmv.s.x v3, a5 +; RV32-NEXT: vmv.s.x v2, a7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 88 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv 
v8, v16, v28, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v14 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v12, v16, 2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 88 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v24, 8, v0.t -; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v8 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, 196656 -; RV32-NEXT: lui a3, %hi(.LCPI8_2) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_2) -; RV32-NEXT: li a4, 960 -; RV32-NEXT: lui a5, %hi(.LCPI8_3) -; RV32-NEXT: addi a5, a5, %lo(.LCPI8_3) -; RV32-NEXT: 
addi a1, a1, 48 -; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v8, (a5) -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a4, a3, 3 -; RV32-NEXT: add a3, a4, a3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vmv.s.x v22, a1 +; RV32-NEXT: vmerge.vvm v4, v24, v16, v0 +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vcompress.vm v8, v24, v22 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 88 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma 
+; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 36 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v0, v16 +; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 20 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 88 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 80 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v16, 6, v0.t -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v8 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 88 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded 
Spill -; RV32-NEXT: lui a1, 786624 -; RV32-NEXT: lui a3, %hi(.LCPI8_4) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_4) -; RV32-NEXT: lui a4, %hi(.LCPI8_5) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_5) -; RV32-NEXT: addi a1, a1, 192 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI8_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) +; RV32-NEXT: lui a3, %hi(.LCPI8_5) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_5) +; RV32-NEXT: lui a4, 3073 ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: vle16.v v24, (a3) +; RV32-NEXT: addi a3, a4, -1024 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v12, (a4) -; RV32-NEXT: vmv.s.x v14, a1 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vcompress.vm v16, v24, v14 +; RV32-NEXT: vle16.v v2, (a1) +; RV32-NEXT: vmv.s.x v0, a3 ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v8, v24 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v4, v0, v12 +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v12, v2 +; RV32-NEXT: lui a1, %hi(.LCPI8_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2) +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: lui a1, %hi(.LCPI8_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) +; RV32-NEXT: vle16.v v14, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 80 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v4, v8, 4, v0.t -; RV32-NEXT: lui a1, 768 -; RV32-NEXT: lui a3, %hi(.LCPI8_6) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_6) -; RV32-NEXT: li a4, 1008 -; RV32-NEXT: addi a1, a1, 768 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: vmv.s.x v1, a4 -; RV32-NEXT: vmv.s.x v12, a1 +; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vcompress.vm v24, v16, v12 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: 
vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v16, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v24, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, 15 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v20, v16, v14 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI8_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) +; RV32-NEXT: lui a3, %hi(.LCPI8_7) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_7) +; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; RV32-NEXT: vle16.v v20, (a3) +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: lui a1, %hi(.LCPI8_9) 
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle16.v v0, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v20, v16, 6 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v20, v4, v16 +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 20 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t -; RV32-NEXT: lui a1, 3073 -; RV32-NEXT: lui a3, %hi(.LCPI8_8) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_8) -; RV32-NEXT: lui a4, %hi(.LCPI8_9) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_9) -; RV32-NEXT: addi a1, a1, -1024 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v24, v0 +; RV32-NEXT: lui a1, %hi(.LCPI8_8) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_8) +; RV32-NEXT: lui a3, %hi(.LCPI8_10) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_10) +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: lui a1, %hi(.LCPI8_11) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_11) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v16, (a3) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v2, (a4) -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vle16.v v14, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; 
RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vcompress.vm v8, v24, v0 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs2r.v v14, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 57 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: li a3, 36 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v0, v4, v12 +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v0, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v24, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 4 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 88 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v24, v2, v0.t +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 52 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: 
vmv.v.v v4, v24 +; RV32-NEXT: vl2r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v16, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv.v.v v20, v24 -; RV32-NEXT: vmv.v.v v12, v8 +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v12, (a1) +; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: vse32.v v0, (a1) ; RV32-NEXT: addi a1, a0, 192 -; RV32-NEXT: vse32.v v4, (a1) +; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 3 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 6 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 +; RV32-NEXT: li a2, 80 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a1, a0, 6 -; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: li a1, 96 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -594,479 +687,444 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} 
@load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 67 +; RV64-NEXT: li a3, 88 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc3, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 67 * vlenb -; RV64-NEXT: addi a2, a1, 128 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a1) -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 59 -; RV64-NEXT: mul a3, a3, a4 -; RV64-NEXT: add a3, sp, a3 -; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV64-NEXT: addi a1, a1, 256 -; RV64-NEXT: li a3, 128 -; RV64-NEXT: vle64.v v24, (a1) -; RV64-NEXT: lui a1, 1 -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vmv.s.x v0, a3 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 30 -; RV64-NEXT: mul a3, a3, a4 -; RV64-NEXT: add a3, sp, a3 -; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill -; RV64-NEXT: li a3, 6 -; RV64-NEXT: vmul.vx v6, v8, a3 -; RV64-NEXT: li a3, 56 +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb +; RV64-NEXT: addi a3, a1, 128 +; RV64-NEXT: addi a6, a1, 256 +; RV64-NEXT: li a4, 128 +; RV64-NEXT: lui a2, 1 +; RV64-NEXT: lui a5, %hi(.LCPI8_0) +; RV64-NEXT: addi a5, a5, %lo(.LCPI8_0) +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.v.i v16, 6 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v8, (a6) +; RV64-NEXT: lui a6, 16 +; RV64-NEXT: addi a6, a6, 7 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.v.x v17, a6 +; RV64-NEXT: addi a6, a2, 65 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgather.vi v8, v24, 4 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: li a5, 22 -; RV64-NEXT: mul a4, a4, a5 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vs8r.v 
v24, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v24, v8, 4 +; RV64-NEXT: vrgather.vi v20, v8, 5 +; RV64-NEXT: csrr a7, vlenb +; RV64-NEXT: li t0, 68 +; RV64-NEXT: mul a7, a7, t0 +; RV64-NEXT: add a7, sp, a7 +; RV64-NEXT: addi a7, a7, 16 +; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill +; RV64-NEXT: vrgatherei16.vv v20, v8, v16 +; RV64-NEXT: csrr a7, vlenb +; RV64-NEXT: li t0, 84 +; RV64-NEXT: mul a7, a7, t0 +; RV64-NEXT: add a7, sp, a7 +; RV64-NEXT: addi a7, a7, 16 +; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill +; RV64-NEXT: vrgatherei16.vv v20, v8, v17 +; RV64-NEXT: csrr a7, vlenb +; RV64-NEXT: li t0, 72 +; RV64-NEXT: mul a7, a7, t0 +; RV64-NEXT: add a7, sp, a7 +; RV64-NEXT: addi a7, a7, 16 +; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v16, v8, 2 +; RV64-NEXT: csrr a7, vlenb +; RV64-NEXT: slli a7, a7, 6 +; RV64-NEXT: add a7, sp, a7 +; RV64-NEXT: addi a7, a7, 16 +; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v16, v8, 3 +; RV64-NEXT: csrr a7, vlenb +; RV64-NEXT: li t0, 56 +; RV64-NEXT: mul a7, a7, t0 +; RV64-NEXT: add a7, sp, a7 +; RV64-NEXT: addi a7, a7, 16 +; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v24, 8 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: li a5, 39 -; RV64-NEXT: mul a4, a4, a5 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v16, v8, 8 +; RV64-NEXT: csrr a7, vlenb +; RV64-NEXT: li t0, 48 +; RV64-NEXT: mul a7, a7, t0 +; RV64-NEXT: add a7, sp, a7 +; RV64-NEXT: addi a7, a7, 16 +; RV64-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v21, a4 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: vle64.v v0, (a3) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a1, a1, a3 +; 
RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vle16.v v2, (a5) +; RV64-NEXT: vmv.s.x v20, a6 +; RV64-NEXT: vmv1r.v v0, v21 +; RV64-NEXT: vmv1r.v v7, v21 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 2, v0.t -; RV64-NEXT: vmv.v.v v20, v8 -; RV64-NEXT: vmv.s.x v8, a3 +; RV64-NEXT: vrgather.vi v24, v16, 2, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 60 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v20 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 +; RV64-NEXT: vmv8r.v v16, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 76 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vrgatherei16.vv v8, v24, v2 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 2 +; RV64-NEXT: lui a3, %hi(.LCPI8_1) +; RV64-NEXT: addi a3, a3, %lo(.LCPI8_1) +; RV64-NEXT: addi a1, a1, 130 +; RV64-NEXT: vle16.v v8, (a3) ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 55 -; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs1r.v v8, (a3) # Unknown-size Folded Spill -; RV64-NEXT: addi a3, a1, 65 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v8, (a2) -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a4, 47 -; RV64-NEXT: mul a2, a2, a4 -; RV64-NEXT: add a2, 
sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v16, a3 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 35 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs1r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v16, v6, -16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 5 -; RV64-NEXT: sub a2, a3, a2 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv2r.v v18, v6 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 12 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 59 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 35 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl1r.v v16, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vcompress.vm v24, v0, v16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 5 -; RV64-NEXT: sub a2, a3, a2 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v24, v8, v16, v0.t -; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v20, v24 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 18 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi 
a2, a2, 16 -; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 22 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v2, a1 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs1r.v v7, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 68 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 48 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v8, v24, 5 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 30 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 39 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vrgather.vi v8, v24, 3, v0.t -; RV64-NEXT: vmv.v.v v20, v8 -; RV64-NEXT: lui a2, 2 -; RV64-NEXT: addi a2, a2, 130 -; RV64-NEXT: vmv.s.x v8, a2 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v16, v18, -15 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 59 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vcompress.vm v24, v0, v8 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, 
a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 47 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v24, v8, v16, v0.t -; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v20, v24 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 14 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill -; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addi a2, a2, 7 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.i v8, 6 -; RV64-NEXT: vmv.v.x v9, a2 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 22 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgatherei16.vv v12, v16, v8 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vrgatherei16.vv v12, v16, v9 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 5 -; RV64-NEXT: sub a2, a3, a2 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv4r.v v8, v16 -; RV64-NEXT: vrgather.vi v12, v16, 2 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 35 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vrgather.vi v12, v16, 3 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded 
Spill -; RV64-NEXT: lui a2, 4 +; RV64-NEXT: vrgather.vi v24, v8, 3, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 68 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v24, v16 +; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a3, 24 -; RV64-NEXT: addi a2, a2, 260 -; RV64-NEXT: vmv.s.x v0, a3 -; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v24, a2 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 12 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl2r.v v2, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v6, v2, -14 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 59 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vcompress.vm v8, v16, v24 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 47 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v6, v0.t -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 22 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, 
a2, 16 -; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 30 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl1r.v v1, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 39 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v28, (a2) # Unknown-size Folded Reload +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 4 +; RV64-NEXT: lui a3, 8 +; RV64-NEXT: addi a1, a1, 260 +; RV64-NEXT: addi a3, a3, 520 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: vmv.s.x v2, a3 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 76 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl1r.v v7, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 84 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 48 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; 
RV64-NEXT: vrgather.vi v28, v24, 4, v0.t -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v28, (a2) # Unknown-size Folded Spill -; RV64-NEXT: lui a2, 8 -; RV64-NEXT: addi a2, a2, 520 -; RV64-NEXT: vmv.s.x v7, a2 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v4, v2, -13 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 59 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vcompress.vm v8, v24, v7 -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v4, v0.t -; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 39 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 5 -; RV64-NEXT: sub a2, a3, a2 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v24, v16, 4, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 84 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 76 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi 
a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv8r.v v16, v8 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 72 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 48 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv4r.v v8, v24 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v8, v16, 5, v0.t -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 5 -; RV64-NEXT: sub a2, a3, a2 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v8, (a2) # Unknown-size Folded Spill -; RV64-NEXT: lui a2, 96 +; RV64-NEXT: vrgather.vi v12, v24, 5, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 72 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 96 ; RV64-NEXT: li a3, 192 -; RV64-NEXT: vmv.s.x v1, a3 +; RV64-NEXT: vmv.s.x v3, a3 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a2 -; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 35 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vmv1r.v v0, v3 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 35 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi 
a2, a2, 16 -; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 1040 -; RV64-NEXT: li a3, 28 -; RV64-NEXT: vmv.s.x v20, a2 +; RV64-NEXT: vrgatherei16.vv v24, v8, v12, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, %hi(.LCPI8_2) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2) +; RV64-NEXT: li a3, 1040 +; RV64-NEXT: lui a4, 112 +; RV64-NEXT: addi a4, a4, 1 ; RV64-NEXT: vmv.s.x v0, a3 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 30 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v22, v2, -12 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 59 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vcompress.vm v8, v24, v20 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 47 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v22, v0.t -; RV64-NEXT: lui a2, 112 -; RV64-NEXT: addi a2, a2, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v12, a2 -; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v4, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.x v12, a4 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle16.v v6, (a1) +; RV64-NEXT: vmv8r.v v24, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 76 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, 
a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v3 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 56 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v4, v16, v12, v0.t -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 22 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill -; RV64-NEXT: addi a1, a1, -2016 -; RV64-NEXT: vmv.s.x v12, a1 +; RV64-NEXT: vrgatherei16.vv v16, v8, v12, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 59 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, a2, -2016 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vcompress.vm v16, v24, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m2, 
ta, ma -; RV64-NEXT: vadd.vi v12, v2, -11 +; RV64-NEXT: vrgatherei16.vv v16, v8, v6 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 30 +; RV64-NEXT: li a2, 76 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v8, v24, v8, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 47 +; RV64-NEXT: li a2, 76 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, %hi(.LCPI8_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) +; RV64-NEXT: vle16.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 60 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v8, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 60 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 68 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v0, v8 +; RV64-NEXT: csrr a1, 
vlenb +; RV64-NEXT: li a2, 84 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v8, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 84 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vrgatherei16.vv v16, v8, v24 +; RV64-NEXT: lui a1, %hi(.LCPI8_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) +; RV64-NEXT: vle16.v v8, (a1) +; RV64-NEXT: lui a1, %hi(.LCPI8_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5) +; RV64-NEXT: vle16.v v6, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: li a2, 72 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v12, v16 ; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vrgatherei16.vv v24, v16, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v24 +; RV64-NEXT: vmv.v.v v8, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 35 +; RV64-NEXT: li a2, 76 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v20, (a1) # 
Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v20, v8 -; RV64-NEXT: vmv4r.v v8, v4 -; RV64-NEXT: vmv.v.v v8, v16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vrgatherei16.vv v24, v16, v6 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 56 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v16, v24 ; RV64-NEXT: addi a1, a0, 256 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v20, (a1) -; RV64-NEXT: addi a1, a0, 320 ; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: addi a1, a0, 320 +; RV64-NEXT: vse64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 55 +; RV64-NEXT: li a3, 84 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 64 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 14 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vse64.v v0, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 18 +; RV64-NEXT: li a2, 60 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 67 +; RV64-NEXT: li a1, 88 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 141d54cf585f2..c6e12c52122d2 
100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -3205,88 +3205,86 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; ; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a3 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB40_2: # %else -; RV64ZVE32F-NEXT: andi a3, a2, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_4 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; 
RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB40_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_14 +; RV64ZVE32F-NEXT: bnez a2, .LBB40_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_15 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB40_15 ; RV64ZVE32F-NEXT: .LBB40_6: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_16 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB40_16 ; RV64ZVE32F-NEXT: .LBB40_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_9 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_9 ; RV64ZVE32F-NEXT: .LBB40_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB40_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; 
RV64ZVE32F-NEXT: beqz a3, .LBB40_11 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: .LBB40_11: # %else17 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB40_13 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB40_13 ; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: and a1, a2, a1 -; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 48 +; RV64ZVE32F-NEXT: srli a1, a1, 46 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -3298,44 +3296,44 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x 
v12, a3 +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_6 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_6 ; RV64ZVE32F-NEXT: .LBB40_15: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a3 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_7 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_7 ; RV64ZVE32F-NEXT: .LBB40_16: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB40_8 ; RV64ZVE32F-NEXT: j .LBB40_9 %eidxs = 
zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs @@ -5643,124 +5641,122 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; ; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a5, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 -; RV64ZVE32F-NEXT: addiw a5, a5, -1 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB53_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a5 -; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: bnez a4, .LBB53_4 ; RV64ZVE32F-NEXT: .LBB53_2: ; RV64ZVE32F-NEXT: ld a4, 8(a2) ; RV64ZVE32F-NEXT: j .LBB53_5 ; RV64ZVE32F-NEXT: .LBB53_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: beqz a4, .LBB53_2 ; RV64ZVE32F-NEXT: .LBB53_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: and a4, a4, a5 -; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: slli a4, a4, 48 +; RV64ZVE32F-NEXT: srli a4, a4, 45 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB53_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a7, 
.LBB53_10 +; RV64ZVE32F-NEXT: beqz a6, .LBB53_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 -; RV64ZVE32F-NEXT: and a7, a7, a5 -; RV64ZVE32F-NEXT: slli a7, a7, 3 -; RV64ZVE32F-NEXT: add a7, a1, a7 -; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 8 -; RV64ZVE32F-NEXT: bnez t0, .LBB53_11 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: slli a6, a6, 48 +; RV64ZVE32F-NEXT: srli a6, a6, 45 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB53_11 ; RV64ZVE32F-NEXT: .LBB53_7: -; RV64ZVE32F-NEXT: ld t0, 24(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 16 -; RV64ZVE32F-NEXT: bnez t1, .LBB53_12 +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB53_12 ; RV64ZVE32F-NEXT: .LBB53_8: -; RV64ZVE32F-NEXT: ld t1, 32(a2) -; RV64ZVE32F-NEXT: andi t2, a6, 32 -; RV64ZVE32F-NEXT: bnez t2, .LBB53_13 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB53_13 ; RV64ZVE32F-NEXT: .LBB53_9: -; RV64ZVE32F-NEXT: ld t2, 40(a2) +; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB53_14 ; RV64ZVE32F-NEXT: .LBB53_10: -; RV64ZVE32F-NEXT: ld a7, 16(a2) -; RV64ZVE32F-NEXT: andi t0, a6, 8 -; RV64ZVE32F-NEXT: beqz t0, .LBB53_7 +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: beqz a7, .LBB53_7 ; RV64ZVE32F-NEXT: .LBB53_11: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 -; RV64ZVE32F-NEXT: and t0, t0, a5 -; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: slli a7, a7, 48 +; RV64ZVE32F-NEXT: srli a7, a7, 45 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB53_8 +; RV64ZVE32F-NEXT: .LBB53_12: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: slli 
t0, t0, 48 +; RV64ZVE32F-NEXT: srli t0, t0, 45 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 16 -; RV64ZVE32F-NEXT: beqz t1, .LBB53_8 -; RV64ZVE32F-NEXT: .LBB53_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 -; RV64ZVE32F-NEXT: and t1, t1, a5 -; RV64ZVE32F-NEXT: slli t1, t1, 3 -; RV64ZVE32F-NEXT: add t1, a1, t1 -; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a6, 32 -; RV64ZVE32F-NEXT: beqz t2, .LBB53_9 +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB53_9 ; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t2, v8 -; RV64ZVE32F-NEXT: and t2, t2, a5 -; RV64ZVE32F-NEXT: slli t2, t2, 3 -; RV64ZVE32F-NEXT: add t2, a1, t2 -; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: slli t1, t1, 48 +; RV64ZVE32F-NEXT: srli t1, t1, 45 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB53_14: # %else14 -; RV64ZVE32F-NEXT: andi t3, a6, 64 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz t3, .LBB53_17 +; RV64ZVE32F-NEXT: beqz t2, .LBB53_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s t3, v8 -; RV64ZVE32F-NEXT: and t3, t3, a5 -; RV64ZVE32F-NEXT: slli t3, t3, 3 -; RV64ZVE32F-NEXT: add t3, a1, t3 -; RV64ZVE32F-NEXT: ld t3, 0(t3) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB53_18 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 48 +; RV64ZVE32F-NEXT: srli t2, t2, 45 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB53_18 ; RV64ZVE32F-NEXT: .LBB53_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB53_19 ; RV64ZVE32F-NEXT: .LBB53_17: -; RV64ZVE32F-NEXT: ld t3, 48(a2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, 
.LBB53_16 +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB53_16 ; RV64ZVE32F-NEXT: .LBB53_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: and a2, a2, a5 -; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: .LBB53_19: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) -; RV64ZVE32F-NEXT: sd a7, 16(a0) -; RV64ZVE32F-NEXT: sd t0, 24(a0) -; RV64ZVE32F-NEXT: sd t1, 32(a0) -; RV64ZVE32F-NEXT: sd t2, 40(a0) -; RV64ZVE32F-NEXT: sd t3, 48(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) ; RV64ZVE32F-NEXT: sd a1, 56(a0) ; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i64> @@ -10511,32 +10507,30 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; ; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, 
zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB89_2: # %else -; RV64ZVE32F-NEXT: andi a3, a2, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_4 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma @@ -10544,55 +10538,55 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: .LBB89_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_14 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_15 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_15 ; RV64ZVE32F-NEXT: .LBB89_6: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_16 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_16 ; RV64ZVE32F-NEXT: .LBB89_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_9 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_9 ; RV64ZVE32F-NEXT: .LBB89_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli 
zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB89_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_11 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: .LBB89_11: # %else17 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_13 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB89_13 ; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: and a1, a2, a1 -; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 48 +; 
RV64ZVE32F-NEXT: srli a1, a1, 46 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw fa5, 0(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -10604,44 +10598,44 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB89_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_6 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_6 ; RV64ZVE32F-NEXT: .LBB89_15: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_7 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_7 ; RV64ZVE32F-NEXT: .LBB89_16: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: 
vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_8 ; RV64ZVE32F-NEXT: j .LBB89_9 %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs @@ -12482,71 +12476,69 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; ; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a2, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a4, a3, 1 -; RV64ZVE32F-NEXT: addiw a2, a2, -1 -; RV64ZVE32F-NEXT: beqz a4, .LBB102_2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a4, v8 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa0, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB102_2: # %else -; RV64ZVE32F-NEXT: andi a4, a3, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB102_4 +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_4 ; RV64ZVE32F-NEXT: # %bb.3: # 
%cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa1, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB102_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a4, a3, 4 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB102_14 +; RV64ZVE32F-NEXT: bnez a3, .LBB102_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-NEXT: andi a4, a3, 8 -; RV64ZVE32F-NEXT: bnez a4, .LBB102_15 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: bnez a3, .LBB102_15 ; RV64ZVE32F-NEXT: .LBB102_6: # %else8 -; RV64ZVE32F-NEXT: andi a4, a3, 16 -; RV64ZVE32F-NEXT: bnez a4, .LBB102_16 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB102_16 ; RV64ZVE32F-NEXT: .LBB102_7: # %else11 -; RV64ZVE32F-NEXT: andi a4, a3, 32 -; RV64ZVE32F-NEXT: beqz a4, .LBB102_9 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_9 ; RV64ZVE32F-NEXT: .LBB102_8: # %cond.load13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v8 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa5, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB102_9: # %else14 -; RV64ZVE32F-NEXT: andi a4, a3, 64 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, 
v9, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB102_11 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s a4, v8 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa6, 0(a4) +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) ; RV64ZVE32F-NEXT: .LBB102_11: # %else17 -; RV64ZVE32F-NEXT: andi a3, a3, -128 -; RV64ZVE32F-NEXT: beqz a3, .LBB102_13 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_13 ; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a2, a3, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) ; RV64ZVE32F-NEXT: .LBB102_13: # %else20 @@ -12560,30 +12552,30 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB102_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a4, v8 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa2, 0(a4) -; RV64ZVE32F-NEXT: andi a4, a3, 8 -; RV64ZVE32F-NEXT: beqz a4, .LBB102_6 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_6 ; RV64ZVE32F-NEXT: .LBB102_15: # %cond.load7 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v8 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; 
RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa3, 0(a4) -; RV64ZVE32F-NEXT: andi a4, a3, 16 -; RV64ZVE32F-NEXT: beqz a4, .LBB102_7 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_7 ; RV64ZVE32F-NEXT: .LBB102_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa4, 0(a4) -; RV64ZVE32F-NEXT: andi a4, a3, 32 -; RV64ZVE32F-NEXT: bnez a4, .LBB102_8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB102_8 ; RV64ZVE32F-NEXT: j .LBB102_9 %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 575a757149ebb..7ec4726925704 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -2588,123 +2588,121 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vsetvli zero, zero, 
e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v8, (a3) +; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB34_2: # %else -; RV64ZVE32F-NEXT: andi a3, a2, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_4 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v11, (a3) +; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB34_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_12 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_13 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_13 ; RV64ZVE32F-NEXT: .LBB34_6: # %else6 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_14 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_14 ; RV64ZVE32F-NEXT: .LBB34_7: # 
%else8 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_9 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_9 ; RV64ZVE32F-NEXT: .LBB34_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB34_9: # %else10 -; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_15 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_16 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB34_16 ; RV64ZVE32F-NEXT: .LBB34_11: # %else14 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB34_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; 
RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_6 ; RV64ZVE32F-NEXT: .LBB34_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_7 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_7 ; RV64ZVE32F-NEXT: .LBB34_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_8 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_8 ; RV64ZVE32F-NEXT: j .LBB34_9 ; RV64ZVE32F-NEXT: .LBB34_15: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; 
RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB34_11 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB34_11 ; RV64ZVE32F-NEXT: .LBB34_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: and a1, a2, a1 -; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 48 +; RV64ZVE32F-NEXT: srli a1, a1, 46 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 @@ -4794,109 +4792,107 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld t2, 8(a0) -; RV64ZVE32F-NEXT: ld t1, 16(a0) -; RV64ZVE32F-NEXT: ld t0, 24(a0) -; RV64ZVE32F-NEXT: ld a7, 32(a0) -; RV64ZVE32F-NEXT: lui a4, 16 +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi t3, a6, 1 -; RV64ZVE32F-NEXT: addiw a4, a4, -1 -; RV64ZVE32F-NEXT: beqz t3, .LBB47_2 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB47_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s t3, v8 -; RV64ZVE32F-NEXT: and t3, t3, a4 -; RV64ZVE32F-NEXT: slli t3, t3, 3 -; RV64ZVE32F-NEXT: add 
t3, a1, t3 -; RV64ZVE32F-NEXT: sd a0, 0(t3) +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 48 +; RV64ZVE32F-NEXT: srli t2, t2, 45 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB47_2: # %else -; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t2, 0(a0) +; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB47_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a6, 8 +; RV64ZVE32F-NEXT: andi a0, a5, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_13 ; RV64ZVE32F-NEXT: .LBB47_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a6, 16 +; RV64ZVE32F-NEXT: andi a0, a5, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_14 ; RV64ZVE32F-NEXT: .LBB47_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a6, 32 +; RV64ZVE32F-NEXT: andi a0, a5, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_9 ; RV64ZVE32F-NEXT: .LBB47_8: # %cond.store9 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB47_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a6, 64 +; 
RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a6, -128 +; RV64ZVE32F-NEXT: andi a0, a5, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 ; RV64ZVE32F-NEXT: .LBB47_11: # %else14 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB47_12: # %cond.store3 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t1, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a6, 8 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a5, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 ; RV64ZVE32F-NEXT: .LBB47_13: # %cond.store5 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a6, 16 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a5, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_7 ; RV64ZVE32F-NEXT: .LBB47_14: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a6, 32 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a5, 32 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_8 ; RV64ZVE32F-NEXT: j .LBB47_9 ; RV64ZVE32F-NEXT: .LBB47_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: 
andi a0, a6, -128 +; RV64ZVE32F-NEXT: andi a0, a5, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_11 ; RV64ZVE32F-NEXT: .LBB47_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a2, 0(a0) ; RV64ZVE32F-NEXT: ret @@ -9463,123 +9459,121 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB83_2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v8, (a3) +; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB83_2: # %else -; RV64ZVE32F-NEXT: andi a3, a2, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB83_4 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; 
RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v11, (a3) +; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB83_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB83_12 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB83_13 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_13 ; RV64ZVE32F-NEXT: .LBB83_6: # %else6 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB83_14 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_14 ; RV64ZVE32F-NEXT: .LBB83_7: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB83_9 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_9 ; RV64ZVE32F-NEXT: .LBB83_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB83_9: # %else10 -; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; 
RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB83_15 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_16 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB83_16 ; RV64ZVE32F-NEXT: .LBB83_11: # %else14 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB83_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB83_6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_6 ; RV64ZVE32F-NEXT: .LBB83_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB83_7 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_7 ; RV64ZVE32F-NEXT: .LBB83_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; 
RV64ZVE32F-NEXT: vmv.x.s a3, v11 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB83_8 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_8 ; RV64ZVE32F-NEXT: j .LBB83_9 ; RV64ZVE32F-NEXT: .LBB83_15: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_11 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB83_11 ; RV64ZVE32F-NEXT: .LBB83_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: and a1, a2, a1 -; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 48 +; RV64ZVE32F-NEXT: srli a1, a1, 46 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 @@ -11270,101 +11264,99 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; ; 
RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB96_2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa0, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB96_2: # %else -; RV64ZVE32F-NEXT: andi a3, a2, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB96_4 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa1, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB96_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB96_12 +; RV64ZVE32F-NEXT: bnez a2, .LBB96_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; 
RV64ZVE32F-NEXT: bnez a3, .LBB96_13 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB96_13 ; RV64ZVE32F-NEXT: .LBB96_6: # %else6 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB96_14 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB96_14 ; RV64ZVE32F-NEXT: .LBB96_7: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB96_9 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_9 ; RV64ZVE32F-NEXT: .LBB96_8: # %cond.store9 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa5, 0(a3) +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB96_9: # %else10 -; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB96_15 +; RV64ZVE32F-NEXT: bnez a2, .LBB96_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: bnez a2, .LBB96_16 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB96_16 ; RV64ZVE32F-NEXT: .LBB96_11: # %else14 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB96_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB96_6 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_6 ; RV64ZVE32F-NEXT: .LBB96_13: # %cond.store5 
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB96_7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_7 ; RV64ZVE32F-NEXT: .LBB96_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB96_8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB96_8 ; RV64ZVE32F-NEXT: j .LBB96_9 ; RV64ZVE32F-NEXT: .LBB96_15: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa6, 0(a3) -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB96_11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB96_11 ; RV64ZVE32F-NEXT: .LBB96_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: and a1, a2, a1 -; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 48 +; RV64ZVE32F-NEXT: srli a1, a1, 45 ; RV64ZVE32F-NEXT: add a0, 
a0, a1 ; RV64ZVE32F-NEXT: fsd fa7, 0(a0) ; RV64ZVE32F-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index 188ef8fe35a4a..10dadbc022e02 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -11,21 +11,18 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lui a0, %hi(.LCPI0_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v9, -8 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: vmadd.vx v10, a0, v9 +; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: li a0, 73 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: li a0, 56 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vcompress.vm v11, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t -; CHECK-NEXT: vse8.v v11, (a1) +; CHECK-NEXT: vslidedown.vi v10, v8, 8 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vse8.v v10, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -39,21 +36,18 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, 146 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: li a0, 24 +; CHECK-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vcompress.vm v10, v8, v9 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 -; 
CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 -; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: li a0, 146 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vse8.v v10, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -105,20 +99,19 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v9, -8 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: li a0, 5 -; CHECK-NEXT: vmadd.vx v10, a0, v9 ; CHECK-NEXT: li a0, 33 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vcompress.vm v11, v8, v9 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, 28704 +; CHECK-NEXT: addi a0, a0, 1280 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t -; CHECK-NEXT: vse8.v v11, (a1) +; CHECK-NEXT: vslidedown.vi v9, v8, 8 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vse8.v v10, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 75a5eea1cb409..735621aa4390e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -5581,3 +5581,163 @@ vector.body: for.cond.cleanup: ret void } + +define void @sink_splat_fmuladd(ptr %a, ptr %b, float %x) { +; CHECK-LABEL: sink_splat_fmuladd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a1, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB121_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a2, .LBB121_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x float> %wide.load12) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +define void @sink_splat_fmuladd_commute(ptr %a, ptr %b, float %x) { +; CHECK-LABEL: sink_splat_fmuladd_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a1, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB122_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; 
CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a2, .LBB122_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +define void @sink_splat_vp_fmuladd(ptr %a, ptr %b, float %x, <4 x i1> %m, i32 %vl) { +; CHECK-LABEL: sink_splat_vp_fmuladd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: slli a4, a2, 32 +; CHECK-NEXT: add a2, a1, a3 +; CHECK-NEXT: srli a3, a4, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB123_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vsetvli zero, a3, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a2, .LBB123_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = 
shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +define void @sink_splat_vp_fmuladd_commute(ptr %a, ptr %b, float %x, <4 x i1> %m, i32 %vl) { +; CHECK-LABEL: sink_splat_vp_fmuladd_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: slli a4, a2, 32 +; CHECK-NEXT: add a2, a1, a3 +; CHECK-NEXT: srli a3, a4, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB124_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vsetvli zero, a3, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a2, .LBB124_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, 
align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index e13482d23a26f..a21e3df85193f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -2159,3 +2159,296 @@ define @vmerge_vvm( %a, i8 %b, @llvm.riscv.vmerge.nxv1i8.nxv1i8( undef, %2, %c, %m, iXLen %vl) ret %3 } + +define @vmand_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmand_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmand_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmand.mm v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmand.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmnand_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmnand_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmnand.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli 
zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmnand_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmnand.mm v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmnand.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmandn_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmandn_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmandn.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmandn_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmandn.mm v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmandn.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmxor_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmxor_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmxor.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; 
NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmxor_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmxor.mm v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmxor.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmor_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmor_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmor.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmor_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmor.mm v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmor.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + + +define @vmnor_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmnor_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmnor.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; 
VLOPT-LABEL: vmnor_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmnor.mm v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmnor.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmorn_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmorn_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmorn.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmorn_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmorn.mm v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmorn.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmxnor_mm( %a, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmxnor_mm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmxnor.mm v8, v0, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v8 +; NOVLOPT-NEXT: vmv1r.v v8, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmxnor_mm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmxnor.mm 
v8, v0, v8 +; VLOPT-NEXT: vmand.mm v0, v0, v8 +; VLOPT-NEXT: vmv1r.v v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmxnor.nxv1i1( %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmsbf_m( %a, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmsbf_m: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmsbf.m v9, v0 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v8, v8, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmsbf_m: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmsbf.m v9, v0 +; VLOPT-NEXT: vmand.mm v0, v0, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v8, v8, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmsbf.nxv1i1( %a, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmsif_m( %a, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmsif_m: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmsif.m v9, v0 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v8, v8, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmsif_m: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmsif.m v9, v0 +; VLOPT-NEXT: vmand.mm v0, v0, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v8, v8, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmsif.nxv1i1( %a, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, 
%1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + +define @vmsof_m( %a, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmsof_m: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmsof.m v9, v0 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v0, v9 +; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; NOVLOPT-NEXT: vadd.vv v8, v8, v8, v0.t +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmsof_m: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; VLOPT-NEXT: vmsof.m v9, v0 +; VLOPT-NEXT: vmand.mm v0, v0, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; VLOPT-NEXT: vadd.vv v8, v8, v8, v0.t +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmsof.nxv1i1( %a, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) + %3 = call @llvm.riscv.vadd.mask.nxv1i32.nxv1i32( %c, %c, %c, %2, iXLen %vl, iXLen 0) + ret %3 +} + diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir index 814894f4acea3..f1e7bb446482e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir @@ -542,4 +542,64 @@ body: | %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 PseudoVSSE8_V_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */ ... +--- +name: vmop_mm +body: | + bb.0: + ; CHECK-LABEL: name: vmop_mm + ; CHECK: %x:vr = PseudoVMAND_MM_M1 $noreg, $noreg, 1, 0 /* e8 */ + ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_M1 $noreg, %x, 1, 0 /* e8 */ + %x:vr = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 + %y:vr = PseudoVMAND_MM_M1 $noreg, %x, 1, 0 +... 
+--- +name: vmop_mm_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: vmop_mm_incompatible_eew + ; CHECK: %x:vr = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 /* e8 */ + ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 + %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 +... +--- +name: vmop_mm_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: vmop_mm_incompatible_emul + ; CHECK: %x:vr = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 /* e8 */ + ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_MF2 $noreg, %x, 1, 0 /* e8 */ + %x:vr = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 + %y:vr = PseudoVMAND_MM_MF2 $noreg, %x, 1, 0 +... +--- +name: vmop_mm_mask +body: | + bb.0: + ; CHECK-LABEL: name: vmop_mm_mask + ; CHECK: %x:vmv0 = PseudoVMAND_MM_M1 $noreg, $noreg, 1, 0 /* e8 */ + ; CHECK-NEXT: %y:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vmv0 = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 + %y:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %x, 1, 3 /* e8 */, 0 +... +--- +name: vmop_mm_mask_larger_emul_user +body: | + bb.0: + ; CHECK-LABEL: name: vmop_mm_mask_larger_emul_user + ; CHECK: %x:vmv0 = PseudoVMAND_MM_M1 $noreg, $noreg, 1, 0 /* e8 */ + ; CHECK-NEXT: %y:vrm2nov0 = PseudoVADD_VV_M2_MASK $noreg, $noreg, $noreg, %x, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vmv0 = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 + %y:vrm2nov0 = PseudoVADD_VV_M2_MASK $noreg, $noreg, $noreg, %x, 1, 4 /* e16 */, 0 +... +--- +name: vmop_mm_mask_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: vmop_mm_mask_incompatible_emul + ; CHECK: %x:vmv0 = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 /* e8 */ + ; CHECK-NEXT: %y:vrnov0 = PseudoVADD_VV_MF2_MASK $noreg, $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vmv0 = PseudoVMAND_MM_M1 $noreg, $noreg, -1, 0 + %y:vrnov0 = PseudoVADD_VV_MF2_MASK $noreg, $noreg, $noreg, %x, 1, 3 /* e8 */, 0 +... 
diff --git a/llvm/test/CodeGen/RISCV/sifive7-enable-intervals.mir b/llvm/test/CodeGen/RISCV/sifive7-enable-intervals.mir index e179e7f08752a..ec8a57d14e8f2 100644 --- a/llvm/test/CodeGen/RISCV/sifive7-enable-intervals.mir +++ b/llvm/test/CodeGen/RISCV/sifive7-enable-intervals.mir @@ -1,6 +1,6 @@ # RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -run-pass=machine-scheduler \ # RUN: -debug-only=machine-scheduler -misched-dump-schedule-trace \ -# RUN: -misched-topdown -o - %s 2>&1 | FileCheck %s +# RUN: -misched-prera-direction=topdown -o - %s 2>&1 | FileCheck %s # REQUIRES: asserts # The purpose of this test is to show that the VADD instructions are issued so diff --git a/llvm/test/CodeGen/X86/handle-move.ll b/llvm/test/CodeGen/X86/handle-move.ll index 0a43ef3fc22d4..c6da9589ff465 100644 --- a/llvm/test/CodeGen/X86/handle-move.ll +++ b/llvm/test/CodeGen/X86/handle-move.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=x86_64-- -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-bottomup -verify-machineinstrs < %s -; RUN: llc -mtriple=x86_64-- -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-topdown -verify-machineinstrs < %s +; RUN: llc -mtriple=x86_64-- -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-prera-direction=bottomup -verify-machineinstrs < %s +; RUN: llc -mtriple=x86_64-- -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-prera-direction=topdown -verify-machineinstrs < %s ; REQUIRES: asserts ; ; Test the LiveIntervals::handleMove() function. 
diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll index 39a20bf6637bb..fbd9bd073155e 100644 --- a/llvm/test/CodeGen/X86/isel-select-cmov.ll +++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll @@ -741,18 +741,18 @@ define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) nounwind { ; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: cmpl %eax, %esi +; GISEL-X86-NEXT: setb %bl ; GISEL-X86-NEXT: xorl %ecx, %ecx ; GISEL-X86-NEXT: cmpl %edx, %ebp -; GISEL-X86-NEXT: setb %bl -; GISEL-X86-NEXT: sete %cl -; GISEL-X86-NEXT: cmpl %eax, %esi ; GISEL-X86-NEXT: setb %bh +; GISEL-X86-NEXT: sete %cl ; GISEL-X86-NEXT: testl %ecx, %ecx ; GISEL-X86-NEXT: je LBB6_2 ; GISEL-X86-NEXT: ## %bb.1: -; GISEL-X86-NEXT: movb %bh, %bl +; GISEL-X86-NEXT: movb %bl, %bh ; GISEL-X86-NEXT: LBB6_2: -; GISEL-X86-NEXT: movzbl %bl, %edi +; GISEL-X86-NEXT: movzbl %bh, %edi ; GISEL-X86-NEXT: andl $1, %edi ; GISEL-X86-NEXT: je LBB6_4 ; GISEL-X86-NEXT: ## %bb.3: @@ -779,16 +779,16 @@ define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) nounwind { ; GISEL-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; GISEL-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi ; GISEL-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edi -; GISEL-X86-CMOV-NEXT: xorl %ebx, %ebx ; GISEL-X86-CMOV-NEXT: xorl %ecx, %ecx +; GISEL-X86-CMOV-NEXT: cmpl %esi, %ebp +; GISEL-X86-CMOV-NEXT: setb %cl +; GISEL-X86-CMOV-NEXT: xorl %ebx, %ebx +; GISEL-X86-CMOV-NEXT: xorl %eax, %eax ; GISEL-X86-CMOV-NEXT: cmpl %edi, %edx ; GISEL-X86-CMOV-NEXT: setb %bl -; GISEL-X86-CMOV-NEXT: sete %cl -; GISEL-X86-CMOV-NEXT: xorl %eax, %eax -; GISEL-X86-CMOV-NEXT: cmpl %esi, %ebp -; GISEL-X86-CMOV-NEXT: setb %al -; GISEL-X86-CMOV-NEXT: testl %ecx, %ecx -; GISEL-X86-CMOV-NEXT: cmovnew %ax, %bx +; GISEL-X86-CMOV-NEXT: sete %al +; GISEL-X86-CMOV-NEXT: testl %eax, %eax +; GISEL-X86-CMOV-NEXT: cmovnew %cx, %bx ; GISEL-X86-CMOV-NEXT: andl $1, %ebx ; 
GISEL-X86-CMOV-NEXT: cmovel %esi, %ebp ; GISEL-X86-CMOV-NEXT: cmovel %edi, %edx diff --git a/llvm/test/CodeGen/X86/misched-aa-colored.ll b/llvm/test/CodeGen/X86/misched-aa-colored.ll index 73626de163d00..3504e555cd9ca 100644 --- a/llvm/test/CodeGen/X86/misched-aa-colored.ll +++ b/llvm/test/CodeGen/X86/misched-aa-colored.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=x86-64 -enable-misched -misched-bottomup=0 -misched-topdown=0 -misched=shuffle -enable-aa-sched-mi | FileCheck %s +; RUN: llc < %s -mcpu=x86-64 -enable-misched -misched-prera-direction=bidirectional -misched=shuffle -enable-aa-sched-mi | FileCheck %s ; REQUIRES: asserts ; -misched=shuffle is NDEBUG only! diff --git a/llvm/test/CodeGen/X86/misched-matrix.ll b/llvm/test/CodeGen/X86/misched-matrix.ll index e909348eaa388..f44bf39e76f6f 100644 --- a/llvm/test/CodeGen/X86/misched-matrix.ll +++ b/llvm/test/CodeGen/X86/misched-matrix.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \ -; RUN: -misched-topdown -verify-machineinstrs \ +; RUN: -misched-prera-direction=topdown -verify-machineinstrs \ ; RUN: | FileCheck %s -check-prefix=TOPDOWN ; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \ ; RUN: -misched=ilpmin -verify-machineinstrs \ diff --git a/llvm/test/CodeGen/X86/misched-new.ll b/llvm/test/CodeGen/X86/misched-new.ll index 06ae8ff43d5af..d7b3604ceefc4 100644 --- a/llvm/test/CodeGen/X86/misched-new.ll +++ b/llvm/test/CodeGen/X86/misched-new.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -x86-early-ifcvt -enable-misched \ -; RUN: -misched=shuffle -misched-bottomup -verify-machineinstrs \ +; RUN: -misched=shuffle -misched-prera-direction=bottomup -verify-machineinstrs \ ; RUN: | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -x86-early-ifcvt -enable-misched \ -; RUN: -misched=shuffle -misched-topdown -verify-machineinstrs \ +; RUN: -misched=shuffle -misched-prera-direction=topdown 
-verify-machineinstrs \ ; RUN: | FileCheck %s --check-prefix TOPDOWN ; REQUIRES: asserts ; diff --git a/llvm/test/MC/Disassembler/M68k/control.txt b/llvm/test/MC/Disassembler/M68k/control.txt index d722dfd791fa9..58d5629855838 100644 --- a/llvm/test/MC/Disassembler/M68k/control.txt +++ b/llvm/test/MC/Disassembler/M68k/control.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -disassemble -triple m68k %s | FileCheck %s +# RUN: llvm-mc -disassemble -triple m68k -mcpu=M68020 %s | FileCheck %s # CHECK: bra $0 0x60 0x00 0x00 0x00 diff --git a/llvm/test/MC/ELF/debug-hash-file-empty-dwarf.s b/llvm/test/MC/ELF/debug-hash-file-empty-dwarf.s new file mode 100644 index 0000000000000..cc1c3d1796b6e --- /dev/null +++ b/llvm/test/MC/ELF/debug-hash-file-empty-dwarf.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -g -dwarf-version 5 -o %t %s +// RUN: llvm-dwarfdump -debug-info -debug-line %t | FileCheck %s + +// CHECK-NOT: DW_TAG_ + +// CHECK: include_directories[ 0] = +// CHECK-NOT: include_directories[ 1] = +// CHECK: file_names[ 0]: +// CHECK-NEXT: name: "/MyTest/Inputs/other.S" +// CHECK-NEXT: dir_index: 0 +// CHECK-NOT: file_names[ 1]: + +// RUN: llvm-mc -triple=x86_64 -filetype=obj -g -dwarf-version=5 -fdebug-prefix-map=/MyTest=/src_root %s -o %t.5.o +// RUN: llvm-dwarfdump -debug-info -debug-line %t.5.o | FileCheck %s --check-prefixes=MAP + +// MAP-NOT: DW_TAG_ + +// MAP: include_directories[ 0] = "{{.*}}" +// MAP-NEXT: file_names[ 0]: +// MAP-NEXT: name: "/src_root/Inputs/other.S" +// MAP-NEXT: dir_index: 0 + +# 1 "/MyTest/Inputs/other.S" + +.section .data +.asciz "data" diff --git a/llvm/test/MC/ELF/debug-hash-file.s b/llvm/test/MC/ELF/debug-hash-file.s index 99e3d6dca9bba..508813abcacd5 100644 --- a/llvm/test/MC/ELF/debug-hash-file.s +++ b/llvm/test/MC/ELF/debug-hash-file.s @@ -43,6 +43,12 @@ // MAP_V5-NEXT: name: "/src_root/Inputs/other.S" // MAP_V5-NEXT: dir_index: 0 +// RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -o %t %s +// RUN: 
llvm-readelf --sections %t | FileCheck %s --check-prefix=CHECK-NO-DEBUG + +// CHECK-NO-DEBUG: Section Headers: +// CHECK-NO-DEBUG-NOT: .debug_ + # 1 "/MyTest/Inputs/other.S" foo: diff --git a/llvm/test/MC/M68k/Atomics/cas.s b/llvm/test/MC/M68k/Atomics/cas.s index cb149e4c2b869..c6cae72c03f09 100644 --- a/llvm/test/MC/M68k/Atomics/cas.s +++ b/llvm/test/MC/M68k/Atomics/cas.s @@ -1,5 +1,6 @@ -; RUN: llvm-mc -show-encoding -triple=m68k -mcpu=M68020 %s | FileCheck %s +; RUN: llvm-mc -show-encoding -triple=m68k -mcpu=M68020 -motorola-integers %s | FileCheck %s +; Address Register Indirect ; CHECK: cas.b %d3, %d2, (%a2) ; CHECK-SAME: ; encoding: [0x0a,0xd2,0x00,0x83] cas.b %d3, %d2, (%a2) @@ -11,3 +12,42 @@ cas.w %d4, %d5, (%a3) ; CHECK: cas.l %d6, %d7, (%a4) ; CHECK-SAME: ; encoding: [0x0e,0xd4,0x01,0xc6] cas.l %d6, %d7, (%a4) + +; Address Register Indirect with Displacement +; CHECK: cas.b %d3, %d2, (5,%a2) +; CHECK-SAME: ; encoding: [0x0a,0xea,0x00,0x83] +cas.b %d3, %d2, (5, %a2) + +; CHECK: cas.w %d4, %d5, (6,%a3) +; CHECK-SAME: ; encoding: [0x0c,0xeb,0x01,0x44] +cas.w %d4, %d5, (6, %a3) + +; CHECK: cas.l %d6, %d7, (7,%a4) +; CHECK-SAME: ; encoding: [0x0e,0xec,0x01,0xc6] +cas.l %d6, %d7, (7, %a4) + +; Address Register Indirect with Index (Scale = 1) +; CHECK: cas.b %d3, %d2, (5,%a2,%d1) +; CHECK-SAME: ; encoding: [0x0a,0xf2,0x00,0x83] +cas.b %d3, %d2, (5, %a2, %d1) + +; CHECK: cas.w %d4, %d5, (6,%a3,%d1) +; CHECK-SAME: ; encoding: [0x0c,0xf3,0x01,0x44] +cas.w %d4, %d5, (6, %a3, %d1) + +; CHECK: cas.l %d6, %d7, (7,%a4,%d1) +; CHECK-SAME: ; encoding: [0x0e,0xf4,0x01,0xc6] +cas.l %d6, %d7, (7, %a4, %d1) + +; Absolute Long Address +; CHECK: cas.b %d3, %d2, $ffffffffffffffff +; CHECK-SAME: ; encoding: [0x0a,0xf8,0x00,0x83] +cas.b %d3, %d2, $ffffffffffffffff + +; CHECK: cas.w %d4, %d5, $ffffffffffffffff +; CHECK-SAME: ; encoding: [0x0c,0xf8,0x01,0x44] +cas.w %d4, %d5, $ffffffffffffffff + +; CHECK: cas.l %d6, %d7, $ffffffffffffffff +; CHECK-SAME: ; encoding: 
[0x0e,0xf8,0x01,0xc6] +cas.l %d6, %d7, $ffffffffffffffff diff --git a/llvm/test/MC/RISCV/xqcics-invalid.s b/llvm/test/MC/RISCV/xqcics-invalid.s new file mode 100644 index 0000000000000..e7effdfd263b6 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcics-invalid.s @@ -0,0 +1,121 @@ +# Xqcics - Qualcomm uC Conditional Select Extension +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcics < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-IMM %s +# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcics < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-EXT %s + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.selecteqi 9, 15, x4, x3 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selecteqi x9, 15, x4 + +# CHECK-IMM: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15] +qc.selecteqi x9, 16, x4, x3 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selecteqi x9, 15, x4, x3 + + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.selectieq 8, x4, x3, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selectieq x8, x4, x3 + +# CHECK-IMM: :[[@LINE+1]]:26: error: immediate must be an integer in the range [-16, 15] +qc.selectieq x8, x4, x3, 17 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selectieq x8, x4, x3, 12 + + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.selectieqi 9, 11, x3, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selectieqi x9, 11, x3 + +# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15] +qc.selectieqi x9, 16, x3, 12 + +# CHECK-IMM: :[[@LINE+1]]:27: error: immediate must be an integer in the range [-16, 15] +qc.selectieqi x9, 11, x3, 18 + +# CHECK-EXT: :[[@LINE+1]]:1: error: 
instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selectieqi x9, 11, x3, 12 + + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.selectiieq 9, x3, 11, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selectiieq x9, x3, 11 + +# CHECK-IMM: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.selectiieq x9, x3, 16, 12 + +# CHECK-IMM: :[[@LINE+1]]:27: error: immediate must be an integer in the range [-16, 15] +qc.selectiieq x9, x3, 11, 17 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selectiieq x9, x3, 11, 12 + + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.selectiine 8, x3, 10, 11 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selectiine x8, x3, 10 + +# CHECK-IMM: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.selectiine x8, x3, 16, 11 + +# CHECK-IMM: :[[@LINE+1]]:27: error: immediate must be an integer in the range [-16, 15] +qc.selectiine x8, x3, 12, 18 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selectiine x8, x3, 10, 11 + + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.selectine 8, x3, x4, 11 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selectine x8, x3, x4 + +# CHECK-IMM: :[[@LINE+1]]:26: error: immediate must be an integer in the range [-16, 15] +qc.selectine x8, x3, x4, 16 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selectine x8, x3, x4, 11 + + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.selectinei 8, 11, x3, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selectinei x8, 11, x3 + +# CHECK-IMM: :[[@LINE+1]]:19: 
error: immediate must be an integer in the range [-16, 15] +qc.selectinei x8, 16, x3, 12 + +# CHECK-IMM: :[[@LINE+1]]:27: error: immediate must be an integer in the range [-16, 15] +qc.selectinei x8, 11, x3, 18 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selectinei x8, 11, x3, 12 + + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.selectnei 8, 11, x3, x5 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.selectnei x8, 11, x3 + +# CHECK-IMM: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15] +qc.selectnei x8, 16, x3, x5 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcics' (Qualcomm uC Conditional Select Extension) +qc.selectnei x8, 11, x3, x5 + diff --git a/llvm/test/MC/RISCV/xqcics-valid.s b/llvm/test/MC/RISCV/xqcics-valid.s new file mode 100644 index 0000000000000..eb888a6222693 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcics-valid.s @@ -0,0 +1,147 @@ +# Xqcics - Qualcomm uC Conditional Select Extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcics < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcics -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcics < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcics --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.selecteqi s1, 5, tp, gp +# CHECK-ENC: encoding: [0xdb,0xa4,0x42,0x1c] +qc.selecteqi x9, 5, x4, x3 + +# CHECK-INST: qc.selecteqi s1, -16, tp, gp +# 
CHECK-ENC: encoding: [0xdb,0x24,0x48,0x1c] +qc.selecteqi x9, -16, x4, x3 + +# CHECK-INST: qc.selecteqi s1, 15, tp, gp +# CHECK-ENC: encoding: [0xdb,0xa4,0x47,0x1c] +qc.selecteqi x9, 15, x4, x3 + + +# CHECK-INST: qc.selectieq s0, tp, gp, 12 +# CHECK-ENC: encoding: [0x5b,0x24,0x32,0x62] +qc.selectieq x8, x4, x3, 12 + +# CHECK-INST: qc.selectieq s0, tp, gp, -16 +# CHECK-ENC: encoding: [0x5b,0x24,0x32,0x82] +qc.selectieq x8, x4, x3, -16 + +# CHECK-INST: qc.selectieq s0, tp, gp, 15 +# CHECK-ENC: encoding: [0x5b,0x24,0x32,0x7a] +qc.selectieq x8, x4, x3, 15 + + +# CHECK-INST: qc.selectieqi s1, 11, gp, 12 +# CHECK-ENC: encoding: [0xdb,0xa4,0x35,0x66] +qc.selectieqi x9, 11, x3, 12 + +# CHECK-INST: qc.selectieqi s1, -16, gp, 12 +# CHECK-ENC: encoding: [0xdb,0x24,0x38,0x66] +qc.selectieqi x9, -16, x3, 12 + +# CHECK-INST: qc.selectieqi s1, 15, gp, 12 +# CHECK-ENC: encoding: [0xdb,0xa4,0x37,0x66] +qc.selectieqi x9, 15, x3, 12 + +# CHECK-INST: qc.selectieqi s1, 11, gp, -16 +# CHECK-ENC: encoding: [0xdb,0xa4,0x35,0x86] +qc.selectieqi x9, 11, x3, -16 + +# CHECK-INST: qc.selectieqi s1, 11, gp, 15 +# CHECK-ENC: encoding: [0xdb,0xa4,0x35,0x7e] +qc.selectieqi x9, 11, x3, 15 + + +# CHECK-INST: qc.selectiieq s1, gp, 11, 12 +# CHECK-ENC: encoding: [0xdb,0xa4,0xb1,0x60] +qc.selectiieq x9, x3, 11, 12 + +# CHECK-INST: qc.selectiieq s1, gp, -16, 12 +# CHECK-ENC: encoding: [0xdb,0xa4,0x01,0x61] +qc.selectiieq x9, x3, -16, 12 + +# CHECK-INST: qc.selectiieq s1, gp, 15, 12 +# CHECK-ENC: encoding: [0xdb,0xa4,0xf1,0x60] +qc.selectiieq x9, x3, 15, 12 + +# CHECK-INST: qc.selectiieq s1, gp, 11, -16 +# CHECK-ENC: encoding: [0xdb,0xa4,0xb1,0x80] +qc.selectiieq x9, x3, 11, -16 + +# CHECK-INST: qc.selectiieq s1, gp, 11, 15 +# CHECK-ENC: encoding: [0xdb,0xa4,0xb1,0x78] +qc.selectiieq x9, x3, 11, 15 + + +# CHECK-INST: qc.selectiine s0, gp, 10, 11 +# CHECK-ENC: encoding: [0x5b,0xb4,0xa1,0x58] +qc.selectiine x8, x3, 10, 11 + +# CHECK-INST: qc.selectiine s0, gp, -16, 11 +# CHECK-ENC: encoding: 
[0x5b,0xb4,0x01,0x59] +qc.selectiine x8, x3, -16, 11 + +# CHECK-INST: qc.selectiine s0, gp, 15, 11 +# CHECK-ENC: encoding: [0x5b,0xb4,0xf1,0x58] +qc.selectiine x8, x3, 15, 11 + +# CHECK-INST: qc.selectiine s0, gp, 10, -16 +# CHECK-ENC: encoding: [0x5b,0xb4,0xa1,0x80] +qc.selectiine x8, x3, 10, -16 + +# CHECK-INST: qc.selectiine s0, gp, 10, 15 +# CHECK-ENC: encoding: [0x5b,0xb4,0xa1,0x78] +qc.selectiine x8, x3, 10, 15 + + +# CHECK-INST: qc.selectine s0, gp, tp, 11 +# CHECK-ENC: encoding: [0x5b,0xb4,0x41,0x5a] +qc.selectine x8, x3, x4, 11 + +# CHECK-INST: qc.selectine s0, gp, tp, -16 +# CHECK-ENC: encoding: [0x5b,0xb4,0x41,0x82] +qc.selectine x8, x3, x4, -16 + +# CHECK-INST: qc.selectine s0, gp, tp, 15 +# CHECK-ENC: encoding: [0x5b,0xb4,0x41,0x7a] +qc.selectine x8, x3, x4, 15 + + +# CHECK-INST: qc.selectinei s0, 11, gp, 12 +# CHECK-ENC: encoding: [0x5b,0xb4,0x35,0x66] +qc.selectinei x8, 11, x3, 12 + +# CHECK-INST: qc.selectinei s0, -16, gp, 12 +# CHECK-ENC: encoding: [0x5b,0x34,0x38,0x66] +qc.selectinei x8, -16, x3, 12 + +# CHECK-INST: qc.selectinei s0, 15, gp, 12 +# CHECK-ENC: encoding: [0x5b,0xb4,0x37,0x66] +qc.selectinei x8, 15, x3, 12 + +# CHECK-INST: qc.selectinei s0, 11, gp, -16 +# CHECK-ENC: encoding: [0x5b,0xb4,0x35,0x86] +qc.selectinei x8, 11, x3, -16 + +# CHECK-INST: qc.selectinei s0, 11, gp, 15 +# CHECK-ENC: encoding: [0x5b,0xb4,0x35,0x7e] +qc.selectinei x8, 11, x3, 15 + + +# CHECK-INST: qc.selectnei s0, 11, gp, t0 +# CHECK-ENC: encoding: [0x5b,0xb4,0x35,0x2c] +qc.selectnei x8, 11, x3, x5 + +# CHECK-INST: qc.selectnei s0, -16, gp, t0 +# CHECK-ENC: encoding: [0x5b,0x34,0x38,0x2c] +qc.selectnei x8, -16, x3, x5 + +# CHECK-INST: qc.selectnei s0, 15, gp, t0 +# CHECK-ENC: encoding: [0x5b,0xb4,0x37,0x2c] +qc.selectnei x8, 15, x3, x5 + diff --git a/llvm/test/Transforms/GlobalOpt/evaluate-call-errors.ll b/llvm/test/Transforms/GlobalOpt/evaluate-call-errors.ll index 510edae69d7c4..853bbb562644c 100644 --- a/llvm/test/Transforms/GlobalOpt/evaluate-call-errors.ll +++ 
b/llvm/test/Transforms/GlobalOpt/evaluate-call-errors.ll @@ -3,8 +3,7 @@ ; REQUIRES: asserts ; RUN: opt -passes=globalopt,instcombine -S -debug-only=evaluator %s -o %t 2>&1 | FileCheck %s -; CHECK: Failed to fold bitcast call expr -; CHECK: Can not convert function argument +; CHECK: Signature mismatch. target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.12.0" diff --git a/llvm/test/Transforms/GlobalOpt/evaluate-constfold-call.ll b/llvm/test/Transforms/GlobalOpt/evaluate-constfold-call.ll index 83d8bc6e7c24f..6400988e7cc09 100644 --- a/llvm/test/Transforms/GlobalOpt/evaluate-constfold-call.ll +++ b/llvm/test/Transforms/GlobalOpt/evaluate-constfold-call.ll @@ -1,12 +1,9 @@ -; Check if we can evaluate a bitcasted call to a function which is constant folded. -; Evaluator folds call to fmodf, replacing it with constant value in case both operands -; are known at compile time. +; Check that we do not try to evaluate function calls with signature +; mismatches. ; RUN: opt -passes=globalopt,instcombine %s -S -o - | FileCheck %s -; CHECK: @_q = dso_local local_unnamed_addr global %struct.Q { i32 1066527622 } -; CHECK: define dso_local i32 @main -; CHECK-NEXT: %[[V:.+]] = load i32, ptr @_q -; CHECK-NEXT: ret i32 %[[V]] +; CHECK: @_q = dso_local global %struct.Q zeroinitializer +; CHECK: @llvm.global_ctors source_filename = "main.cpp" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/GlobalOpt/evaluate-ret-void-mismatch.ll b/llvm/test/Transforms/GlobalOpt/evaluate-ret-void-mismatch.ll new file mode 100644 index 0000000000000..4cc4bf9cedbaf --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/evaluate-ret-void-mismatch.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=globalopt < %s | FileCheck %s + +; Don't evaluate call with return value type mismatch. 
+ +@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__cxx_global_var_init, ptr null }] + +define void @__cxa_guard_acquire() { +; CHECK-LABEL: define void @__cxa_guard_acquire() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret void +; +entry: + ret void +} + +define void @__cxx_global_var_init() { +; CHECK-LABEL: define void @__cxx_global_var_init() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @__cxa_guard_acquire() +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[RES]], 0 +; CHECK-NEXT: ret void +; + %res = call i32 @__cxa_guard_acquire() + %tobool.not = icmp eq i32 %res, 0 + ret void +} diff --git a/llvm/test/Transforms/IRCE/low-iterations.ll b/llvm/test/Transforms/IRCE/low-iterations.ll index 071ab4d015685..e044c455fe6e2 100644 --- a/llvm/test/Transforms/IRCE/low-iterations.ll +++ b/llvm/test/Transforms/IRCE/low-iterations.ll @@ -1,5 +1,5 @@ -; RUN: opt -verify-loop-info -irce-print-changed-loops -passes=irce -irce-min-runtime-iterations=3 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NO -; RUN: opt -verify-loop-info -irce-print-changed-loops -passes=irce -irce-min-runtime-iterations=0 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-YES +; RUN: opt -verify-loop-info -irce-print-changed-loops -passes=irce -irce-min-eliminated-checks=3 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NO +; RUN: opt -verify-loop-info -irce-print-changed-loops -passes=irce -irce-min-eliminated-checks=0 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-YES ; CHECK-YES: constrained Loop ; CHECK-NO-NOT: constrained Loop diff --git a/llvm/test/Transforms/IRCE/profitability.ll b/llvm/test/Transforms/IRCE/profitability.ll new file mode 100644 index 0000000000000..04cea2cfce2fd --- /dev/null +++ b/llvm/test/Transforms/IRCE/profitability.ll @@ -0,0 +1,38 @@ +; RUN: opt -S -verify-loop-info -irce-print-changed-loops -passes=irce -irce-min-eliminated-checks=51 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NO +; RUN: opt -S 
-verify-loop-info -irce-print-changed-loops -passes=irce -irce-min-eliminated-checks=50 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-YES + +; CHECK-YES: constrained Loop +; CHECK-NO-NOT: constrained Loop + +declare void @bar(i32) + +define i32 @foo(ptr %arr_a, ptr %a_len_ptr, i32 %n) { +entry: + %len.a = load i32, ptr %a_len_ptr, !range !0 + %first.itr.check = icmp sgt i32 %n, 0 + br i1 %first.itr.check, label %loop, label %exit, !prof !1 + +loop: + %idx = phi i32 [ 0, %entry ] , [ %idx.next, %backedge ] + %abc.a = icmp slt i32 %idx, %len.a + br i1 %abc.a, label %in.bounds.a, label %backedge, !prof !2 + +in.bounds.a: + %addr.a = getelementptr i32, ptr %arr_a, i32 %idx + %val = load i32, ptr %addr.a + call void @bar(i32 %val) + br label %backedge + +backedge: + %idx.next = add i32 %idx, 1 + %next = icmp slt i32 %idx.next, %n + br i1 %next, label %loop, label %exit, !prof !3 + +exit: + ret i32 0 +} + +!0 = !{i32 0, i32 2147483647} +!1 = !{!"branch_weights", i32 1024, i32 1} +!2 = !{!"branch_weights", i32 1, i32 1} +!3 = !{!"branch_weights", i32 99, i32 1} diff --git a/llvm/test/Transforms/Inline/always-inline-bfi.ll b/llvm/test/Transforms/Inline/always-inline-bfi.ll new file mode 100644 index 0000000000000..0971bbeec842b --- /dev/null +++ b/llvm/test/Transforms/Inline/always-inline-bfi.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes="function(require,loop(loop-unroll-full)),always-inline" < %s | FileCheck %s + +; Make sure this does not crash. 
+ +define void @f_116_0(ptr %p) alwaysinline { +; CHECK-LABEL: define void @f_116_0( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, ptr [[P]], align 1 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i16 [[DOTPRE]], 1 +; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_BODY:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; +entry: + %.pre = load i16, ptr %p, align 1 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %cmp3 = icmp ult i16 %.pre, 1 + br i1 %cmp3, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + br label %for.cond +} + +define void @f_321_0(ptr %p) alwaysinline { +; CHECK-LABEL: define void @f_321_0( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: br i1 false, label %[[CRIT_EDGE:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[CRIT_EDGE]]: +; CHECK-NEXT: unreachable +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[DOTPRE_I:%.*]] = load i16, ptr [[P]], align 1 +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: [[CMP3_I:%.*]] = icmp ult i16 [[DOTPRE_I]], 1 +; CHECK-NEXT: br i1 [[CMP3_I]], label %[[FOR_BODY_I:.*]], label %[[F_116_0_EXIT:.*]] +; CHECK: [[FOR_BODY_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I]] +; CHECK: [[F_116_0_EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %crit_edge, %entry + br i1 false, label %crit_edge, label %for.cond.cleanup + +crit_edge: ; preds = %for.cond + br label %for.cond + +for.cond.cleanup: ; preds = %for.cond + call void @f_116_0(ptr %p) + ret void +} + +define i16 @main(ptr %p) { +; CHECK-LABEL: define i16 @main( +; CHECK-SAME: ptr 
[[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[IF_ELSE:.*:]] +; CHECK-NEXT: [[DOTPRE_I_I:%.*]] = load i16, ptr [[P]], align 1 +; CHECK-NEXT: br label %[[FOR_COND_I_I:.*]] +; CHECK: [[FOR_COND_I_I]]: +; CHECK-NEXT: [[CMP3_I_I:%.*]] = icmp ult i16 [[DOTPRE_I_I]], 1 +; CHECK-NEXT: br i1 [[CMP3_I_I]], label %[[FOR_BODY_I_I:.*]], label %[[F_321_0_EXIT:.*]] +; CHECK: [[FOR_BODY_I_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I_I]] +; CHECK: [[F_321_0_EXIT]]: +; CHECK-NEXT: br label %[[FOR_COND115:.*]] +; CHECK: [[FOR_COND115]]: +; CHECK-NEXT: br label %[[FOR_COND115]] +; +entry: + br label %for.cond + +for.cond: ; preds = %for.cond, %entry + br label %for.cond + +if.else: ; No predecessors! + call void @f_321_0(ptr %p) + br label %for.cond115 + +for.cond115: ; preds = %for.cond115, %if.else + br label %for.cond115 +} diff --git a/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll index 141b44cbbb7a1..78329faf34172 100644 --- a/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll +++ b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll @@ -456,3 +456,65 @@ if.end: %ret = call <2 x float> @llvm.fabs.v2f32(<2 x float> %value) ret <2 x float> %ret } + +define i1 @pr118257(half %v0, half %v1) { +; CHECK-LABEL: define i1 @pr118257( +; CHECK-SAME: half [[V0:%.*]], half [[V1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = fcmp une half [[V1]], 0xH0000 +; CHECK-NEXT: [[CAST0:%.*]] = bitcast half [[V0]] to i16 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i16 [[CAST0]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[CAST1:%.*]] = bitcast half [[V1]] to i16 +; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i16 [[CAST1]], 0 +; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK: if.end: +; 
CHECK-NEXT: ret i1 false +; +entry: + %cmp1 = fcmp une half %v1, 0.000000e+00 + %cast0 = bitcast half %v0 to i16 + %cmp2 = icmp slt i16 %cast0, 0 + %or.cond = or i1 %cmp1, %cmp2 + br i1 %or.cond, label %if.end, label %if.else + +if.else: + %cast1 = bitcast half %v1 to i16 + %cmp3 = icmp slt i16 %cast1, 0 + ret i1 %cmp3 + +if.end: + ret i1 false +} + +define i1 @pr118257_is_fpclass(half %v0, half %v1) { +; CHECK-LABEL: define i1 @pr118257_is_fpclass( +; CHECK-SAME: half [[V0:%.*]], half [[V1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = fcmp une half [[V1]], 0xH0000 +; CHECK-NEXT: [[CMP2:%.*]] = call i1 @llvm.is.fpclass.f16(half [[V0]], i32 35) +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[CAST1:%.*]] = bitcast half [[V1]] to i16 +; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i16 [[CAST1]], 0 +; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK: if.end: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp1 = fcmp une half %v1, 0.000000e+00 + %cmp2 = call i1 @llvm.is.fpclass.half(half %v0, i32 35) + %or.cond = or i1 %cmp1, %cmp2 + br i1 %or.cond, label %if.end, label %if.else + +if.else: + %cast1 = bitcast half %v1 to i16 + %cmp3 = icmp slt i16 %cast1, 0 + ret i1 %cmp3 + +if.end: + ret i1 false +} diff --git a/llvm/test/Transforms/InstCombine/stdio-custom-dl.ll b/llvm/test/Transforms/InstCombine/stdio-custom-dl.ll index cc06be7e759d0..f96b1f7a13810 100644 --- a/llvm/test/Transforms/InstCombine/stdio-custom-dl.ll +++ b/llvm/test/Transforms/InstCombine/stdio-custom-dl.ll @@ -8,11 +8,12 @@ target datalayout = "e-m:o-p:40:64:64:32-i64:64-f80:128-n8:16:32:64-S128" @.str.1 = private unnamed_addr constant [2 x i8] c"w\00", align 1 @.str.2 = private unnamed_addr constant [4 x i8] c"str\00", align 1 -; Check fwrite is generated with arguments of ptr size, not index size +;; Check fwrite is generated with arguments of index size, not ptr size + define internal void 
@fputs_test_custom_dl() { ; CHECK-LABEL: @fputs_test_custom_dl( ; CHECK-NEXT: [[CALL:%.*]] = call ptr @fopen(ptr nonnull @.str, ptr nonnull @.str.1) -; CHECK-NEXT: [[TMP1:%.*]] = call i40 @fwrite(ptr nonnull @.str.2, i40 3, i40 1, ptr [[CALL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @fwrite(ptr nonnull @.str.2, i32 3, i32 1, ptr %call) ; CHECK-NEXT: ret void ; %call = call ptr @fopen(ptr @.str, ptr @.str.1) diff --git a/llvm/test/Transforms/InstCombine/strcpy-nonzero-as.ll b/llvm/test/Transforms/InstCombine/strcpy-nonzero-as.ll index 86b49ffdf04b2..9bde0a3ac3fde 100644 --- a/llvm/test/Transforms/InstCombine/strcpy-nonzero-as.ll +++ b/llvm/test/Transforms/InstCombine/strcpy-nonzero-as.ll @@ -52,7 +52,7 @@ define void @test_strncpy_to_memcpy(ptr addrspace(200) %dst) addrspace(200) noun ; CHECK-LABEL: define {{[^@]+}}@test_strncpy_to_memcpy ; CHECK-SAME: (ptr addrspace(200) [[DST:%.*]]) addrspace(200) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call addrspace(200) void @llvm.memcpy.p200.p200.i128(ptr addrspace(200) noundef align 1 dereferenceable(17) [[DST]], ptr addrspace(200) noundef align 1 dereferenceable(17) @str, i128 17, i1 false) +; CHECK-NEXT: call addrspace(200) void @llvm.memcpy.p200.p200.i64(ptr addrspace(200) noundef align 1 dereferenceable(17) [[DST]], ptr addrspace(200) noundef align 1 dereferenceable(17) @str, i64 17, i1 false) ; CHECK-NEXT: ret void ; entry: @@ -64,7 +64,7 @@ define void @test_stpncpy_to_memcpy(ptr addrspace(200) %dst) addrspace(200) noun ; CHECK-LABEL: define {{[^@]+}}@test_stpncpy_to_memcpy ; CHECK-SAME: (ptr addrspace(200) [[DST:%.*]]) addrspace(200) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call addrspace(200) void @llvm.memcpy.p200.p200.i128(ptr addrspace(200) noundef align 1 dereferenceable(17) [[DST]], ptr addrspace(200) noundef align 1 dereferenceable(17) @str, i128 17, i1 false) +; CHECK-NEXT: call addrspace(200) void @llvm.memcpy.p200.p200.i64(ptr addrspace(200) noundef align 1 dereferenceable(17) [[DST]], ptr 
addrspace(200) noundef align 1 dereferenceable(17) @str, i64 17, i1 false) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index 0a7948d89a6b4..d81cfbf08ec93 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -15,7 +15,7 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-EMPTY: ; VPLANS-NEXT: ir-bb: ; VPLANS-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 umax %n) -; VPLANS-NEXT: No successors +; VPLANS-NEXT: Successor(s): vector.ph ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: ; VPLANS-NEXT: EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? TC - VF : 0 vp<[[TC]]> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index c3f753af6fb53..1ee6083eb59a5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -13,6 +13,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%N> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll index 9739611a8b6e4..3ef99ff496a68 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll @@ -16,6 +16,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in [[VTC:.*]] = vector-trip-count ; CHECK-NEXT: Live-in [[OTC:.*]] = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; 
CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -59,6 +62,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in [[VTC:.*]] = vector-trip-count ; CHECK-NEXT: Live-in [[OTC:.*]] = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll index 050bc7c346bf2..8ac46fe7687d2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -14,6 +14,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -56,6 +59,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -103,6 +109,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -145,6 +154,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: 
Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -191,6 +203,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -233,6 +248,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index 7a67f37c72662..648f6e874abbe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -12,6 +12,9 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -54,6 +57,9 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git 
a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll index a0696b3204dbd..cd1d18aad8361 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll @@ -14,6 +14,9 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<%N> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -77,6 +80,9 @@ define void @safe_dep(ptr %p) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<512> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index fcc01e0ba2e2d..eb60c24393df9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -61,7 +61,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -152,9 +152,40 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) -; 
CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 +; CHECK-NEXT: IR %4 = add i32 %n, -1 +; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 +; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) +; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0 +; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 +; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result +; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 +; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow +; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 +; CHECK-NEXT: IR %10 = or i1 %8, %9 +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %12 = mul i64 %11, 4 +; CHECK-NEXT: IR %13 = mul i64 %12, 4 +; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 +; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %16 = mul i64 %15, 4 +; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 +; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf +; CHECK-NEXT: IR %ind.end = sub i64 %0, %n.vec +; CHECK-NEXT: IR %.cast = trunc i64 %n.vec to i32 +; CHECK-NEXT: IR %ind.end3 = sub i32 %n, %.cast +; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -182,19 +213,19 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%0> -; CHECK-NEXT: EMIT 
vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n> +; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = resume-phi ir<%ind.end>, ir<%0> +; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n> ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from ir-bb -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME_1]]> from ir-bb) +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME_2]]>.1 from ir-bb) ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK: LV: Loop does not require scalar epilogue ; @@ -277,7 +308,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -368,9 +399,40 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 +; CHECK-NEXT: IR %4 = add i32 %n, -1 +; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 +; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) +; CHECK-NEXT: IR %mul.result 
= extractvalue { i32, i1 } %mul, 0 +; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 +; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result +; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 +; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow +; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 +; CHECK-NEXT: IR %10 = or i1 %8, %9 +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %12 = mul i64 %11, 4 +; CHECK-NEXT: IR %13 = mul i64 %12, 4 +; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 +; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %16 = mul i64 %15, 4 +; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 +; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf +; CHECK-NEXT: IR %ind.end = sub i64 %0, %n.vec +; CHECK-NEXT: IR %.cast = trunc i64 %n.vec to i32 +; CHECK-NEXT: IR %ind.end3 = sub i32 %n, %.cast +; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -398,19 +460,19 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%0> ; CHECK-NEXT: EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n> ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from ir-bb -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb +; 
CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from ir-bb) +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb) ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK: LV: Loop does not require scalar epilogue ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index a474d926a6303..e7eb5778ffb93 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -29,7 +29,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count ; IF-EVL-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count ; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP: vector.ph: +; IF-EVL-OUTLOOP-NEXT: ir-bb: +; IF-EVL-OUTLOOP-NEXT: Successor(s): vector.ph +; IF-EVL-OUTLOOP-EMPTY: +; IF-EVL-OUTLOOP-NEXT: vector.ph: ; IF-EVL-OUTLOOP-NEXT: Successor(s): vector loop ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: vector loop: { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll index 4983ea0f7a738..53c9fb0c604da 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll @@ -11,7 +11,18 @@ ; IF-EVL-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count ; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - ; IF-EVL: vector.ph: + ; IF-EVL: ir-bb: + ; IF-EVL-NEXT: Successor(s): ir-bb, ir-bb + + ; IF-EVL: ir-bb: + ; IF-EVL-NEXT: IR %4 = call i64 @llvm.vscale.i64() + ; 
IF-EVL-NEXT: IR %5 = mul i64 %4, 4 + ; IF-EVL-NEXT: IR %6 = sub i64 %5, 1 + ; IF-EVL-NEXT: IR %n.rnd.up = add i64 %N, %6 + ; IF-EVL-NEXT: IR %n.mod.vf = urem i64 %n.rnd.up, %5 + ; IF-EVL-NEXT: IR %n.vec = sub i64 %n.rnd.up, %n.mod.vf + ; IF-EVL-NEXT: IR %7 = call i64 @llvm.vscale.i64() + ; IF-EVL-NEXT: IR %8 = mul i64 %7, 4 ; IF-EVL-NEXT: Successor(s): vector loop ; IF-EVL: vector loop: { diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index d2d2063fd9058..0eab97b0cc735 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -9,6 +9,9 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -81,6 +84,9 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -165,6 +171,9 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<4098> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: WIDEN ir<%for.x.next> = mul ir<%x>, ir<2> ; CHECK-NEXT: Successor(s): vector loop @@ -237,6 +246,9 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) { 
; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<4098> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 3d8403b1eb9c0..d59573e7f4678 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -15,6 +15,9 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<20001> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -121,6 +124,9 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<20001> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32 ; CHECK-NEXT: Successor(s): vector loop @@ -205,6 +211,9 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<20001> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32 ; CHECK-NEXT: Successor(s): vector loop @@ -282,6 +291,9 @@ define void 
@sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<20001> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -397,7 +409,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 smax (1 + (sext i8 %y to i32))) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32 @@ -487,6 +499,9 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<3> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll index 7170147438327..ee87636eb0316 100644 --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -43,6 +43,9 @@ for.end: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<14> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll index 53a0a308c79a9..bb17580ac4d11 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll +++ 
b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll @@ -11,7 +11,7 @@ ; DBG-EMPTY: ; DBG-NEXT: ir-bb: ; DBG-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1000 + (-1 * %start)) -; DBG-NEXT: No successors +; DBG-NEXT: Successor(s): vector.ph ; DBG-EMPTY: ; DBG-NEXT: vector.ph: ; DBG-NEXT: Successor(s): vector loop @@ -72,6 +72,9 @@ declare i32 @llvm.smin.i32(i32, i32) ; DBG-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; DBG-NEXT: Live-in ir<1000> = original trip-count ; DBG-EMPTY: +; DBG-NEXT: ir-bb: +; DBG-NEXT: Successor(s): vector.ph +; DBG-EMPTY: ; DBG-NEXT: vector.ph: ; DBG-NEXT: Successor(s): vector loop ; DBG-EMPTY: @@ -192,7 +195,7 @@ exit: ; DBG-EMPTY: ; DBG-NEXT: ir-bb: ; DBG-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 (1 smax %n) to i64) -; DBG-NEXT: No successors +; DBG-NEXT: Successor(s): vector.ph ; DBG-EMPTY: ; DBG-NEXT: vector.ph: ; DBG-NEXT: SCALAR-CAST vp<[[CAST:%.+]]> = trunc ir<1> to i32 diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-blend.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-blend.ll new file mode 100644 index 0000000000000..89d7821cac9d3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-blend.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK + +define i32 @select_icmp_switch(i32 %n, i32 %case, ptr %a, ptr %b) { +; CHECK-LABEL: define i32 @select_icmp_switch( +; CHECK-SAME: i32 [[N:%.*]], i32 [[CASE:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: 
[[INDVARS:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[RDX_PHI:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[RDX_PHI_NEXT:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: switch i32 [[CASE]], label %[[SW_BB0:.*]] [ +; CHECK-NEXT: i32 0, label %[[SW_BB0]] +; CHECK-NEXT: i32 1, label %[[SW_BB1:.*]] +; CHECK-NEXT: ] +; CHECK: [[SW_BB0]]: +; CHECK-NEXT: [[A_ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS]] +; CHECK-NEXT: [[A_VALUE:%.*]] = load i8, ptr [[A_ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP_A:%.*]] = icmp eq i8 [[A_VALUE]], -1 +; CHECK-NEXT: [[TRUNC_BB0:%.*]] = trunc i64 [[INDVARS]] to i32 +; CHECK-NEXT: [[SELECT_BB0:%.*]] = select i1 [[CMP_A]], i32 [[RDX_PHI]], i32 [[TRUNC_BB0]] +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[SW_BB1]]: +; CHECK-NEXT: [[B_ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDVARS]] +; CHECK-NEXT: [[B_VALUE:%.*]] = load i8, ptr [[B_ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP_B:%.*]] = icmp eq i8 [[B_VALUE]], -1 +; CHECK-NEXT: [[TRUNC_BB1:%.*]] = trunc i64 [[INDVARS]] to i32 +; CHECK-NEXT: [[SELECT_BB1:%.*]] = select i1 [[CMP_B]], i32 [[RDX_PHI]], i32 [[TRUNC_BB1]] +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[RDX_PHI_NEXT]] = phi i32 [ [[SELECT_BB0]], %[[SW_BB0]] ], [ [[SELECT_BB1]], %[[SW_BB1]] ] +; CHECK-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[RDX_PHI_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_PHI_NEXT]], %[[FOR_INC]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_PHI_NEXT_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[SELECT_LCSSA]] +; +entry: + %cmp.sgt = icmp sgt i32 %n, 0 + br i1 
%cmp.sgt, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %for.body.preheader ], [ %indvars.next, %for.inc ] + %rdx.phi = phi i32 [ 0, %for.body.preheader ], [ %rdx.phi.next, %for.inc ] + switch i32 %case, label %sw.bb0 [ + i32 0, label %sw.bb0 + i32 1, label %sw.bb1 + ] + +sw.bb0: + %a.arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars + %a.value = load i8, ptr %a.arrayidx, align 1 + %cmp.a = icmp eq i8 %a.value, -1 + %trunc.bb0 = trunc i64 %indvars to i32 + %select.bb0 = select i1 %cmp.a, i32 %rdx.phi, i32 %trunc.bb0 + br label %for.inc + +sw.bb1: + %b.arrayidx = getelementptr inbounds i8, ptr %b, i64 %indvars + %b.value = load i8, ptr %b.arrayidx, align 1 + %cmp.b = icmp eq i8 %b.value, -1 + %trunc.bb1 = trunc i64 %indvars to i32 + %select.bb1 = select i1 %cmp.b, i32 %rdx.phi, i32 %trunc.bb1 + br label %for.inc + +for.inc: + %rdx.phi.next = phi i32 [ %select.bb0, %sw.bb0 ], [ %select.bb1, %sw.bb1 ] + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + %select.lcssa = phi i32 [ %rdx.phi.next, %for.inc ], [ 0, %entry ] + ret i32 %select.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll index d3582ae16d1c1..98dc0558489ad 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll @@ -5,10 +5,42 @@ define i64 @select_icmp_nuw_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_nuw_nsw( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label 
%[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -17,9 +49,9 @@ define i64 @select_icmp_nuw_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -46,10 +78,42 @@ define i64 @select_icmp_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_nsw( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], 
label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[II]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label 
%[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -58,9 +122,9 @@ define i64 @select_icmp_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -164,3 +228,11 @@ for.body: ; preds = %entry, %for.body exit: ; preds = %for.body ret i64 %cond } +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index 2eb63db2b0247..5352be9379783 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -1,42 +1,220 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 -; This test can theoretically be vectorized without a runtime-check, by -; pattern-matching on the constructs that are introduced by IndVarSimplify. -; We can check two things: -; %1 = trunc i64 %iv to i32 -; This indicates that the %iv is truncated to i32. 
We can then check the loop -; guard is a signed i32: -; %cmp.sgt = icmp sgt i32 %n, 0 -; and successfully vectorize the case without a runtime-check. +; About the truncated test cases, the range analysis of induction variable is +; used to ensure the induction variable is always greater than the sentinel +; value. The case is vectorizable if the truncated induction variable is +; monotonic increasing, and not equal to the sentinel. define i32 @select_icmp_const_truncated_iv_widened_exit(ptr %a, i32 %n) { -; CHECK-LABEL: define i32 @select_icmp_const_truncated_iv_widened_exit( -; CHECK-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]],
%[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RDX_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_widened_exit( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 331 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP7]], 3 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP8]], i32 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ 
[[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_widened_exit( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], 
splat (i32 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD4]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD5]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD6]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX7]], <4 x i32> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], -2147483648 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 331 +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP16]], 3 +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP17]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; 
CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_widened_exit( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-VF1IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = 
phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 0 +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i32 [[TMP4]], 1 +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 2 +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], 3 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sgt i64 [[TMP13]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP14]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sgt i64 [[TMP15]], 3 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp sgt i64 [[TMP16]], 3 +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i32 [[TMP5]], i32 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i32 [[TMP7]], i32 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP20]], i32 [[TMP8]], i32 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
CHECK-VF1IC4-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP22]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX]], i32 [[TMP23]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX4]], i32 [[TMP24]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 331 +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP26]], 3 +; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP27]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], 
label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[RDX_LCSSA]] ; entry: %cmp.sgt = icmp sgt i32 %n, 0 @@ -63,33 +241,183 @@ exit: ; preds = %for.body, %entry ret i32 %rdx.lcssa } -; This test can theoretically be vectorized without a runtime-check, by -; pattern-matching on the constructs that are introduced by IndVarSimplify. -; We can check two things: -; %1 = trunc i64 %iv to i32 -; This indicates that the %iv is truncated to i32. We can then check the loop -; exit condition, which compares to a constant that fits within i32: -; %exitcond.not = icmp eq i64 %inc, 20000 -; and successfully vectorize the case without a runtime-check. +; Without loop guard, the range analysis is also able to base on the constant +; trip count. 
define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { -; CHECK-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( -; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; 
CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 331 +; CHECK-VF4IC1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP7]], 3 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP8]], i32 [[RDX]] 
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; 
CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD4]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD5]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD6]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i32> 
@llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX7]], <4 x i32> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], -2147483648 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 331 +; CHECK-VF4IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP16]], 3 +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP17]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 
@select_icmp_const_truncated_iv_const_exit( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 0 +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i32 [[TMP4]], 1 +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 2 +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], 3 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; 
CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sgt i64 [[TMP13]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP14]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp sgt i64 [[TMP15]], 3 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp sgt i64 [[TMP16]], 3 +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i32 [[TMP5]], i32 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i32 [[TMP7]], i32 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP20]], i32 [[TMP8]], i32 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 +; CHECK-VF1IC4-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP22]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX]], i32 [[TMP23]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX4]], i32 [[TMP24]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 331 +; CHECK-VF1IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] 
= phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP26]], 3 +; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP27]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; entry: br label %for.body @@ -113,24 +441,180 @@ exit: ; preds = %for.body ; Without loop guard, the maximum constant trip count that can be vectorized is ; the signed maximum value of reduction type. 
define i32 @select_fcmp_max_valid_const_ub(ptr %a) { -; CHECK-LABEL: define i32 @select_fcmp_max_valid_const_ub( -; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483648 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_fcmp_max_valid_const_ub( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; 
CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 -1 +; CHECK-VF4IC1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP7]], 0.000000e+00 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = 
select i1 [[CMP]], i32 [[TMP8]], i32 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483648 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_fcmp_max_valid_const_ub( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds 
float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD4]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD5]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD6]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], 
<4 x i32> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX7]], <4 x i32> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], -2147483648 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 -1 +; CHECK-VF4IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP16]], 0.000000e+00 +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP17]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483648 +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; 
CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_fcmp_max_valid_const_ub( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ -2147483648, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 0 +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i32 [[TMP4]], 1 +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i32 [[TMP4]], 2 +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], 3 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP9]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load float, ptr 
[[TMP10]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP11]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP12]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp fast olt float [[TMP13]], 0.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp fast olt float [[TMP14]], 0.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = fcmp fast olt float [[TMP15]], 0.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = fcmp fast olt float [[TMP16]], 0.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP17]], i32 [[TMP5]], i32 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], i32 [[TMP7]], i32 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP20]], i32 [[TMP8]], i32 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF1IC4-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP22]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX]], i32 [[TMP23]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX4]], i32 [[TMP24]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 -1 +; CHECK-VF1IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] 
+; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP26]], 0.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP27]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483648 +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; entry: br label %for.body @@ -161,30 +645,80 @@ exit: ; preds = %for.body ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. 
define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { -; CHECK-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( -; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 -; CHECK-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RDX_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; 
CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; 
CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-VF1IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[RDX_LCSSA]] ; entry: %cmp.sgt = icmp sgt i64 %n, 0 @@ -215,31 +749,83 @@ exit: ; preds = %for.body, %entry ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { -; CHECK-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( -; CHECK-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; 
CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RDX_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: 
[[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF1IC4-SAME: 
ptr [[A:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-VF1IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[RDX_LCSSA]] ; entry: %cmp.not = icmp eq i32 %n, 0 @@ -274,24 +860,62 @@ exit: ; preds = %for.body, %entry ; Hence, the i32 will most certainly wrap and hit the sentinel value, and we ; cannot vectorize this case. 
define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { -; CHECK-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] +; 
CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294 +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 
331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294 +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; entry: br label %for.body @@ -315,27 +939,71 @@ exit: ; preds = %for.body ; Forbidding vectorization of the FindLastIV pattern involving a truncated ; induction variable in the absence of any loop guard. 
define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i32 %n) { -; CHECK-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i32 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add 
nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[COND_LCSSA]] ; entry: %wide.trip.count = zext i32 %n to i64 @@ -364,24 +1032,62 @@ exit: ; preds = %for.body ; vectorizer is unable to guarantee that the induction variable is monotonic ; increasing. 
define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { -; CHECK-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( -; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; 
CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] 
], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; entry: br label %for.body @@ -406,33 +1112,89 @@ exit: ; preds = %for.body ; instruction is smaller than the trip count type before extension, overflow ; could still occur. 
define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, ptr %b, i16 %start, i32 %n) { -; CHECK-LABEL: define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i16 [[START:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 -; CHECK-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i16 [[RDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i16 [[START:%.*]], i32 
[[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i16 [[RDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i16 [[START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[CMP9:%.*]] = icmp 
sgt i32 [[N]], 0 +; CHECK-VF4IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i16 [[RDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i16 [[START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-VF1IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], 
label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i16 [[RDX_0_LCSSA]] ; entry: %cmp9 = icmp sgt i32 %n, 0 @@ -460,3 +1222,31 @@ exit: ; preds = %for.body, %entry %rdx.0.lcssa = phi i16 [ %start, %entry ], [ %cond, %for.body ] ret i16 %rdx.0.lcssa } +;. 
+; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. +; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. +; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF1IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF1IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK-VF1IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VF1IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll index b989b8bbe5229..dc3f2a1773cf6 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -1,26 +1,187 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 define i64 @select_icmp_const_1(ptr %a, i64 %n) { -; CHECK-LABEL: define i64 @select_icmp_const_1( -; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ 3, %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP0]], 3 -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: 
br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_icmp_const_1( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; 
CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 3 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP7]], 3 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_icmp_const_1( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
%[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; 
CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD5]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD6]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP10]], <4 x i64> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 3 +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] 
= icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP16]], 3 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_icmp_const_1( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = 
phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i64 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP19]] = select i1 
[[TMP15]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP16]], i64 [[TMP17]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP18]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP19]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 3 +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP21]], 3 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br 
i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -41,23 +202,184 @@ exit: ; preds = %for.body } define i64 @select_icmp_const_2(ptr %a, i64 %n) { -; CHECK-LABEL: define i64 @select_icmp_const_2( -; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ 3, %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP0]], 3 -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[RDX]], i64 [[IV]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_icmp_const_2( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 3 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], 
%[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP7]], 3 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[RDX]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_icmp_const_2( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD5]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD6]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[VEC_PHI1]], <4 x i64> [[STEP_ADD]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[VEC_PHI2]], <4 x i64> 
[[STEP_ADD_2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[VEC_PHI3]], <4 x i64> [[STEP_ADD_3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP10]], <4 x i64> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 3 +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load i64, 
ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP16]], 3 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[RDX]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_icmp_const_2( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 
[[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i64 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[VEC_PHI]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[VEC_PHI1]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[VEC_PHI2]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP19]] = select i1 [[TMP15]], i64 [[VEC_PHI3]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP16]], i64 [[TMP17]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP18]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP19]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 
[[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 3 +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP21]], 3 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[RDX]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -78,23 +400,184 @@ exit: ; preds = %for.body } define i64 @select_icmp_const_3_variable_rdx_start(ptr %a, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 @select_icmp_const_3_variable_rdx_start( -; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi 
i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP0]], 3 -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_icmp_const_3_variable_rdx_start( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x 
i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 [[RDX_START]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP7]], 3 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], 
label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_icmp_const_3_variable_rdx_start( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; 
CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD5]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD6]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP10]], <4 
x i64> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 [[RDX_START]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP16]], 3 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret 
i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_icmp_const_3_variable_rdx_start( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] 
= load i64, ptr [[TMP6]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i64 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP19]] = select i1 [[TMP15]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP16]], i64 [[TMP17]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP18]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP19]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 [[RDX_START]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; 
CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp eq i64 [[TMP21]], 3 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -115,23 +598,184 @@ exit: ; preds = %for.body } define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { -; CHECK-LABEL: define i64 @select_fcmp_const_fast( -; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ 2, %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP2:%.*]] = fcmp fast ueq float [[TMP0]], 3.000000e+00 -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: 
[[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_fcmp_const_fast( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i64 
@llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 2 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = fcmp fast ueq float [[TMP7]], 3.000000e+00 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_fcmp_const_fast( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; 
CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP10]], <4 x i64> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = 
select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 2 +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = fcmp fast ueq float [[TMP16]], 3.000000e+00 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_fcmp_const_fast( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; 
CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[TMP0]], i64 [[VEC_PHI]] +; 
CHECK-VF1IC4-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP19]] = select i1 [[TMP15]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP16]], i64 [[TMP17]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP18]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP19]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 2 +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = fcmp fast ueq float [[TMP21]], 
3.000000e+00 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -152,23 +796,184 @@ exit: ; preds = %for.body } define i64 @select_fcmp_const(ptr %a, i64 %n) { -; CHECK-LABEL: define i64 @select_fcmp_const( -; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ 2, %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ueq float [[TMP0]], 3.000000e+00 -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_fcmp_const( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 2 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = fcmp ueq float [[TMP7]], 3.000000e+00 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_fcmp_const( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = 
fcmp ueq <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP10]], <4 x i64> [[TMP11]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP12]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP13]]) +; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX8]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 2 +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: 
[[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = fcmp ueq float [[TMP16]], 3.000000e+00 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_fcmp_const( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, 
%[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP19]] = select i1 [[TMP15]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-VF1IC4: 
[[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP16]], i64 [[TMP17]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP18]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP19]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 2 +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = fcmp ueq float [[TMP21]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label 
%for.body @@ -189,25 +994,210 @@ exit: ; preds = %for.body } define i64 @select_icmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 @select_icmp( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_icmp( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[RDX_START]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP9]], [[TMP10]] +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_icmp( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i64, ptr [[TMP6]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD7]] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD4]], [[WIDE_LOAD8]] +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD5]], [[WIDE_LOAD9]] +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD6]], [[WIDE_LOAD10]] +; CHECK-VF4IC4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP15]], <4 x i64> [[TMP16]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX12:%.*]] = 
call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX11]], <4 x i64> [[TMP18]]) +; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX12]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP20]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], i64 [[RDX_START]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP21]], [[TMP22]] +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i64 
[[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_icmp( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr 
[[TMP6]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp sgt i64 [[TMP8]], [[TMP16]] +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = icmp sgt i64 [[TMP9]], [[TMP17]] +; CHECK-VF1IC4-NEXT: [[TMP22:%.*]] = icmp sgt i64 [[TMP10]], [[TMP18]] +; CHECK-VF1IC4-NEXT: [[TMP23:%.*]] = icmp sgt i64 [[TMP11]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP20]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP25]] = select i1 [[TMP21]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP26]] = select i1 [[TMP22]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP27]] = select i1 [[TMP23]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP24]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP26]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP27]]) +; CHECK-VF1IC4-NEXT: 
[[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 [[RDX_START]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]] +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -230,25 +1220,210 @@ exit: ; preds = %for.body } define i64 @select_fcmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 
@select_fcmp( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_fcmp( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: 
[[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[RDX_START]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label 
%[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = fcmp ogt float [[TMP9]], [[TMP10]] +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_fcmp( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds float, ptr [[TMP6]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP9]], align 4 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD7]] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD8]] +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD9]] +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD10]] +; CHECK-VF4IC4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP15]], <4 x i64> [[TMP16]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX12:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX11]], <4 x i64> 
[[TMP18]]) +; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX12]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP20]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], i64 [[RDX_START]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = fcmp ogt float [[TMP21]], [[TMP22]] +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 
@select_fcmp( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 +; 
CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP12]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP13]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP14]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP15]], align 4 +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = fcmp ogt float [[TMP8]], [[TMP16]] +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = fcmp ogt float [[TMP9]], [[TMP17]] +; CHECK-VF1IC4-NEXT: [[TMP22:%.*]] = fcmp ogt float [[TMP10]], [[TMP18]] +; CHECK-VF1IC4-NEXT: [[TMP23:%.*]] = fcmp ogt float [[TMP11]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP20]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP25]] = select i1 [[TMP21]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC4-NEXT: [[TMP26]] = select i1 [[TMP22]], i64 [[TMP2]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP27]] = select i1 [[TMP23]], i64 [[TMP3]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP24]], i64 [[TMP25]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP26]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP27]]) +; CHECK-VF1IC4-NEXT: 
[[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 [[RDX_START]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = fcmp ogt float [[TMP29]], [[TMP30]] +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -271,27 +1446,227 @@ exit: ; preds = %for.body } define i64 @select_icmp_min_valid_iv_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; 
CHECK-LABEL: define i64 @select_icmp_min_valid_iv_start( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775807, %[[ENTRY]] ] -; CHECK-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 -; CHECK-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_icmp_min_valid_iv_start( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[IND_END:%.*]] = add i64 -9223372036854775807, [[N_VEC]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] 
+; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[RDX_START]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: 
[[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775807, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP9]], [[TMP10]] +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 +; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_icmp_min_valid_iv_start( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[IND_END:%.*]] = add i64 -9223372036854775807, [[N_VEC]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, 
ptr [[TMP1]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 8 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 12 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD8]] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD5]], [[WIDE_LOAD9]] +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD6]], [[WIDE_LOAD10]] +; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD7]], [[WIDE_LOAD11]] +; CHECK-VF4IC4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI2]] +; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI3]] +; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI4]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add 
<4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP15]], <4 x i64> [[TMP16]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX12:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[RDX_MINMAX13:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX12]], <4 x i64> [[TMP18]]) +; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX13]]) +; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP20]], -9223372036854775808 +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP20]], i64 [[RDX_START]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775807, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] +; 
CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] +; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP21]], [[TMP22]] +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 +; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_icmp_min_valid_iv_start( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: [[IND_END:%.*]] = add i64 -9223372036854775807, [[N_VEC]] +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ 
-9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP31:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 -9223372036854775807, [[INDEX]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP5]] +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP6]] +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP7]] +; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP17]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP18]], align 8 +; 
CHECK-VF1IC4-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP19]], align 8 +; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp sgt i64 [[TMP12]], [[TMP20]] +; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp sgt i64 [[TMP13]], [[TMP21]] +; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = icmp sgt i64 [[TMP14]], [[TMP22]] +; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = icmp sgt i64 [[TMP15]], [[TMP23]] +; CHECK-VF1IC4-NEXT: [[TMP28]] = select i1 [[TMP24]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC4-NEXT: [[TMP29]] = select i1 [[TMP25]], i64 [[TMP1]], i64 [[VEC_PHI2]] +; CHECK-VF1IC4-NEXT: [[TMP30]] = select i1 [[TMP26]], i64 [[TMP2]], i64 [[VEC_PHI3]] +; CHECK-VF1IC4-NEXT: [[TMP31]] = select i1 [[TMP27]], i64 [[TMP3]], i64 [[VEC_PHI4]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP28]], i64 [[TMP29]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP30]]) +; CHECK-VF1IC4-NEXT: [[RDX_MINMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX5]], i64 [[TMP31]]) +; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX6]], -9223372036854775808 +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX6]], i64 [[RDX_START]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775807, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], 
%[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] +; CHECK-VF1IC4-NEXT: [[TMP33:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] +; CHECK-VF1IC4-NEXT: [[TMP34:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP33]], [[TMP34]] +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 +; CHECK-VF1IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -318,27 +1693,71 @@ exit: ; preds = %for.body ; Negative tests define float @not_vectorized_select_float_induction_icmp(ptr %a, ptr %b, float %rdx.start, i64 %n) { -; CHECK-LABEL: define float @not_vectorized_select_float_induction_icmp( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], float [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; 
CHECK-NEXT: [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], float [[FIV]], float [[RDX]] -; CHECK-NEXT: [[CONV3]] = fadd float [[FIV]], 1.000000e+00 -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi float [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret float [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define float @not_vectorized_select_float_induction_icmp( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], float [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = 
icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], float [[FIV]], float [[RDX]] +; CHECK-VF4IC1-NEXT: [[CONV3]] = fadd float [[FIV]], 1.000000e+00 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi float [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret float [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define float @not_vectorized_select_float_induction_icmp( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], float [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], float [[FIV]], float [[RDX]] +; CHECK-VF4IC4-NEXT: [[CONV3]] = fadd float [[FIV]], 1.000000e+00 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi float 
[ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret float [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define float @not_vectorized_select_float_induction_icmp( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], float [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[FIV:%.*]] = phi float [ [[CONV3:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi float [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], float [[FIV]], float [[RDX]] +; CHECK-VF1IC4-NEXT: [[CONV3]] = fadd float [[FIV]], 1.000000e+00 +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi float [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret float [[COND_LCSSA]] ; entry: br label %for.body @@ -363,23 +1782,59 @@ exit: ; preds = %for.body } define i64 @not_vectorized_select_decreasing_induction_icmp_const_start(ptr %a) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_const_start( -; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] 
= phi i64 [ 19999, %[[ENTRY]] ], [ [[DEC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_const_start( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[ENTRY]] ], [ [[DEC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 +; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_const_start( +; 
CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[ENTRY]] ], [ [[DEC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 +; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_const_start( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[ENTRY]] ], [ [[DEC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 +; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-VF1IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; 
CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] ; entry: br label %for.body @@ -400,25 +1855,65 @@ exit: ; preds = %for.body } define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[I_0_IN10:%.*]] = phi i64 [ [[IV:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[IV]] = add nsw i64 [[I_0_IN10]], -1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[I_0_IN10]], 1 -; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[I_0_IN10:%.*]] = phi i64 [ [[IV:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ] +; 
CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV]] = add nsw i64 [[I_0_IN10]], -1 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[I_0_IN10]], 1 +; CHECK-VF4IC1-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[I_0_IN10:%.*]] = phi i64 [ [[IV:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV]] = add nsw i64 [[I_0_IN10]], -1 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; 
CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp ugt i64 [[I_0_IN10]], 1 +; CHECK-VF4IC4-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[I_0_IN10:%.*]] = phi i64 [ [[IV:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[IV]] = add nsw i64 [[I_0_IN10]], -1 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp ugt i64 [[I_0_IN10]], 1 +; CHECK-VF1IC4-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -443,27 +1938,71 @@ exit: ; preds = %for.body ; The sentinel value for increasing-IV vectorization is -LONG_MAX, and since ; the IV hits this value, it is impossible to vectorize this case. 
define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 -; CHECK-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi 
i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 +; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; 
CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 +; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 +; CHECK-VF1IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret 
i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -490,23 +2029,59 @@ exit: ; preds = %for.body ; The sentinel value for decreasing-IV vectorization is LONG_MAX, and since ; the IV hits this value, it is impossible to vectorize this case. define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ], [ [[DEC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ], [ [[DEC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; 
CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 +; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ], [ [[DEC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 +; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] 
], [ [[DEC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[DEC]] = add nsw i64 [[IV]], -1 +; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-VF1IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] ; entry: br label %for.body @@ -527,25 +2102,65 @@ exit: ; preds = %for.body } define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, i64 %ivstart, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC1-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 
[[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF4IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 +; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 
[[IV]], i64 [[RDX]] +; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %for.body @@ -566,3 +2181,61 @@ for.body: ; preds = %entry, %for.body exit: ; preds = %for.body ret i64 %cond } +;. +; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +;. 
+; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +;. 
+; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF1IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF1IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK-VF1IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VF1IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; CHECK-VF1IC4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} +; CHECK-VF1IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; CHECK-VF1IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; CHECK-VF1IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]} +; CHECK-VF1IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/select-min-index.ll b/llvm/test/Transforms/LoopVectorize/select-min-index.ll index 1ce88f7221451..335a7b4569b58 100644 --- a/llvm/test/Transforms/LoopVectorize/select-min-index.ll +++ b/llvm/test/Transforms/LoopVectorize/select-min-index.ll @@ -1,30 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s -; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s -; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-VF4IC2 +; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-VF1IC2 ; Test cases for selecting the index with the minimum value. 
define i64 @test_vectorize_select_umin_idx(ptr %src, i64 %n) { -; CHECK-LABEL: define i64 @test_vectorize_select_umin_idx( -; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_vectorize_select_umin_idx( +; CHECK-VF4IC1-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = 
tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_vectorize_select_umin_idx( +; CHECK-VF4IC2-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_vectorize_select_umin_idx( +; CHECK-VF1IC2-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label 
%[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -48,27 +88,71 @@ exit: } define i64 @test_vectorize_select_umin_idx_all_exit_inst(ptr %src, ptr %umin, i64 %n) { -; CHECK-LABEL: define i64 @test_vectorize_select_umin_idx_all_exit_inst( -; CHECK-SAME: ptr [[SRC:%.*]], ptr [[UMIN:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 
@llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN]], align 4 -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF4IC1-SAME: ptr [[SRC:%.*]], ptr [[UMIN:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: store i64 [[RES_UMIN]], ptr 
[[UMIN]], align 4 +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF4IC2-SAME: ptr [[SRC:%.*]], ptr [[UMIN:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN]], align 4 +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_vectorize_select_umin_idx_all_exit_inst( +; CHECK-VF1IC2-SAME: ptr [[SRC:%.*]], ptr [[UMIN:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; 
CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[RES_UMIN:%.*]] = phi i64 [ [[MIN_VAL_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: store i64 [[RES_UMIN]], ptr [[UMIN]], align 4 +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -94,25 +178,65 @@ exit: } define i64 @test_vectorize_select_umin_idx_min_ops_switched(ptr %src, i64 %n) { -; CHECK-LABEL: define i64 @test_vectorize_select_umin_idx_min_ops_switched( -; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[L]], i64 [[MIN_VAL]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; 
CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_vectorize_select_umin_idx_min_ops_switched( +; CHECK-VF4IC1-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[L]], i64 [[MIN_VAL]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_vectorize_select_umin_idx_min_ops_switched( +; CHECK-VF4IC2-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: 
[[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[L]], i64 [[MIN_VAL]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_vectorize_select_umin_idx_min_ops_switched( +; CHECK-VF1IC2-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[L]], i64 [[MIN_VAL]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = 
icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -136,26 +260,181 @@ exit: } define i64 @test_not_vectorize_select_no_min_reduction(ptr %src, i64 %n) { -; CHECK-LABEL: define i64 @test_not_vectorize_select_no_min_reduction( -; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1 -; CHECK-NEXT: [[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_not_vectorize_select_no_min_reduction( +; CHECK-VF4IC1-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP0]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3]] = add <4 x i64> [[WIDE_LOAD]], splat (i64 1) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP3]], <4 x i32> +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp ugt <4 x i64> [[TMP4]], [[WIDE_LOAD]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]]) +; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808 +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0 +; 
CHECK-VF4IC1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1 +; CHECK-VF4IC1-NEXT: [[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_not_vectorize_select_no_min_reduction( 
+; CHECK-VF4IC2-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-VF4IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC2: [[VECTOR_PH]]: +; CHECK-VF4IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-VF4IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC2: [[VECTOR_BODY]]: +; CHECK-VF4IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC2-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4IC2-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP0]] +; CHECK-VF4IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-VF4IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP1]], i32 4 +; CHECK-VF4IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4 +; CHECK-VF4IC2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[WIDE_LOAD]], splat (i64 1) +; CHECK-VF4IC2-NEXT: [[TMP5]] = add <4 x i64> [[WIDE_LOAD2]], splat (i64 1) +; CHECK-VF4IC2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP4]], <4 x i32> +; CHECK-VF4IC2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> 
[[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-VF4IC2-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i64> [[TMP6]], [[WIDE_LOAD]] +; CHECK-VF4IC2-NEXT: [[TMP9:%.*]] = icmp ugt <4 x i64> [[TMP7]], [[WIDE_LOAD2]] +; CHECK-VF4IC2-NEXT: [[TMP10]] = select <4 x i1> [[TMP8]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC2-NEXT: [[TMP11]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; CHECK-VF4IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-VF4IC2-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-VF4IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC2-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC2: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC2-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP10]], <4 x i64> [[TMP11]]) +; CHECK-VF4IC2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX]]) +; CHECK-VF4IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 +; CHECK-VF4IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP13]], i64 0 +; CHECK-VF4IC2-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; CHECK-VF4IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC2: [[SCALAR_PH]]: +; CHECK-VF4IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; 
CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1 +; CHECK-VF4IC2-NEXT: [[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_not_vectorize_select_no_min_reduction( +; CHECK-VF1IC2-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF1IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC2: [[VECTOR_PH]]: +; CHECK-VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC2: [[VECTOR_BODY]]: +; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 
-9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], 1 +; CHECK-VF1IC2-NEXT: [[TMP7]] = add i64 [[TMP5]], 1 +; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[VECTOR_RECUR]], [[TMP4]] +; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP6]], [[TMP5]] +; CHECK-VF1IC2-NEXT: [[TMP10]] = select i1 [[TMP8]], i64 [[TMP0]], i64 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[TMP11]] = select i1 [[TMP9]], i64 [[TMP1]], i64 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC2-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1IC2: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC2-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP10]], i64 [[TMP11]]) +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX]], -9223372036854775808 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX]], i64 0 +; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC2: [[SCALAR_PH]]: +; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, 
%[[ENTRY]] ] +; CHECK-VF1IC2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = add i64 [[L]], 1 +; CHECK-VF1IC2-NEXT: [[FOO:%.*]] = call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -181,23 +460,59 @@ exit: define i64 @test_not_vectorize_cmp_value(i64 %x, i64 %n) { -; CHECK-LABEL: define i64 @test_not_vectorize_cmp_value( -; CHECK-SAME: i64 [[X:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], 
%[[LOOP]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[X]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_not_vectorize_cmp_value( +; CHECK-VF4IC1-SAME: i64 [[X:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[X]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_not_vectorize_cmp_value( +; CHECK-VF4IC2-SAME: i64 [[X:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[X]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_not_vectorize_cmp_value( +; CHECK-VF1IC2-SAME: i64 [[X:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[X]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br 
label %loop @@ -219,24 +534,62 @@ exit: } define i32 @test_vectorize_select_umin_idx_with_trunc(i64 %n) { -; CHECK-LABEL: define i32 @test_vectorize_select_umin_idx_with_trunc( -; CHECK-SAME: i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], 0 -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i32 [[TRUNC]], i32 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-VF4IC1-LABEL: define i32 @test_vectorize_select_umin_idx_with_trunc( +; CHECK-VF4IC1-SAME: i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], 0 +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF4IC1-NEXT: [[TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i32 [[TRUNC]], i32 
[[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i32 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i32 @test_vectorize_select_umin_idx_with_trunc( +; CHECK-VF4IC2-SAME: i64 [[N:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], 0 +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF4IC2-NEXT: [[TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i32 [[TRUNC]], i32 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i32 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret i32 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i32 @test_vectorize_select_umin_idx_with_trunc( +; CHECK-VF1IC2-SAME: i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; 
CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], 0 +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF1IC2-NEXT: [[TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i32 [[TRUNC]], i32 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i32 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret i32 [[RES]] ; entry: br label %loop @@ -259,23 +612,59 @@ exit: } define ptr @test_with_ptr_index(ptr %start, ptr %end) { -; CHECK-LABEL: define ptr @test_with_ptr_index( -; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[CMP7_US:%.*]] = icmp ult i64 0, 0 -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP7_US]], ptr [[IV]], ptr [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = getelementptr i32, ptr [[IV]], i64 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret ptr [[RES]] +; CHECK-VF4IC1-LABEL: define ptr @test_with_ptr_index( +; CHECK-VF4IC1-SAME: ptr 
[[START:%.*]], ptr [[END:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[CMP7_US:%.*]] = icmp ult i64 0, 0 +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP7_US]], ptr [[IV]], ptr [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = getelementptr i32, ptr [[IV]], i64 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret ptr [[RES]] +; +; CHECK-VF4IC2-LABEL: define ptr @test_with_ptr_index( +; CHECK-VF4IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[CMP7_US:%.*]] = icmp ult i64 0, 0 +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP7_US]], ptr [[IV]], ptr [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = getelementptr i32, ptr [[IV]], i64 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]] 
+; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret ptr [[RES]] +; +; CHECK-VF1IC2-LABEL: define ptr @test_with_ptr_index( +; CHECK-VF1IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[CMP7_US:%.*]] = icmp ult i64 0, 0 +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 0) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP7_US]], ptr [[IV]], ptr [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = getelementptr i32, ptr [[IV]], i64 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret ptr [[RES]] ; entry: br label %loop @@ -297,20 +686,50 @@ exit: } define void @pointer_index(ptr %start) { -; CHECK-LABEL: define void @pointer_index( -; CHECK-SAME: ptr [[START:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_SELECT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[CMP_I_I_I_I2531:%.*]] = icmp ult i16 0, 0 -; CHECK-NEXT: [[PTR_SELECT]] = select i1 [[CMP_I_I_I_I2531]], ptr [[PTR_IV]], ptr [[PTR_IDX]] -; 
CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 -; CHECK-NEXT: [[CMP_I_I10_NOT_I_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], null -; CHECK-NEXT: br i1 [[CMP_I_I10_NOT_I_I_I]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-VF4IC1-LABEL: define void @pointer_index( +; CHECK-VF4IC1-SAME: ptr [[START:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[PTR_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_SELECT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[CMP_I_I_I_I2531:%.*]] = icmp ult i16 0, 0 +; CHECK-VF4IC1-NEXT: [[PTR_SELECT]] = select i1 [[CMP_I_I_I_I2531]], ptr [[PTR_IV]], ptr [[PTR_IDX]] +; CHECK-VF4IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 +; CHECK-VF4IC1-NEXT: [[CMP_I_I10_NOT_I_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], null +; CHECK-VF4IC1-NEXT: br i1 [[CMP_I_I10_NOT_I_I_I]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: ret void +; +; CHECK-VF4IC2-LABEL: define void @pointer_index( +; CHECK-VF4IC2-SAME: ptr [[START:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[PTR_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_SELECT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[CMP_I_I_I_I2531:%.*]] = icmp ult i16 0, 0 +; CHECK-VF4IC2-NEXT: [[PTR_SELECT]] = select i1 [[CMP_I_I_I_I2531]], ptr [[PTR_IV]], ptr [[PTR_IDX]] +; CHECK-VF4IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 +; CHECK-VF4IC2-NEXT: [[CMP_I_I10_NOT_I_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], null +; CHECK-VF4IC2-NEXT: br i1 
[[CMP_I_I10_NOT_I_I_I]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: ret void +; +; CHECK-VF1IC2-LABEL: define void @pointer_index( +; CHECK-VF1IC2-SAME: ptr [[START:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[PTR_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_SELECT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[CMP_I_I_I_I2531:%.*]] = icmp ult i16 0, 0 +; CHECK-VF1IC2-NEXT: [[PTR_SELECT]] = select i1 [[CMP_I_I_I_I2531]], ptr [[PTR_IV]], ptr [[PTR_IDX]] +; CHECK-VF1IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 +; CHECK-VF1IC2-NEXT: [[CMP_I_I10_NOT_I_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], null +; CHECK-VF1IC2-NEXT: br i1 [[CMP_I_I10_NOT_I_I_I]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: ret void ; entry: br label %loop @@ -329,23 +748,59 @@ exit: } define ptr @pointer_index_2(ptr %start, ptr %end) { -; CHECK-LABEL: define ptr @pointer_index_2( -; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ult i16 0, [[MIN_VAL]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = call i16 @llvm.umin.i16(i16 0, i16 [[MIN_VAL]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP_I_I_I_I]], ptr [[PTR_IV]], ptr [[MIN_IDX]] -; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr 
[[PTR_IV_NEXT]], [[END]] -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret ptr [[RES]] +; CHECK-VF4IC1-LABEL: define ptr @pointer_index_2( +; CHECK-VF4IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ult i16 0, [[MIN_VAL]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = call i16 @llvm.umin.i16(i16 0, i16 [[MIN_VAL]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP_I_I_I_I]], ptr [[PTR_IV]], ptr [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 +; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret ptr [[RES]] +; +; CHECK-VF4IC2-LABEL: define ptr @pointer_index_2( +; CHECK-VF4IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[CMP_I_I_I_I:%.*]] = 
icmp ult i16 0, [[MIN_VAL]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = call i16 @llvm.umin.i16(i16 0, i16 [[MIN_VAL]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP_I_I_I_I]], ptr [[PTR_IV]], ptr [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 +; CHECK-VF4IC2-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; CHECK-VF4IC2-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret ptr [[RES]] +; +; CHECK-VF1IC2-LABEL: define ptr @pointer_index_2( +; CHECK-VF1IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ult i16 0, [[MIN_VAL]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = call i16 @llvm.umin.i16(i16 0, i16 [[MIN_VAL]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP_I_I_I_I]], ptr [[PTR_IV]], ptr [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i16, ptr [[PTR_IV]], i64 1 +; CHECK-VF1IC2-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; CHECK-VF1IC2-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi ptr [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret ptr [[RES]] ; entry: br label %loop @@ -367,25 +822,65 @@ exit: } define i64 @test_no_vectorize_select_iv_decrement(ptr %src) { -; CHECK-LABEL: define i64 @test_no_vectorize_select_iv_decrement( -; CHECK-SAME: 
ptr [[SRC:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], -1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_no_vectorize_select_iv_decrement( +; CHECK-VF4IC1-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; 
CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], -1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_no_vectorize_select_iv_decrement( +; CHECK-VF4IC2-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], -1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_no_vectorize_select_iv_decrement( +; CHECK-VF1IC2-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = 
phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], -1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -409,25 +904,65 @@ exit: } define i64 @test_no_vectorize_select_iv_sub(ptr %src) { -; CHECK-LABEL: define i64 @test_no_vectorize_select_iv_sub( -; CHECK-SAME: ptr [[SRC:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = sub i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 
[[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_no_vectorize_select_iv_sub( +; CHECK-VF4IC1-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = sub i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_no_vectorize_select_iv_sub( +; CHECK-VF4IC2-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = 
getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = sub i64 [[IV]], 1 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_no_vectorize_select_iv_sub( +; CHECK-VF1IC2-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = sub i64 [[IV]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label 
%loop @@ -451,25 +986,65 @@ exit: } define i64 @test_no_vectorize_select_iv_mul(ptr %src) { -; CHECK-LABEL: define i64 @test_no_vectorize_select_iv_mul( -; CHECK-SAME: ptr [[SRC:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] -; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) -; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] -; CHECK-NEXT: [[IV_NEXT]] = mul i64 [[IV]], 2 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 128 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-VF4IC1-LABEL: define i64 @test_no_vectorize_select_iv_mul( +; CHECK-VF4IC1-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC1-NEXT: [[MIN_VAL_NEXT]] = tail 
call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC1-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = mul i64 [[IV]], 2 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 128 +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[RES]] +; +; CHECK-VF4IC2-LABEL: define i64 @test_no_vectorize_select_iv_mul( +; CHECK-VF4IC2-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF4IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC2: [[LOOP]]: +; CHECK-VF4IC2-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF4IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF4IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF4IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF4IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF4IC2-NEXT: [[IV_NEXT]] = mul i64 [[IV]], 2 +; CHECK-VF4IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 128 +; CHECK-VF4IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF4IC2: [[EXIT]]: +; CHECK-VF4IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF4IC2-NEXT: ret i64 [[RES]] +; +; CHECK-VF1IC2-LABEL: define i64 @test_no_vectorize_select_iv_mul( +; CHECK-VF1IC2-SAME: ptr [[SRC:%.*]]) { +; CHECK-VF1IC2-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC2-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC2: [[LOOP]]: +; CHECK-VF1IC2-NEXT: 
[[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]] +; CHECK-VF1IC2-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-VF1IC2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[MIN_VAL]], [[L]] +; CHECK-VF1IC2-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umin.i64(i64 [[MIN_VAL]], i64 [[L]]) +; CHECK-VF1IC2-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]] +; CHECK-VF1IC2-NEXT: [[IV_NEXT]] = mul i64 [[IV]], 2 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 128 +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC2: [[EXIT]]: +; CHECK-VF1IC2-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ] +; CHECK-VF1IC2-NEXT: ret i64 [[RES]] ; entry: br label %loop @@ -494,3 +1069,19 @@ exit: declare i64 @llvm.umin.i64(i64, i64) declare i16 @llvm.umin.i16(i16, i16) +;. +; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. +; CHECK-VF4IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
+; CHECK-VF1IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF1IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF1IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF1IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll index 368c766e6b3c7..730dbfe84070a 100644 --- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll @@ -15,7 +15,7 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %src = alloca [128 x i32], align 4 ; CHECK-NEXT: IR call void @init(ptr %src) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -99,7 +99,7 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() { ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %src = alloca [128 x i32], align 4 ; CHECK-NEXT: IR call void @init(ptr %src) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -176,7 +176,7 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() { ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %src = alloca [128 x i32], align 4 ; CHECK-NEXT: IR call void @init(ptr %src) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll index 68dd47537fdfd..50755f8f05e35 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -14,8 +14,9 @@ define void 
@print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: compound=true ; CHECK-NEXT: N0 [label = ; CHECK-NEXT: "ir-bb\:\l" + -; CHECK-NEXT: "No successors\l" +; CHECK-NEXT: "Successor(s): vector.ph\l" ; CHECK-NEXT: ] +; CHECK-NEXT: N0 -> N1 [ label=""] ; CHECK-NEXT: N1 [label = ; CHECK-NEXT: "vector.ph:\l" + ; CHECK-NEXT: "Successor(s): vector loop\l" diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll index 323a638fd6354..c342d2f81e979 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -9,6 +9,9 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll index f0ac88c75e9ec..484e1ea8de0d2 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll @@ -9,9 +9,12 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV ((-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, 2 +; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf +; CHECK-NEXT: IR %ind.end = getelementptr i8, ptr %start, i64 %n.vec ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -87,6 +90,9 @@ 
define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[RESUME:%.+]]> = resume-phi ir<%ind.end>, ir<%start> ; CHECK-NEXT: Successor(s): ir-bb @@ -95,9 +101,6 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %ptr.iv.next, %loop.latch ] (extra operand: vp<[[RESUME]]> from ir-bb) ; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 ; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index 6bd31565e9fb9..f07d1af47af02 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -16,7 +16,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %and = and i64 %N, 15 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -64,9 +64,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %and = and i64 %N, 15 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %n.mod.vf = urem i64 %and, 16 +; CHECK-NEXT: IR %n.vec = sub i64 %and, %n.mod.vf +; CHECK-NEXT: IR %ind.end = sub i64 %and, %n.vec +; CHECK-NEXT: IR 
%ind.end1 = getelementptr i8, ptr %A, i64 %n.vec ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -95,6 +99,9 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: EMIT branch-on-cond vp<[[C]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%and> ; CHECK-NEXT: EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end1>, ir<%A> @@ -105,9 +112,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: IR %p.src = phi ptr [ %A, %scalar.ph ], [ %p.src.next, %loop ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb) ; CHECK: IR %cmp = icmp eq i64 %iv.next, 0 ; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll index 5e3116eae8548..2adeb5920cb5b 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll @@ -11,6 +11,9 @@ define void @foo(i64 %n) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<8> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 32e55bff94b3e..c526c53dbea06 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -13,6 +13,9 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; 
CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -77,6 +80,9 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x, ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -145,6 +151,9 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -208,6 +217,9 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -275,7 +287,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 smax %n) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -365,6 +377,9 @@ define void @print_interleave_groups(i32 %C, i32 %D) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<256> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector 
loop ; CHECK-EMPTY: @@ -446,6 +461,9 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -516,6 +534,9 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<128> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -624,7 +645,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: IR %inc = add i64 %div, 1 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) ; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -691,6 +712,9 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -752,6 +776,9 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr % ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -818,6 +845,9 @@ define 
void @print_exact_flags(i64 %n, ptr noalias %x) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -882,6 +912,9 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -977,6 +1010,9 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -1041,6 +1077,9 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -1082,6 +1121,9 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<1000> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll index 823b1cef93ce7..a939b1e923a91 
100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll @@ -12,6 +12,9 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: ir<0> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 330a6b1715c78..94aefdee82b14 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -19,7 +19,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -88,7 +88,7 @@ exit: ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -173,7 +173,7 @@ exit: ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -258,6 +258,9 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<11> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): 
vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr inbounds ir<%A>, ir<0> ; CHECK-NEXT: Successor(s): vector loop @@ -332,7 +335,7 @@ define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -430,7 +433,7 @@ define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -537,7 +540,7 @@ define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -644,7 +647,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -750,7 +753,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -816,7 +819,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + (8 umin %k)) -; CHECK-NEXT: No successors +; CHECK-NEXT: 
Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -898,6 +901,9 @@ define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<999> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -962,6 +968,9 @@ define void @sinking_requires_duplication(ptr %addr) { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<201> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -1033,6 +1042,9 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -1119,7 +1131,7 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV ((-1 * (ptrtoint ptr %end to i64)) + (ptrtoint ptr %start to i64)) -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop diff --git a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll index 27d81de260d3b..9778ef6853a70 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll @@ -13,6 +13,9 @@ define void 
@test_unused_interleave(ptr %src, i32 %length) { ; CHECK-NEXT: Live-in vp<%1> = vector-trip-count ; CHECK-NEXT: Live-in ir<%length> = original trip-count ; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/MergeICmps/X86/distinct-index-width-crash.ll b/llvm/test/Transforms/MergeICmps/X86/distinct-index-width-crash.ll index 7dce968ee9de0..8ff7e95674f96 100644 --- a/llvm/test/Transforms/MergeICmps/X86/distinct-index-width-crash.ll +++ b/llvm/test/Transforms/MergeICmps/X86/distinct-index-width-crash.ll @@ -8,7 +8,7 @@ target triple = "x86_64" target datalayout = "e-p:64:64:64:32" ; Define a cunstom data layout that has index width < pointer width -; and make sure that doesn't mreak anything +; and make sure that doesn't break anything define void @fat_ptrs(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b) { ; CHECK-LABEL: @fat_ptrs( ; CHECK-NEXT: bb0: @@ -16,7 +16,7 @@ define void @fat_ptrs(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b) { ; CHECK-NEXT: [[PTR_B1:%.*]] = getelementptr inbounds [2 x i64], ptr [[B:%.*]], i32 0, i32 1 ; CHECK-NEXT: br label %"bb1+bb2" ; CHECK: "bb1+bb2": -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i64 16) +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i32 16) ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll index 4423b89d81565..7097171ab78c5 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll @@ -18,12 +18,8 @@ define i32 @read_only_loop_with_runtime_check(ptr noundef %array, i32 noundef %c ; CHECK: for.body.preheader10: ; 
CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER13:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: for.body.preheader13: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER10]] ], [ [[N_VEC:%.*]], [[MIDDLE_BLOCK:%.*]] ] -; CHECK-NEXT: [[SUM_07_PH:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER10]] ], [ [[TMP7:%.*]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967288 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -37,12 +33,16 @@ define i32 @read_only_loop_with_runtime_check(ptr noundef %array, i32 noundef %c ; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[WIDE_LOAD12]], [[VEC_PHI11]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP7:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER13]] +; CHECK: for.body.preheader13: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER10]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_07_PH:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER10]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] 
; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] @@ -133,12 +133,8 @@ define dso_local noundef i32 @sum_prefix_with_sum(ptr %s.coerce0, i64 %s.coerce1 ; CHECK: for.body.preheader8: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER11:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: for.body.preheader11: -; CHECK-NEXT: [[I_07_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER8]] ], [ [[N_VEC:%.*]], [[SPAN_CHECKED_ACCESS_EXIT:%.*]] ] -; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER8]] ], [ [[ADD:%.*]], [[SPAN_CHECKED_ACCESS_EXIT]] ] -; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC]] = and i64 [[N]], -8 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] @@ -152,12 +148,16 @@ define dso_local noundef i32 @sum_prefix_with_sum(ptr %s.coerce0, i64 %s.coerce1 ; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[WIDE_LOAD10]], [[VEC_PHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[SPAN_CHECKED_ACCESS_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[SPAN_CHECKED_ACCESS_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[ADD]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[ADD:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label 
[[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER11]] +; CHECK: for.body.preheader11: +; CHECK-NEXT: [[I_07_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER8]] ], [ [[N_VEC]], [[SPAN_CHECKED_ACCESS_EXIT]] ] +; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER8]] ], [ [[ADD]], [[SPAN_CHECKED_ACCESS_EXIT]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[RET_0_LCSSA1:%.*]] = phi i32 [ 0, [[ENTRY1:%.*]] ], [ [[ADD]], [[SPAN_CHECKED_ACCESS_EXIT]] ], [ [[ADD1:%.*]], [[FOR_BODY1]] ] ; CHECK-NEXT: ret i32 [[RET_0_LCSSA1]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll b/llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll deleted file mode 100644 index 07bfbffa9518f..0000000000000 --- a/llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll +++ /dev/null @@ -1,252 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s - -define i32 @movmsk_i32_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { -; CHECK-LABEL: @movmsk_i32_v32i8_v16i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 -; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 -; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i32 -; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i32 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i32 [[Z0]], 16 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] -; CHECK-NEXT: ret i32 [[OR]] -; - %c0 = icmp slt <16 x i8> %v0, zeroinitializer - %c1 = icmp slt <16 x i8> %v1, zeroinitializer - %b0 = bitcast <16 x i1> %c0 to i16 - %b1 = 
bitcast <16 x i1> %c1 to i16 - %z0 = zext i16 %b0 to i32 - %z1 = zext i16 %b1 to i32 - %s0 = shl nuw i32 %z0, 16 - %or = or disjoint i32 %s0, %z1 - ret i32 %or -} - -define i32 @movmsk_i32_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { -; CHECK-LABEL: @movmsk_i32_v8i32_v4i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 -; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 -; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i32 -; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i32 -; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i32 [[Z0]], 4 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] -; CHECK-NEXT: ret i32 [[OR]] -; - %c0 = icmp slt <4 x i32> %v0, zeroinitializer - %c1 = icmp slt <4 x i32> %v1, zeroinitializer - %b0 = bitcast <4 x i1> %c0 to i4 - %b1 = bitcast <4 x i1> %c1 to i4 - %z0 = zext i4 %b0 to i32 - %z1 = zext i4 %b1 to i32 - %s0 = shl nuw i32 %z0, 4 - %or = or disjoint i32 %s0, %z1 - ret i32 %or -} - -define i64 @movmsk_i64_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { -; CHECK-LABEL: @movmsk_i64_v32i8_v16i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 -; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 -; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 16 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] -; CHECK-NEXT: ret i64 [[OR]] -; - %c0 = icmp slt <16 x i8> %v0, zeroinitializer - %c1 = icmp slt <16 x i8> %v1, zeroinitializer - %b0 = bitcast <16 x i1> %c0 to i16 - %b1 = bitcast <16 x i1> %c1 to i16 - %z0 = zext i16 %b0 to i64 - %z1 = zext i16 %b1 to i64 - %s0 = shl nuw i64 %z0, 16 - %or = or disjoint i64 %s0, %z1 
- ret i64 %or -} - -define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { -; CHECK-LABEL: @movmsk_i64_v8i32_v4i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 -; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 -; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 4 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] -; CHECK-NEXT: ret i64 [[OR]] -; - %c0 = icmp slt <4 x i32> %v0, zeroinitializer - %c1 = icmp slt <4 x i32> %v1, zeroinitializer - %b0 = bitcast <4 x i1> %c0 to i4 - %b1 = bitcast <4 x i1> %c1 to i4 - %z0 = zext i4 %b0 to i64 - %z1 = zext i4 %b1 to i64 - %s0 = shl nuw i64 %z0, 4 - %or = or disjoint i64 %s0, %z1 - ret i64 %or -} - -define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; CHECK-LABEL: @movmsk_i64_v64i8_v16i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[C2:%.*]] = icmp slt <16 x i8> [[V2:%.*]], zeroinitializer -; CHECK-NEXT: [[C3:%.*]] = icmp slt <16 x i8> [[V3:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 -; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 -; CHECK-NEXT: [[B2:%.*]] = bitcast <16 x i1> [[C2]] to i16 -; CHECK-NEXT: [[B3:%.*]] = bitcast <16 x i1> [[C3]] to i16 -; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i64 -; CHECK-NEXT: [[Z2:%.*]] = zext i16 [[B2]] to i64 -; CHECK-NEXT: [[Z3:%.*]] = zext i16 [[B3]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 48 -; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 32 -; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16 -; CHECK-NEXT: 
[[OR0:%.*]] = or disjoint i64 [[S1]], [[S0]] -; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[S2]], [[Z3]] -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[OR1]], [[OR0]] -; CHECK-NEXT: ret i64 [[OR]] -; - %c0 = icmp slt <16 x i8> %v0, zeroinitializer - %c1 = icmp slt <16 x i8> %v1, zeroinitializer - %c2 = icmp slt <16 x i8> %v2, zeroinitializer - %c3 = icmp slt <16 x i8> %v3, zeroinitializer - %b0 = bitcast <16 x i1> %c0 to i16 - %b1 = bitcast <16 x i1> %c1 to i16 - %b2 = bitcast <16 x i1> %c2 to i16 - %b3 = bitcast <16 x i1> %c3 to i16 - %z0 = zext i16 %b0 to i64 - %z1 = zext i16 %b1 to i64 - %z2 = zext i16 %b2 to i64 - %z3 = zext i16 %b3 to i64 - %s0 = shl nuw i64 %z0, 48 - %s1 = shl nuw i64 %z1, 32 - %s2 = shl nuw i64 %z2, 16 - %or0 = or disjoint i64 %s0, %s1 - %or1 = or disjoint i64 %s2, %z3 - %or = or disjoint i64 %or0, %or1 - ret i64 %or -} - -define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -; CHECK-LABEL: @movmsk_i64_v32i32_v4i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[C2:%.*]] = icmp slt <4 x i32> [[V2:%.*]], zeroinitializer -; CHECK-NEXT: [[C3:%.*]] = icmp slt <4 x i32> [[V3:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 -; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 -; CHECK-NEXT: [[B2:%.*]] = bitcast <4 x i1> [[C2]] to i4 -; CHECK-NEXT: [[B3:%.*]] = bitcast <4 x i1> [[C3]] to i4 -; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i64 -; CHECK-NEXT: [[Z2:%.*]] = zext i4 [[B2]] to i64 -; CHECK-NEXT: [[Z3:%.*]] = zext i4 [[B3]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 12 -; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8 -; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 4 -; CHECK-NEXT: [[OR0:%.*]] = or disjoint i64 [[S1]], [[S0]] -; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 
[[S2]], [[Z3]] -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[OR1]], [[OR0]] -; CHECK-NEXT: ret i64 [[OR]] -; - %c0 = icmp slt <4 x i32> %v0, zeroinitializer - %c1 = icmp slt <4 x i32> %v1, zeroinitializer - %c2 = icmp slt <4 x i32> %v2, zeroinitializer - %c3 = icmp slt <4 x i32> %v3, zeroinitializer - %b0 = bitcast <4 x i1> %c0 to i4 - %b1 = bitcast <4 x i1> %c1 to i4 - %b2 = bitcast <4 x i1> %c2 to i4 - %b3 = bitcast <4 x i1> %c3 to i4 - %z0 = zext i4 %b0 to i64 - %z1 = zext i4 %b1 to i64 - %z2 = zext i4 %b2 to i64 - %z3 = zext i4 %b3 to i64 - %s0 = shl nuw i64 %z0, 12 - %s1 = shl nuw i64 %z1, 8 - %s2 = shl nuw i64 %z2, 4 - %or0 = or disjoint i64 %s0, %s1 - %or1 = or disjoint i64 %s2, %z3 - %or = or disjoint i64 %or0, %or1 - ret i64 %or -} - -define i64 @movmsk_i64_v64i8_v32i8(<32 x i8> %v0, <32 x i8> %v1) { -; CHECK-LABEL: @movmsk_i64_v64i8_v32i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <32 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <32 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <32 x i1> [[C0]] to i32 -; CHECK-NEXT: [[B1:%.*]] = bitcast <32 x i1> [[C1]] to i32 -; CHECK-NEXT: [[Z0:%.*]] = zext i32 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i32 [[B1]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 32 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] -; CHECK-NEXT: ret i64 [[OR]] -; - %c0 = icmp slt <32 x i8> %v0, zeroinitializer - %c1 = icmp slt <32 x i8> %v1, zeroinitializer - %b0 = bitcast <32 x i1> %c0 to i32 - %b1 = bitcast <32 x i1> %c1 to i32 - %z0 = zext i32 %b0 to i64 - %z1 = zext i32 %b1 to i64 - %s0 = shl nuw i64 %z0, 32 - %or = or disjoint i64 %s0, %z1 - ret i64 %or -} - -define i32 @movmsk_i32_v16i32_v8i32(<8 x i32> %v0, <8 x i32> %v1) { -; CHECK-LABEL: @movmsk_i32_v16i32_v8i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <8 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <8 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <8 x i1> [[C0]] to 
i8 -; CHECK-NEXT: [[B1:%.*]] = bitcast <8 x i1> [[C1]] to i8 -; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[B0]] to i32 -; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[B1]] to i32 -; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i32 [[Z0]], 8 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] -; CHECK-NEXT: ret i32 [[OR]] -; - %c0 = icmp slt <8 x i32> %v0, zeroinitializer - %c1 = icmp slt <8 x i32> %v1, zeroinitializer - %b0 = bitcast <8 x i1> %c0 to i8 - %b1 = bitcast <8 x i1> %c1 to i8 - %z0 = zext i8 %b0 to i32 - %z1 = zext i8 %b1 to i32 - %s0 = shl nuw i32 %z0, 8 - %or = or disjoint i32 %s0, %z1 - ret i32 %or -} - -define i64 @PR111431(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { -; CHECK-LABEL: @PR111431( -; CHECK-NEXT: [[C01:%.*]] = icmp eq <32 x i8> [[A0:%.*]], [[A1:%.*]] -; CHECK-NEXT: [[C02:%.*]] = icmp eq <32 x i8> [[A0]], [[A2:%.*]] -; CHECK-NEXT: [[B01:%.*]] = bitcast <32 x i1> [[C01]] to i32 -; CHECK-NEXT: [[B02:%.*]] = bitcast <32 x i1> [[C02]] to i32 -; CHECK-NEXT: [[Z01:%.*]] = zext i32 [[B01]] to i64 -; CHECK-NEXT: [[Z02:%.*]] = zext i32 [[B02]] to i64 -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[Z01]], 32 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[SHL]], [[Z02]] -; CHECK-NEXT: ret i64 [[OR]] -; - %c01 = icmp eq <32 x i8> %a0, %a1 - %c02 = icmp eq <32 x i8> %a0, %a2 - %b01 = bitcast <32 x i1> %c01 to i32 - %b02 = bitcast <32 x i1> %c02 to i32 - %z01 = zext i32 %b01 to i64 - %z02 = zext i32 %b02 to i64 - %shl = shl nuw i64 %z01, 32 - %or = or disjoint i64 %shl, %z02 - ret i64 %or -} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll index 63e85731e8c70..8bc40cfc5cc8b 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll @@ -53,9 +53,6 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O2-NEXT: br i1 [[CMP24_NOT]], label [[FOR_COND_CLEANUP3]], label 
[[FOR_BODY4_PREHEADER:%.*]] ; O2: for.body4.preheader: ; O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER9:%.*]], label [[VECTOR_BODY:%.*]] -; O2: for.body4.preheader9: -; O2-NEXT: [[J_05_PH:%.*]] = phi i64 [ 0, [[FOR_BODY4_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK:%.*]] ] -; O2-NEXT: br label [[FOR_BODY4:%.*]] ; O2: vector.body: ; O2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_BODY4_PREHEADER]] ] ; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[INDEX]] @@ -68,9 +65,12 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O2-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4, !tbaa [[TBAA0]] ; O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; O2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; O2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; O2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; O2: middle.block: ; O2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER9]] +; O2: for.body4.preheader9: +; O2-NEXT: [[J_05_PH:%.*]] = phi i64 [ 0, [[FOR_BODY4_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; O2-NEXT: br label [[FOR_BODY4:%.*]] ; O2: for.cond.cleanup: ; O2-NEXT: ret void ; O2: for.cond.cleanup3: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll index d79ffb0149ff8..1e062041b1286 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll @@ -17,11 +17,8 @@ define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %f ; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[FACE_CELL]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NFACE]], 4 ; CHECK-NEXT: br i1 
[[TMP1]], label %[[FOR_BODY_PREHEADER14:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[FOR_BODY_PREHEADER14]]: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[UNROLL_ITER:%.*]], %[[MIDDLE_BLOCK:.*]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[UNROLL_ITER]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP0]], 2147483644 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -40,10 +37,13 @@ define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %f ; CHECK-NEXT: tail call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP8]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV_EPIL]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[UNROLL_ITER]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UNROLL_ITER]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY_PREHEADER14]] +; CHECK: [[FOR_BODY_PREHEADER14]]: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[UNROLL_ITER]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll index b312688b7932d..61ff4f5766d30 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll @@ -1,5 +1,6 @@ ; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s +; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck --check-prefixes=CHECK,POWEROF2 %s +; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck --check-prefixes=CHECK,NONPOWEROF2 %s define i32 @test() { ; CHECK-LABEL: @test( @@ -134,3 +135,99 @@ for.body: %6 = select <2 x i1> %4, <2 x float> %3, <2 x float> zeroinitializer br label %for.cond.cleanup } + +define ptr @test4() { +; POWEROF2-LABEL: @test4( +; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer +; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) +; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) +; POWEROF2-NEXT: br label [[TMP8:%.*]] +; POWEROF2: 7: +; POWEROF2-NEXT: br label [[TMP8]] +; POWEROF2: 8: +; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ] +; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ] +; POWEROF2-NEXT: br label [[TMP11:%.*]] +; POWEROF2: 11: +; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0) +; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> 
[[TMP12]], zeroinitializer +; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2) +; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]] +; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]] +; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00 +; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]] +; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]] +; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]] +; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]] +; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]]) +; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]]) +; POWEROF2-NEXT: ret ptr null +; +; NONPOWEROF2-LABEL: @test4( +; NONPOWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer +; NONPOWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> +; NONPOWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> +; NONPOWEROF2-NEXT: [[TMP4:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> poison, <3 x float> [[TMP2]], i64 0) +; NONPOWEROF2-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> [[TMP4]], <3 x float> [[TMP3]], i64 3) +; NONPOWEROF2-NEXT: br label [[TMP7:%.*]] +; NONPOWEROF2: 6: +; 
NONPOWEROF2-NEXT: br label [[TMP7]] +; NONPOWEROF2: 7: +; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ] +; NONPOWEROF2-NEXT: br label [[TMP9:%.*]] +; NONPOWEROF2: 9: +; NONPOWEROF2-NEXT: [[TMP10:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 0) +; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]] +; NONPOWEROF2-NEXT: [[TMP12:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 3) +; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]] +; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]]) +; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]]) +; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]]) +; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]]) +; NONPOWEROF2-NEXT: ret ptr null +; + %1 = fadd <8 x float> zeroinitializer, zeroinitializer + %2 = extractelement <8 x float> %1, i64 0 + %3 = extractelement <8 x float> %1, i64 1 + %4 = extractelement <8 x float> %1, i64 2 + %5 = extractelement <8 x float> %1, i64 4 + %6 = extractelement <8 x float> %1, i64 5 + %7 = extractelement <8 x float> %1, i64 6 + br label %9 + +8: + br label %9 + +9: + %10 = phi float [ 0.000000e+00, %8 ], [ %7, %0 ] + %11 = phi float [ 0.000000e+00, %8 ], [ %6, %0 ] + %12 = phi float [ 0.000000e+00, %8 ], [ %5, %0 ] + %13 = phi float [ 0.000000e+00, %8 ], [ %4, %0 ] + %14 = phi float [ 0.000000e+00, %8 ], [ %3, %0 ] + %15 = phi float [ 0.000000e+00, %8 ], [ %2, %0 ] + br label %16 + +16: + %17 = fmul float %14, 0.000000e+00 + %18 = fmul float 0.000000e+00, %11 + %19 = fmul float 0.000000e+00, %15 + %20 = fmul float %12, 0.000000e+00 + %21 = fadd reassoc nsz float %17, %19 + %22 = fadd reassoc nsz float 
%18, %20 + %23 = fmul float %13, 0.000000e+00 + %24 = fmul float %10, 0.000000e+00 + %25 = fadd reassoc nsz float %21, %23 + %26 = fadd reassoc nsz float %22, %24 + %27 = tail call float @llvm.sqrt.f32(float %25) + %28 = tail call float @llvm.sqrt.f32(float %26) + ret ptr null +} diff --git a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll index 87862b929a751..cc57abe391aa8 100644 --- a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll +++ b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll @@ -9,19 +9,18 @@ define i32 @alloca_used_in_call(ptr %data, i64 %n) { ; CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]]) -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: ret i32 [[I1]] +; CHECK-NEXT: ret i32 [[RDX_INC]] ; entry: %retval = alloca i32, align 4 @@ -138,19 +137,18 @@ define i32 @alloca_not_captured_and_readonly_as_per_operand_attr(ptr 
%data, i64 ; CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[I0:%.*]] = call i32 @capture_of_alloca(ptr nocapture readonly [[RETVAL]]) -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: ret i32 [[I1]] +; CHECK-NEXT: ret i32 [[RDX_INC]] ; entry: %retval = alloca i32, align 4 @@ -267,19 +265,18 @@ define i32 @alloca_with_gep_used_in_call(ptr %data, i64 %n) { ; CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; 
CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]]) -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: ret i32 [[I1]] +; CHECK-NEXT: ret i32 [[RDX_INC]] ; entry: %retval = alloca i32, align 4 @@ -353,11 +350,11 @@ define i32 @alloca_used_in_maybe_throwing_call(ptr %data, i64 %n) personality pt ; CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] @@ -372,8 +369,7 @@ define i32 @alloca_used_in_maybe_throwing_call(ptr %data, i64 %n) personality pt ; CHECK-NEXT: catch ptr null ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: ret i32 [[I2]] +; CHECK-NEXT: ret i32 [[RDX_INC]] ; entry: %retval = 
alloca i32, align 4 @@ -413,11 +409,11 @@ define i32 @alloca_used_in_maybe_throwing_call_with_same_dests(ptr %data, i64 %n ; CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] @@ -430,8 +426,7 @@ define i32 @alloca_used_in_maybe_throwing_call_with_same_dests(ptr %data, i64 %n ; CHECK-NEXT: catch ptr null ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: ret i32 [[I2]] +; CHECK-NEXT: ret i32 [[RDX_INC]] ; entry: %retval = alloca i32, align 4 @@ -472,11 +467,11 @@ define [2 x i32] @part_of_alloca_used_in_call(ptr %data, i64 %n) { ; CHECK-NEXT: [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, 
ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] @@ -484,11 +479,9 @@ define [2 x i32] @part_of_alloca_used_in_call(ptr %data, i64 %n) { ; CHECK: exit: ; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]]) ; CHECK-NEXT: [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0 -; CHECK-NEXT: [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4 -; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0 ; CHECK-NEXT: [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1 -; CHECK-NEXT: [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4 -; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1 +; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[RDX_INC]], 1 ; CHECK-NEXT: ret [2 x i32] [[I1_FCA_1_INSERT]] ; entry: @@ -525,11 +518,11 @@ define [2 x i32] @all_parts_of_alloca_used_in_call_with_multiple_args(ptr %data, ; CHECK-NEXT: [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 
[[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] @@ -537,11 +530,9 @@ define [2 x i32] @all_parts_of_alloca_used_in_call_with_multiple_args(ptr %data, ; CHECK: exit: ; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL_FULL]]) ; CHECK-NEXT: [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0 -; CHECK-NEXT: [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4 -; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0 ; CHECK-NEXT: [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1 -; CHECK-NEXT: [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4 -; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1 +; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[RDX_INC]], 1 ; CHECK-NEXT: ret [2 x i32] [[I1_FCA_1_INSERT]] ; entry: @@ -688,11 +679,11 @@ define [2 x i32] @part_of_alloca_used_in_call_with_multiple_args(ptr %data, i64 ; CHECK-NEXT: [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: 
[[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] @@ -700,11 +691,9 @@ define [2 x i32] @part_of_alloca_used_in_call_with_multiple_args(ptr %data, i64 ; CHECK: exit: ; CHECK-NEXT: [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL]]) ; CHECK-NEXT: [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0 -; CHECK-NEXT: [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4 -; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0 ; CHECK-NEXT: [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1 -; CHECK-NEXT: [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4 -; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1 +; CHECK-NEXT: [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[RDX_INC]], 1 ; CHECK-NEXT: ret [2 x i32] [[I1_FCA_1_INSERT]] ; entry: @@ -742,11 +731,11 @@ define [2 x i32] @all_parts_of_alloca_used_in_calls_with_multiple_args(ptr %data ; CHECK-NEXT: [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: 
loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4 -; CHECK-NEXT: [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]] +; CHECK-NEXT: [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]] ; CHECK-NEXT: store i32 [[RDX_INC]], ptr [[RETVAL]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]] @@ -756,11 +745,9 @@ define [2 x i32] @all_parts_of_alloca_used_in_calls_with_multiple_args(ptr %data ; CHECK-NEXT: [[I1:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL_FULL]], ptr [[RETVAL]]) ; CHECK-NEXT: [[I2:%.*]] = call i32 @capture_of_alloca(ptr [[SOME_ANOTHER_ALLOCA_FULL]]) ; CHECK-NEXT: [[I3_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0 -; CHECK-NEXT: [[I3_FCA_0_LOAD:%.*]] = load i32, ptr [[I3_FCA_0_GEP]], align 4 -; CHECK-NEXT: [[I3_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I3_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[I3_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0 ; CHECK-NEXT: [[I3_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1 -; CHECK-NEXT: [[I3_FCA_1_LOAD:%.*]] = load i32, ptr [[I3_FCA_1_GEP]], align 4 -; CHECK-NEXT: [[I3_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I3_FCA_0_INSERT]], i32 [[I3_FCA_1_LOAD]], 1 +; CHECK-NEXT: [[I3_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I3_FCA_0_INSERT]], i32 [[RDX_INC]], 1 ; CHECK-NEXT: ret [2 x i32] [[I3_FCA_1_INSERT]] ; entry: @@ -851,8 +838,7 @@ define i8 
@dont_transform_load_only() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 ; CHECK-NEXT: call void @byte_user_of_alloca(ptr [[A]]) -; CHECK-NEXT: [[R:%.*]] = load i8, ptr [[A]], align 1 -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 undef ; entry: %a = alloca i8 @@ -866,8 +852,7 @@ define i8 @transform_load_and_store() { ; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 ; CHECK-NEXT: store i8 0, ptr [[A]], align 1 ; CHECK-NEXT: call void @byte_user_of_alloca(ptr [[A]]) -; CHECK-NEXT: [[R:%.*]] = load i8, ptr [[A]], align 1 -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 0 ; entry: %a = alloca i8 diff --git a/llvm/test/Transforms/SROA/readonlynocapture.ll b/llvm/test/Transforms/SROA/readonlynocapture.ll index 2d02996d806ed..2284a00126678 100644 --- a/llvm/test/Transforms/SROA/readonlynocapture.ll +++ b/llvm/test/Transforms/SROA/readonlynocapture.ll @@ -8,8 +8,7 @@ define i32 @simple() { ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, ptr [[A]], align 4 ; CHECK-NEXT: call void @callee(ptr [[A]]) -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: ret i32 [[L1]] +; CHECK-NEXT: ret i32 0 ; %a = alloca i32 store i32 0, ptr %a @@ -40,9 +39,7 @@ define i32 @twoalloc() { ; CHECK-NEXT: [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1 ; CHECK-NEXT: store i32 1, ptr [[B]], align 4 ; CHECK-NEXT: call void @callee(ptr [[A]]) -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4 -; CHECK-NEXT: [[R:%.*]] = add i32 [[L1]], [[L2]] +; CHECK-NEXT: [[R:%.*]] = add i32 0, 1 ; CHECK-NEXT: ret i32 [[R]] ; %a = alloca {i32, i32} @@ -62,8 +59,7 @@ define i32 @twostore() { ; CHECK-NEXT: store i32 1, ptr [[A]], align 4 ; CHECK-NEXT: call void @callee(ptr [[A]]) ; CHECK-NEXT: store i32 2, ptr [[A]], align 4 -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: ret i32 [[L]] +; CHECK-NEXT: ret i32 2 ; %a = alloca i32 store i32 1, ptr %a @@ -116,10 +112,8 
@@ define i32 @twocalls() { ; CHECK-NEXT: [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1 ; CHECK-NEXT: store i32 1, ptr [[B]], align 4 ; CHECK-NEXT: call void @callee(ptr [[A]]) -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: call void @callee(ptr [[A]]) -; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4 -; CHECK-NEXT: [[R:%.*]] = add i32 [[L1]], [[L2]] +; CHECK-NEXT: [[R:%.*]] = add i32 0, 1 ; CHECK-NEXT: ret i32 [[R]] ; %a = alloca {i32, i32} @@ -165,8 +159,7 @@ define i32 @atomic() { ; CHECK-NEXT: store i32 1, ptr [[B]], align 4 ; CHECK-NEXT: call void @callee(ptr [[A]]) ; CHECK-NEXT: [[L1:%.*]] = load atomic i32, ptr [[A]] seq_cst, align 4 -; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4 -; CHECK-NEXT: [[R:%.*]] = add i32 [[L1]], [[L2]] +; CHECK-NEXT: [[R:%.*]] = add i32 [[L1]], 1 ; CHECK-NEXT: ret i32 [[R]] ; %a = alloca {i32, i32} @@ -184,12 +177,10 @@ define i32 @notdominating() { ; CHECK-LABEL: @notdominating( ; CHECK-NEXT: [[A:%.*]] = alloca { i32, i32 }, align 8 ; CHECK-NEXT: [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1 -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4 ; CHECK-NEXT: store i32 0, ptr [[A]], align 4 ; CHECK-NEXT: store i32 1, ptr [[B]], align 4 ; CHECK-NEXT: call void @callee(ptr [[A]]) -; CHECK-NEXT: [[R:%.*]] = add i32 [[L1]], [[L2]] +; CHECK-NEXT: [[R:%.*]] = add i32 undef, undef ; CHECK-NEXT: ret i32 [[R]] ; %a = alloca {i32, i32} @@ -235,9 +226,7 @@ define i32 @multiuse() { ; CHECK-NEXT: [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1 ; CHECK-NEXT: store i32 1, ptr [[B]], align 4 ; CHECK-NEXT: call void @callee_multiuse(ptr [[A]], ptr [[A]]) -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[B]], align 4 -; CHECK-NEXT: [[R:%.*]] = add i32 [[L1]], [[L2]] +; CHECK-NEXT: [[R:%.*]] = add i32 0, 1 ; CHECK-NEXT: ret i32 [[R]] ; %a = alloca {i32, i32} @@ -296,8 +285,7 @@ define void 
@incompletestruct(i1 %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LII:%.*]] = alloca [[STRUCT_LOADIMMEDIATEINFO:%.*]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[LII]]) -; CHECK-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[LII]], align 4 -; CHECK-NEXT: [[BF_CLEAR4:%.*]] = and i32 [[BF_LOAD]], -262144 +; CHECK-NEXT: [[BF_CLEAR4:%.*]] = and i32 undef, -262144 ; CHECK-NEXT: [[BF_SET5:%.*]] = select i1 [[B:%.*]], i32 196608, i32 131072 ; CHECK-NEXT: [[BF_SET12:%.*]] = or disjoint i32 [[BF_SET5]], [[BF_CLEAR4]] ; CHECK-NEXT: store i32 [[BF_SET12]], ptr [[LII]], align 4 @@ -325,8 +313,7 @@ define void @incompletestruct_bb(i1 %b, i1 %c) { ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[LII]]) -; CHECK-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[LII]], align 4 -; CHECK-NEXT: [[BF_CLEAR4:%.*]] = and i32 [[BF_LOAD]], -262144 +; CHECK-NEXT: [[BF_CLEAR4:%.*]] = and i32 undef, -262144 ; CHECK-NEXT: [[BF_SET5:%.*]] = select i1 [[B:%.*]], i32 196608, i32 131072 ; CHECK-NEXT: [[BF_SET12:%.*]] = or disjoint i32 [[BF_SET5]], [[BF_CLEAR4]] ; CHECK-NEXT: store i32 [[BF_SET12]], ptr [[LII]], align 4 diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll new file mode 100644 index 0000000000000..e46fc730fb5b8 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll @@ -0,0 +1,293 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -passes=vector-combine -S 
-mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define i32 @movmsk_i32_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { +; SSE-LABEL: @movmsk_i32_v32i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <32 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <32 x i1> [[TMP2]] to i32 +; SSE-NEXT: ret i32 [[OR]] +; +; AVX-LABEL: @movmsk_i32_v32i8_v16i8( +; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> +; AVX-NEXT: [[OR:%.*]] = bitcast <32 x i1> [[TMP1]] to i32 +; AVX-NEXT: ret i32 [[OR]] +; + %c0 = icmp slt <16 x i8> %v0, zeroinitializer + %c1 = icmp slt <16 x i8> %v1, zeroinitializer + %b0 = bitcast <16 x i1> %c0 to i16 + %b1 = bitcast <16 x i1> %c1 to i16 + %z0 = zext i16 %b0 to i32 + %z1 = zext i16 %b1 to i32 + %s0 = shl nuw i32 %z0, 16 + %or = or disjoint i32 %s0, %z1 + ret i32 %or +} + +define i32 @movmsk_i32_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { +; SSE-LABEL: @movmsk_i32_v8i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8 +; SSE-NEXT: [[OR:%.*]] = zext i8 [[TMP3]] to i32 +; SSE-NEXT: ret i32 [[OR]] +; +; AVX-LABEL: @movmsk_i32_v8i32_v4i32( +; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; AVX-NEXT: [[OR:%.*]] = zext i8 [[TMP2]] to i32 +; AVX-NEXT: ret i32 [[OR]] +; + %c0 = icmp slt <4 x i32> %v0, zeroinitializer + 
%c1 = icmp slt <4 x i32> %v1, zeroinitializer + %b0 = bitcast <4 x i1> %c0 to i4 + %b1 = bitcast <4 x i1> %c1 to i4 + %z0 = zext i4 %b0 to i32 + %z1 = zext i4 %b1 to i32 + %s0 = shl nuw i32 %z0, 4 + %or = or disjoint i32 %s0, %z1 + ret i32 %or +} + +define i64 @movmsk_i64_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { +; SSE-LABEL: @movmsk_i64_v32i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <32 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32 +; SSE-NEXT: [[OR:%.*]] = zext i32 [[TMP3]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v32i8_v16i8( +; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> +; AVX-NEXT: [[TMP2:%.*]] = bitcast <32 x i1> [[TMP1]] to i32 +; AVX-NEXT: [[OR:%.*]] = zext i32 [[TMP2]] to i64 +; AVX-NEXT: ret i64 [[OR]] +; + %c0 = icmp slt <16 x i8> %v0, zeroinitializer + %c1 = icmp slt <16 x i8> %v1, zeroinitializer + %b0 = bitcast <16 x i1> %c0 to i16 + %b1 = bitcast <16 x i1> %c1 to i16 + %z0 = zext i16 %b0 to i64 + %z1 = zext i16 %b1 to i64 + %s0 = shl nuw i64 %z0, 16 + %or = or disjoint i64 %s0, %z1 + ret i64 %or +} + +define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { +; SSE-LABEL: @movmsk_i64_v8i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8 +; SSE-NEXT: [[OR:%.*]] = zext i8 [[TMP3]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v8i32_v4i32( +; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; 
AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; AVX-NEXT: [[OR:%.*]] = zext i8 [[TMP2]] to i64 +; AVX-NEXT: ret i64 [[OR]] +; + %c0 = icmp slt <4 x i32> %v0, zeroinitializer + %c1 = icmp slt <4 x i32> %v1, zeroinitializer + %b0 = bitcast <4 x i1> %c0 to i4 + %b1 = bitcast <4 x i1> %c1 to i4 + %z0 = zext i4 %b0 to i64 + %z1 = zext i4 %b1 to i64 + %s0 = shl nuw i64 %z0, 4 + %or = or disjoint i64 %s0, %z1 + ret i64 %or +} + +define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { +; SSE-LABEL: @movmsk_i64_v64i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v64i8_v16i8( +; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[C2:%.*]] = icmp slt <16 x i8> [[V2:%.*]], zeroinitializer +; AVX-NEXT: [[C3:%.*]] = icmp slt <16 x i8> [[V3:%.*]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C3]], <16 x i1> [[C2]], <32 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> [[TMP2]], <64 x i32> +; AVX-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 +; AVX-NEXT: ret i64 [[OR]] +; + %c0 = icmp slt <16 x i8> %v0, zeroinitializer + %c1 = icmp slt <16 x i8> %v1, zeroinitializer + %c2 = icmp slt <16 x i8> %v2, zeroinitializer + %c3 = icmp slt <16 x i8> %v3, zeroinitializer + 
%b0 = bitcast <16 x i1> %c0 to i16 + %b1 = bitcast <16 x i1> %c1 to i16 + %b2 = bitcast <16 x i1> %c2 to i16 + %b3 = bitcast <16 x i1> %c3 to i16 + %z0 = zext i16 %b0 to i64 + %z1 = zext i16 %b1 to i64 + %z2 = zext i16 %b2 to i64 + %z3 = zext i16 %b3 to i64 + %s0 = shl nuw i64 %z0, 48 + %s1 = shl nuw i64 %z1, 32 + %s2 = shl nuw i64 %z2, 16 + %or0 = or disjoint i64 %s0, %s1 + %or1 = or disjoint i64 %s2, %z3 + %or = or disjoint i64 %or0, %or1 + ret i64 %or +} + +define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; SSE-LABEL: @movmsk_i64_v32i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v32i32_v4i32( +; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[C2:%.*]] = icmp slt <4 x i32> [[V2:%.*]], zeroinitializer +; AVX-NEXT: [[C3:%.*]] = icmp slt <4 x i32> [[V3:%.*]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C3]], <4 x i1> [[C2]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <16 x i32> +; AVX-NEXT: [[TMP4:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; AVX-NEXT: [[OR:%.*]] = zext i16 [[TMP4]] to i64 +; AVX-NEXT: ret i64 [[OR]] +; + %c0 = icmp slt <4 x i32> %v0, zeroinitializer + %c1 = icmp slt <4 x i32> %v1, zeroinitializer + %c2 = icmp slt <4 x i32> %v2, zeroinitializer + %c3 = 
icmp slt <4 x i32> %v3, zeroinitializer + %b0 = bitcast <4 x i1> %c0 to i4 + %b1 = bitcast <4 x i1> %c1 to i4 + %b2 = bitcast <4 x i1> %c2 to i4 + %b3 = bitcast <4 x i1> %c3 to i4 + %z0 = zext i4 %b0 to i64 + %z1 = zext i4 %b1 to i64 + %z2 = zext i4 %b2 to i64 + %z3 = zext i4 %b3 to i64 + %s0 = shl nuw i64 %z0, 12 + %s1 = shl nuw i64 %z1, 8 + %s2 = shl nuw i64 %z2, 4 + %or0 = or disjoint i64 %s0, %s1 + %or1 = or disjoint i64 %s2, %z3 + %or = or disjoint i64 %or0, %or1 + ret i64 %or +} + +define i64 @movmsk_i64_v64i8_v32i8(<32 x i8> %v0, <32 x i8> %v1) { +; SSE-LABEL: @movmsk_i64_v64i8_v32i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[V1:%.*]], <32 x i8> [[V0:%.*]], <64 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP2]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @movmsk_i64_v64i8_v32i8( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[V1:%.*]], <32 x i8> [[V0:%.*]], <64 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[TMP1]], zeroinitializer +; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP2]] to i64 +; AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @movmsk_i64_v64i8_v32i8( +; AVX512-NEXT: [[C0:%.*]] = icmp slt <32 x i8> [[V0:%.*]], zeroinitializer +; AVX512-NEXT: [[C1:%.*]] = icmp slt <32 x i8> [[V1:%.*]], zeroinitializer +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[C1]], <32 x i1> [[C0]], <64 x i32> +; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP1]] to i64 +; AVX512-NEXT: ret i64 [[OR]] +; + %c0 = icmp slt <32 x i8> %v0, zeroinitializer + %c1 = icmp slt <32 x i8> %v1, zeroinitializer + %b0 = bitcast <32 x i1> %c0 to i32 + %b1 = bitcast <32 x i1> %c1 to i32 + %z0 = zext i32 %b0 to i64 + %z1 = zext i32 %b1 to i64 + %s0 = shl nuw i64 %z0, 32 + %or = or disjoint i64 %s0, %z1 + ret i64 %or +} + +define i32 @movmsk_i32_v16i32_v8i32(<8 x i32> %v0, <8 x i32> %v1) { +; SSE-LABEL: @movmsk_i32_v16i32_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = 
shufflevector <8 x i32> [[V1:%.*]], <8 x i32> [[V0:%.*]], <16 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <16 x i1> [[TMP2]] to i16 +; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP3]] to i32 +; SSE-NEXT: ret i32 [[OR]] +; +; AVX2-LABEL: @movmsk_i32_v16i32_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> [[V0:%.*]], <16 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[TMP1]], zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = bitcast <16 x i1> [[TMP2]] to i16 +; AVX2-NEXT: [[OR:%.*]] = zext i16 [[TMP3]] to i32 +; AVX2-NEXT: ret i32 [[OR]] +; +; AVX512-LABEL: @movmsk_i32_v16i32_v8i32( +; AVX512-NEXT: [[C0:%.*]] = icmp slt <8 x i32> [[V0:%.*]], zeroinitializer +; AVX512-NEXT: [[C1:%.*]] = icmp slt <8 x i32> [[V1:%.*]], zeroinitializer +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[C1]], <8 x i1> [[C0]], <16 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = bitcast <16 x i1> [[TMP1]] to i16 +; AVX512-NEXT: [[OR:%.*]] = zext i16 [[TMP2]] to i32 +; AVX512-NEXT: ret i32 [[OR]] +; + %c0 = icmp slt <8 x i32> %v0, zeroinitializer + %c1 = icmp slt <8 x i32> %v1, zeroinitializer + %b0 = bitcast <8 x i1> %c0 to i8 + %b1 = bitcast <8 x i1> %c1 to i8 + %z0 = zext i8 %b0 to i32 + %z1 = zext i8 %b1 to i32 + %s0 = shl nuw i32 %z0, 8 + %or = or disjoint i32 %s0, %z1 + ret i32 %or +} + +define i64 @PR111431(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { +; SSE-LABEL: @PR111431( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> [[A0]], <64 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[A2:%.*]], <32 x i8> [[A1:%.*]], <64 x i32> +; SSE-NEXT: [[TMP3:%.*]] = icmp eq <64 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @PR111431( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> [[A0]], <64 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> 
[[A2:%.*]], <32 x i8> [[A1:%.*]], <64 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = icmp eq <64 x i8> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 +; AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @PR111431( +; AVX512-NEXT: [[C01:%.*]] = icmp eq <32 x i8> [[A0:%.*]], [[A1:%.*]] +; AVX512-NEXT: [[C02:%.*]] = icmp eq <32 x i8> [[A0]], [[A2:%.*]] +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[C02]], <32 x i1> [[C01]], <64 x i32> +; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP1]] to i64 +; AVX512-NEXT: ret i64 [[OR]] +; + %c01 = icmp eq <32 x i8> %a0, %a1 + %c02 = icmp eq <32 x i8> %a0, %a2 + %b01 = bitcast <32 x i1> %c01 to i32 + %b02 = bitcast <32 x i1> %c02 to i32 + %z01 = zext i32 %b01 to i64 + %z02 = zext i32 %b02 to i64 + %shl = shl nuw i64 %z01, 32 + %or = or disjoint i64 %shl, %z02 + ret i64 %or +} diff --git a/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir b/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir index f75db964e2d6e..07f2d350ffd9c 100644 --- a/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir +++ b/llvm/test/tools/llc/new-pm/regalloc-amdgpu.mir @@ -1,8 +1,10 @@ # REQUIRES: amdgpu-registered-target -# RUN: llc -mtriple=amdgcn --passes='regallocfast' --print-pipeline-passes --filetype=null %s | FileCheck %s --check-prefix=PASS +# RUN: llc -mtriple=amdgcn --passes='regallocfast,regallocfast,regallocfast' --print-pipeline-passes --filetype=null %s | FileCheck %s --check-prefix=PASS # RUN: not llc -mtriple=amdgcn --passes='regallocfast' --print-pipeline-passes --filetype=null %s 2>&1 | FileCheck %s --check-prefix=BAD-FILTER # PASS: regallocfast +# PASS: regallocfast +# PASS: regallocfast # BAD-FILTER: invalid regallocfast register filter 'bad-filter' --- diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt index c9bc58f45f08c..07568ad0c64e3 100644 --- a/llvm/unittests/ADT/CMakeLists.txt +++ b/llvm/unittests/ADT/CMakeLists.txt @@ -86,6 +86,7 @@ add_llvm_unittest(ADTTests StringRefTest.cpp 
StringSetTest.cpp StringSwitchTest.cpp + StringTableTest.cpp TinyPtrVectorTest.cpp TrieRawHashMapTest.cpp TwineTest.cpp diff --git a/llvm/unittests/ADT/StringTableTest.cpp b/llvm/unittests/ADT/StringTableTest.cpp new file mode 100644 index 0000000000000..0fc4ba7b50b80 --- /dev/null +++ b/llvm/unittests/ADT/StringTableTest.cpp @@ -0,0 +1,41 @@ +//===- llvm/unittest/ADT/StringTableTest.cpp - StringTable tests ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringTable.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; + +namespace { + +using ::testing::Eq; +using ::testing::StrEq; + +TEST(StringTableTest, Basic) { + static constexpr char InputTable[] = "\0test\0"; + constexpr StringTable T = InputTable; + + // We support some limited constexpr operations, check those first. + static_assert(T.size() == sizeof(InputTable)); + static_assert(T[0].empty()); + static_assert(T[StringTable::Offset()].empty()); + static_assert(T[1].size() == 4); + + // And use normal Google Test runtime assertions to check the contents and + // give more complete error messages. + EXPECT_THAT(T[0], Eq("")); + EXPECT_THAT(T[StringTable::Offset()], Eq("")); + EXPECT_THAT(T[1], Eq("test")); + + // Also check that this is a valid C-string. 
+ EXPECT_THAT(T[1].data(), StrEq("test")); +} + +} // anonymous namespace diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp index 8f195c4f60326..d218de225c362 100644 --- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp +++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp @@ -748,8 +748,7 @@ TEST_F(OpenMPDecompositionTest, Allocate3) { // Allocate + linear omp::List Clauses{ {OMPC_allocate, omp::clause::Allocate{{std::nullopt, std::nullopt, {x}}}}, - {OMPC_linear, - omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_linear, omp::clause::Linear{{std::nullopt, std::nullopt, {x}}}}, }; omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_for, @@ -761,7 +760,7 @@ TEST_F(OpenMPDecompositionTest, Allocate3) { // The "shared" clause is duplicated---this isn't harmful, but it // should be fixed eventually. ASSERT_EQ(Dir0, "parallel shared(x) shared(x)"); // (33) - ASSERT_EQ(Dir1, "for linear(, , , (x)) firstprivate(x) lastprivate(, (x)) " + ASSERT_EQ(Dir1, "for linear(, , (x)) firstprivate(x) lastprivate(, (x)) " "allocate(, , (x))"); // (33) } @@ -1059,8 +1058,7 @@ TEST_F(OpenMPDecompositionTest, Linear1) { omp::Object x{"x"}; omp::List Clauses{ - {OMPC_linear, - omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_linear, omp::clause::Linear{{std::nullopt, std::nullopt, {x}}}}, }; omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_for_simd, Clauses); @@ -1068,7 +1066,7 @@ TEST_F(OpenMPDecompositionTest, Linear1) { std::string Dir0 = stringify(Dec.output[0]); std::string Dir1 = stringify(Dec.output[1]); ASSERT_EQ(Dir0, "for firstprivate(x) lastprivate(, (x))"); // (15.1), (15.2) - ASSERT_EQ(Dir1, "simd linear(, , , (x)) lastprivate(, (x))"); // (15.1) + ASSERT_EQ(Dir1, "simd linear(, , (x)) lastprivate(, (x))"); // (15.1) } // NOWAIT @@ -1102,13 +1100,12 @@ TEST_F(OpenMPDecompositionTest, Nowait1) { 
TEST_F(OpenMPDecompositionTest, Misc1) { omp::Object x{"x"}; omp::List Clauses{ - {OMPC_linear, - omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_linear, omp::clause::Linear{{std::nullopt, std::nullopt, {x}}}}, }; omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_simd, Clauses); ASSERT_EQ(Dec.output.size(), 1u); std::string Dir0 = stringify(Dec.output[0]); - ASSERT_EQ(Dir0, "simd linear(, , , (x)) lastprivate(, (x))"); + ASSERT_EQ(Dir0, "simd linear(, , (x)) lastprivate(, (x))"); } } // namespace diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 630cd03c68801..d7ac108249118 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6358,7 +6358,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { auto *Load2 = Load1->getNextNode(); EXPECT_TRUE(isa(Load2)); - auto *Value1 = Load2->getNextNode(); + auto *OutlinedBlockBr = Load2->getNextNode(); + EXPECT_TRUE(isa(OutlinedBlockBr)); + + auto *OutlinedBlock = OutlinedBlockBr->getSuccessor(0); + EXPECT_EQ(OutlinedBlock->getName(), "outlined.body"); + + auto *Value1 = OutlinedBlock->getFirstNonPHI(); EXPECT_EQ(Value1, Value); EXPECT_EQ(Value1->getNextNode(), TargetStore); auto *Deinit = TargetStore->getNextNode(); @@ -6510,7 +6516,14 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { EXPECT_EQ(UserCodeBlock->getName(), "user_code.entry"); auto *Load1 = UserCodeBlock->getFirstNonPHI(); EXPECT_TRUE(isa(Load1)); - auto *Load2 = Load1->getNextNode(); + + auto *OutlinedBlockBr = Load1->getNextNode(); + EXPECT_TRUE(isa(OutlinedBlockBr)); + + auto *OutlinedBlock = OutlinedBlockBr->getSuccessor(0); + EXPECT_EQ(OutlinedBlock->getName(), "outlined.body"); + + auto *Load2 = OutlinedBlock->getFirstNonPHI(); EXPECT_TRUE(isa(Load2)); EXPECT_EQ(Load2, Value); EXPECT_EQ(Load2->getNextNode(), TargetStore); diff --git a/llvm/unittests/IR/IRBuilderTest.cpp 
b/llvm/unittests/IR/IRBuilderTest.cpp index 690af62d18020..2fd52860e71b9 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -881,12 +881,12 @@ TEST_F(IRBuilderTest, DIBuilder) { auto ExpectOrder = [&](DbgInstPtr First, BasicBlock::iterator Second) { if (M->IsNewDbgInfoFormat) { - EXPECT_TRUE(First.is()); + EXPECT_TRUE(isa(First)); EXPECT_FALSE(Second->getDbgRecordRange().empty()); - EXPECT_EQ(GetLastDbgRecord(&*Second), First.get()); + EXPECT_EQ(GetLastDbgRecord(&*Second), cast(First)); } else { - EXPECT_TRUE(First.is()); - EXPECT_EQ(&*std::prev(Second), First.get()); + EXPECT_TRUE(isa(First)); + EXPECT_EQ(&*std::prev(Second), cast(First)); } }; diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index 8b700673814c3..af801d0ff5e1e 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -411,10 +411,10 @@ TEST(MemProf, BaseMemProfReader) { /*Column=*/5, /*IsInlineFrame=*/true); Frame F2(/*Hash=*/IndexedMemProfRecord::getGUID("bar"), /*LineOffset=*/10, /*Column=*/2, /*IsInlineFrame=*/false); - MemProfData.addFrame(F1); - MemProfData.addFrame(F2); + auto F1Id = MemProfData.addFrame(F1); + auto F2Id = MemProfData.addFrame(F2); - llvm::SmallVector CallStack{F1.hash(), F2.hash()}; + llvm::SmallVector CallStack{F1Id, F2Id}; CallStackId CSId = MemProfData.addCallStack(std::move(CallStack)); IndexedMemProfRecord FakeRecord; @@ -443,19 +443,17 @@ TEST(MemProf, BaseMemProfReaderWithCSIdMap) { /*Column=*/5, /*IsInlineFrame=*/true); Frame F2(/*Hash=*/IndexedMemProfRecord::getGUID("bar"), /*LineOffset=*/10, /*Column=*/2, /*IsInlineFrame=*/false); - MemProfData.addFrame(F1); - MemProfData.addFrame(F2); + auto F1Id = MemProfData.addFrame(F1); + auto F2Id = MemProfData.addFrame(F2); - llvm::SmallVector CallStack = {F1.hash(), F2.hash()}; - MemProfData.addCallStack(CallStack); + llvm::SmallVector CallStack = {F1Id, F2Id}; + auto CSId = 
MemProfData.addCallStack(std::move(CallStack)); IndexedMemProfRecord FakeRecord; MemInfoBlock Block; Block.AllocCount = 1U, Block.TotalAccessDensity = 4, Block.TotalLifetime = 200001; - FakeRecord.AllocSites.emplace_back( - /*CSId=*/hashCallStack(CallStack), - /*MB=*/Block); + FakeRecord.AllocSites.emplace_back(/*CSId=*/CSId, /*MB=*/Block); MemProfData.Records.insert({F1.hash(), FakeRecord}); MemProfReader Reader(std::move(MemProfData)); @@ -480,28 +478,28 @@ TEST(MemProf, IndexedMemProfRecordToMemProfRecord) { Frame F2(2, 0, 0, false); Frame F3(3, 0, 0, false); Frame F4(4, 0, 0, false); - MemProfData.addFrame(F1); - MemProfData.addFrame(F2); - MemProfData.addFrame(F3); - MemProfData.addFrame(F4); - - llvm::SmallVector CS1 = {F1.hash(), F2.hash()}; - llvm::SmallVector CS2 = {F1.hash(), F3.hash()}; - llvm::SmallVector CS3 = {F2.hash(), F3.hash()}; - llvm::SmallVector CS4 = {F2.hash(), F4.hash()}; - MemProfData.addCallStack(CS1); - MemProfData.addCallStack(CS2); - MemProfData.addCallStack(CS3); - MemProfData.addCallStack(CS4); + auto F1Id = MemProfData.addFrame(F1); + auto F2Id = MemProfData.addFrame(F2); + auto F3Id = MemProfData.addFrame(F3); + auto F4Id = MemProfData.addFrame(F4); + + llvm::SmallVector CS1 = {F1Id, F2Id}; + llvm::SmallVector CS2 = {F1Id, F3Id}; + llvm::SmallVector CS3 = {F2Id, F3Id}; + llvm::SmallVector CS4 = {F2Id, F4Id}; + auto CS1Id = MemProfData.addCallStack(std::move(CS1)); + auto CS2Id = MemProfData.addCallStack(std::move(CS2)); + auto CS3Id = MemProfData.addCallStack(std::move(CS3)); + auto CS4Id = MemProfData.addCallStack(std::move(CS4)); IndexedMemProfRecord IndexedRecord; IndexedAllocationInfo AI; - AI.CSId = hashCallStack(CS1); + AI.CSId = CS1Id; IndexedRecord.AllocSites.push_back(AI); - AI.CSId = hashCallStack(CS2); + AI.CSId = CS2Id; IndexedRecord.AllocSites.push_back(AI); - IndexedRecord.CallSiteIds.push_back(hashCallStack(CS3)); - IndexedRecord.CallSiteIds.push_back(hashCallStack(CS4)); + IndexedRecord.CallSiteIds.push_back(CS3Id); 
+ IndexedRecord.CallSiteIds.push_back(CS4Id); FrameIdConverter FrameIdConv( MemProfData.Frames); @@ -556,14 +554,12 @@ TEST(MemProf, MissingCallStackId) { } TEST(MemProf, MissingFrameId) { - IndexedAllocationInfo AI(0x222, makePartialMIB(), getHotColdSchema()); - - IndexedMemProfRecord IndexedMR; - IndexedMR.AllocSites.push_back(AI); - // An empty Frame map to trigger a mapping error. IndexedMemProfData MemProfData; - MemProfData.CallStacks.insert({0x222, {2, 3}}); + auto CSId = MemProfData.addCallStack(SmallVector{2, 3}); + + IndexedMemProfRecord IndexedMR; + IndexedMR.AllocSites.emplace_back(CSId, makePartialMIB(), getHotColdSchema()); FrameIdConverter FrameIdConv( MemProfData.Frames); @@ -581,11 +577,11 @@ TEST(MemProf, MissingFrameId) { // Verify CallStackRadixTreeBuilder can handle empty inputs. TEST(MemProf, RadixTreeBuilderEmpty) { llvm::DenseMap MemProfFrameIndexes; - llvm::MapVector> MemProfCallStackData; + IndexedMemProfData MemProfData; llvm::DenseMap FrameHistogram = - computeFrameHistogram(MemProfCallStackData); + computeFrameHistogram(MemProfData.CallStacks); CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes, + Builder.build(std::move(MemProfData.CallStacks), &MemProfFrameIndexes, FrameHistogram); ASSERT_THAT(Builder.getRadixArray(), IsEmpty()); const auto Mappings = Builder.takeCallStackPos(); @@ -597,12 +593,12 @@ TEST(MemProf, RadixTreeBuilderOne) { llvm::DenseMap MemProfFrameIndexes = { {11, 1}, {12, 2}, {13, 3}}; llvm::SmallVector CS1 = {13, 12, 11}; - llvm::MapVector> MemProfCallStackData; - MemProfCallStackData.insert({hashCallStack(CS1), CS1}); + IndexedMemProfData MemProfData; + auto CS1Id = MemProfData.addCallStack(std::move(CS1)); llvm::DenseMap FrameHistogram = - computeFrameHistogram(MemProfCallStackData); + computeFrameHistogram(MemProfData.CallStacks); CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes, + 
Builder.build(std::move(MemProfData.CallStacks), &MemProfFrameIndexes, FrameHistogram); EXPECT_THAT(Builder.getRadixArray(), ElementsAre(3U, // Size of CS1, @@ -611,7 +607,7 @@ TEST(MemProf, RadixTreeBuilderOne) { 1U // MemProfFrameIndexes[11] )); const auto Mappings = Builder.takeCallStackPos(); - EXPECT_THAT(Mappings, UnorderedElementsAre(Pair(hashCallStack(CS1), 0U))); + EXPECT_THAT(Mappings, UnorderedElementsAre(Pair(CS1Id, 0U))); } // Verify CallStackRadixTreeBuilder can form a link between two call stacks. @@ -620,13 +616,13 @@ TEST(MemProf, RadixTreeBuilderTwo) { {11, 1}, {12, 2}, {13, 3}}; llvm::SmallVector CS1 = {12, 11}; llvm::SmallVector CS2 = {13, 12, 11}; - llvm::MapVector> MemProfCallStackData; - MemProfCallStackData.insert({hashCallStack(CS1), CS1}); - MemProfCallStackData.insert({hashCallStack(CS2), CS2}); + IndexedMemProfData MemProfData; + auto CS1Id = MemProfData.addCallStack(std::move(CS1)); + auto CS2Id = MemProfData.addCallStack(std::move(CS2)); llvm::DenseMap FrameHistogram = - computeFrameHistogram(MemProfCallStackData); + computeFrameHistogram(MemProfData.CallStacks); CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes, + Builder.build(std::move(MemProfData.CallStacks), &MemProfFrameIndexes, FrameHistogram); EXPECT_THAT(Builder.getRadixArray(), ElementsAre(2U, // Size of CS1 @@ -637,8 +633,7 @@ TEST(MemProf, RadixTreeBuilderTwo) { 1U // MemProfFrameIndexes[11] )); const auto Mappings = Builder.takeCallStackPos(); - EXPECT_THAT(Mappings, UnorderedElementsAre(Pair(hashCallStack(CS1), 0U), - Pair(hashCallStack(CS2), 2U))); + EXPECT_THAT(Mappings, UnorderedElementsAre(Pair(CS1Id, 0U), Pair(CS2Id, 2U))); } // Verify CallStackRadixTreeBuilder can form a jump to a prefix that itself has @@ -651,15 +646,15 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) { llvm::SmallVector CS2 = {15, 13, 12, 11}; llvm::SmallVector CS3 = {17, 16, 12, 11}; llvm::SmallVector CS4 = {18, 16, 12, 11}; - 
llvm::MapVector> MemProfCallStackData; - MemProfCallStackData.insert({hashCallStack(CS1), CS1}); - MemProfCallStackData.insert({hashCallStack(CS2), CS2}); - MemProfCallStackData.insert({hashCallStack(CS3), CS3}); - MemProfCallStackData.insert({hashCallStack(CS4), CS4}); + IndexedMemProfData MemProfData; + auto CS1Id = MemProfData.addCallStack(std::move(CS1)); + auto CS2Id = MemProfData.addCallStack(std::move(CS2)); + auto CS3Id = MemProfData.addCallStack(std::move(CS3)); + auto CS4Id = MemProfData.addCallStack(std::move(CS4)); llvm::DenseMap FrameHistogram = - computeFrameHistogram(MemProfCallStackData); + computeFrameHistogram(MemProfData.CallStacks); CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes, + Builder.build(std::move(MemProfData.CallStacks), &MemProfFrameIndexes, FrameHistogram); EXPECT_THAT(Builder.getRadixArray(), ElementsAre(4U, // Size of CS1 @@ -679,10 +674,9 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) { 1U // MemProfFrameIndexes[11] )); const auto Mappings = Builder.takeCallStackPos(); - EXPECT_THAT(Mappings, UnorderedElementsAre(Pair(hashCallStack(CS1), 0U), - Pair(hashCallStack(CS2), 3U), - Pair(hashCallStack(CS3), 7U), - Pair(hashCallStack(CS4), 10U))); + EXPECT_THAT(Mappings, + UnorderedElementsAre(Pair(CS1Id, 0U), Pair(CS2Id, 3U), + Pair(CS3Id, 7U), Pair(CS4Id, 10U))); } // Verify that we can parse YAML and retrieve IndexedMemProfData as expected. 
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index f9ff45f116603..cf06ec5670346 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1115,6 +1115,7 @@ Experimental extensions ssctr 1.0 svukte 0.3 xqcia 0.2 + xqcics 0.2 xqcicsr 0.2 xqcisls 0.2 diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 93277eed8be12..1b362d1d26bdd 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -107,8 +107,9 @@ edge [fontname=Courier, fontsize=30] compound=true N0 [label = "ir-bb\:\l" + - "No successors\l" + "Successor(s): vector.ph\l" ] + N0 -> N1 [ label=""] N1 [label = "vector.ph:\l" + "Successor(s): vector loop\l" diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 3179cfc676ab6..bc8bcc3447ea0 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -747,8 +747,9 @@ compound=true N0 [label = "preheader:\l" + " EMIT vp\<%1\> = add\l" + - "No successors\l" + "Successor(s): bb1\l" ] + N0 -> N1 [ label=""] N1 [label = "bb1:\l" + " EMIT vp\<%2\> = add\l" + @@ -840,7 +841,7 @@ vp<%1> = original trip-count preheader: EMIT vp<%1> = sub -No successors +Successor(s): bb1 bb1: EMIT vp<%2> = add @@ -864,7 +865,7 @@ vp<%1> = original trip-count preheader: EMIT vp<%1> = sub -No successors +Successor(s): bb1 bb1: EMIT vp<%2> = add @@ -888,7 +889,7 @@ vp<%1> = original trip-count preheader: EMIT vp<%1> = sub -No successors +Successor(s): bb1 bb1: EMIT vp<%2> = add diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index edb3f8a295294..bc4f3943447fd 100644 --- 
a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -227,7 +227,6 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - VPBB1->setParent(R1); auto TC = std::make_unique(); LLVMContext C; @@ -235,6 +234,7 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); + VPBB1->setParent(R1); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); diff --git a/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn index b1462e3ab8e1c..3541a7ae45291 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn @@ -94,6 +94,7 @@ unittest("ADTTests") { "StringRefTest.cpp", "StringSetTest.cpp", "StringSwitchTest.cpp", + "StringTableTest.cpp", "TinyPtrVectorTest.cpp", "TrieRawHashMapTest.cpp", "TwineTest.cpp", diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 82bfbe56f0839..0608eef15c5a4 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -153,6 +153,9 @@ set(MLIR_INSTALL_AGGREGATE_OBJECTS 1 CACHE BOOL set(MLIR_BUILD_MLIR_C_DYLIB 0 CACHE BOOL "Builds libMLIR-C shared library.") +set(MLIR_LINK_MLIR_DYLIB ${LLVM_LINK_LLVM_DYLIB} CACHE BOOL + "Link tools against libMLIR.so") + configure_file( ${MLIR_MAIN_INCLUDE_DIR}/mlir/Config/mlir-config.h.cmake ${MLIR_INCLUDE_DIR}/mlir/Config/mlir-config.h) diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake index a3324705c525c..e1e79593ec2cb 100644 --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -717,3 +717,23 @@ function(mlir_check_all_link_libraries name) endforeach() endif() 
endfunction(mlir_check_all_link_libraries) + +# Link target against a list of MLIR libraries. If MLIR_LINK_MLIR_DYLIB is +# enabled, this will link against the MLIR dylib instead of the static +# libraries. +# +# This function should be used instead of target_link_libraries() when linking +# MLIR libraries that are part of the MLIR dylib. For libraries that are not +# part of the dylib (like test libraries), target_link_libraries() should be +# used. +function(mlir_target_link_libraries target type) + if (TARGET obj.${target}) + target_link_libraries(obj.${target} ${ARGN}) + endif() + + if (MLIR_LINK_MLIR_DYLIB) + target_link_libraries(${target} ${type} MLIR) + else() + target_link_libraries(${target} ${type} ${ARGN}) + endif() +endfunction() diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index d722bd1f3e296..8835e0a9099fd 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -553,9 +553,9 @@ def ConvertGpuOpsToLLVMSPVOps : Pass<"convert-gpu-to-llvm-spv", "gpu::GPUModuleO "Generate LLVM operations to be ingested by a SPIR-V backend for gpu operations"; let dependentDialects = ["LLVM::LLVMDialect"]; let options = [ - Option<"indexBitwidth", "index-bitwidth", "unsigned", - /*default=kDeriveIndexBitwidthFromDataLayout*/"0", - "Bitwidth of the index type, 0 to use size of machine word">, + Option<"use64bitIndex", "use-64bit-index", + "bool", /*default=*/"false", + "Use 64-bit integers to convert index types">, ]; } diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 85b6a6638036f..89c7ed46ff500 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -135,6 +135,14 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove, RecipeInterface]> auto ®ion = getDeallocRegion(); return region.empty() ? 
nullptr : region.getArgument(0); } + + /// needsMap returns true if the value being privatized should additionally + /// be mapped to the target region using a MapInfoOp. This is most common + /// when an allocatable is privatized. In such cases, the descriptor is used + /// in privatization and needs to be mapped on to the device. + bool needsMap() { + return !getAllocMoldArg().use_empty(); + } }]; let hasRegionVerifier = 1; diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 807beebe4fb22..473b1da4f701c 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -696,177 +696,22 @@ struct MulOpConversion : public OpConversionPattern { auto elementType = cast(type.getElementType()); arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr(); auto fmfValue = fmf.getValue(); - Value lhsReal = b.create(elementType, adaptor.getLhs()); - Value lhsRealAbs = b.create(lhsReal, fmfValue); Value lhsImag = b.create(elementType, adaptor.getLhs()); - Value lhsImagAbs = b.create(lhsImag, fmfValue); Value rhsReal = b.create(elementType, adaptor.getRhs()); - Value rhsRealAbs = b.create(rhsReal, fmfValue); Value rhsImag = b.create(elementType, adaptor.getRhs()); - Value rhsImagAbs = b.create(rhsImag, fmfValue); - Value lhsRealTimesRhsReal = b.create(lhsReal, rhsReal, fmfValue); - Value lhsRealTimesRhsRealAbs = - b.create(lhsRealTimesRhsReal, fmfValue); Value lhsImagTimesRhsImag = b.create(lhsImag, rhsImag, fmfValue); - Value lhsImagTimesRhsImagAbs = - b.create(lhsImagTimesRhsImag, fmfValue); Value real = b.create(lhsRealTimesRhsReal, lhsImagTimesRhsImag, fmfValue); - Value lhsImagTimesRhsReal = b.create(lhsImag, rhsReal, fmfValue); - Value lhsImagTimesRhsRealAbs = - b.create(lhsImagTimesRhsReal, fmfValue); Value lhsRealTimesRhsImag = b.create(lhsReal, rhsImag, fmfValue); - Value lhsRealTimesRhsImagAbs = - 
b.create(lhsRealTimesRhsImag, fmfValue); Value imag = b.create(lhsImagTimesRhsReal, lhsRealTimesRhsImag, fmfValue); - - // Handle cases where the "naive" calculation results in NaN values. - Value realIsNan = - b.create(arith::CmpFPredicate::UNO, real, real); - Value imagIsNan = - b.create(arith::CmpFPredicate::UNO, imag, imag); - Value isNan = b.create(realIsNan, imagIsNan); - - Value inf = b.create( - elementType, - b.getFloatAttr(elementType, - APFloat::getInf(elementType.getFloatSemantics()))); - - // Case 1. `lhsReal` or `lhsImag` are infinite. - Value lhsRealIsInf = - b.create(arith::CmpFPredicate::OEQ, lhsRealAbs, inf); - Value lhsImagIsInf = - b.create(arith::CmpFPredicate::OEQ, lhsImagAbs, inf); - Value lhsIsInf = b.create(lhsRealIsInf, lhsImagIsInf); - Value rhsRealIsNan = - b.create(arith::CmpFPredicate::UNO, rhsReal, rhsReal); - Value rhsImagIsNan = - b.create(arith::CmpFPredicate::UNO, rhsImag, rhsImag); - Value zero = - b.create(elementType, b.getZeroAttr(elementType)); - Value one = b.create(elementType, - b.getFloatAttr(elementType, 1)); - Value lhsRealIsInfFloat = - b.create(lhsRealIsInf, one, zero); - lhsReal = b.create( - lhsIsInf, b.create(lhsRealIsInfFloat, lhsReal), - lhsReal); - Value lhsImagIsInfFloat = - b.create(lhsImagIsInf, one, zero); - lhsImag = b.create( - lhsIsInf, b.create(lhsImagIsInfFloat, lhsImag), - lhsImag); - Value lhsIsInfAndRhsRealIsNan = - b.create(lhsIsInf, rhsRealIsNan); - rhsReal = b.create( - lhsIsInfAndRhsRealIsNan, b.create(zero, rhsReal), - rhsReal); - Value lhsIsInfAndRhsImagIsNan = - b.create(lhsIsInf, rhsImagIsNan); - rhsImag = b.create( - lhsIsInfAndRhsImagIsNan, b.create(zero, rhsImag), - rhsImag); - - // Case 2. `rhsReal` or `rhsImag` are infinite. 
- Value rhsRealIsInf = - b.create(arith::CmpFPredicate::OEQ, rhsRealAbs, inf); - Value rhsImagIsInf = - b.create(arith::CmpFPredicate::OEQ, rhsImagAbs, inf); - Value rhsIsInf = b.create(rhsRealIsInf, rhsImagIsInf); - Value lhsRealIsNan = - b.create(arith::CmpFPredicate::UNO, lhsReal, lhsReal); - Value lhsImagIsNan = - b.create(arith::CmpFPredicate::UNO, lhsImag, lhsImag); - Value rhsRealIsInfFloat = - b.create(rhsRealIsInf, one, zero); - rhsReal = b.create( - rhsIsInf, b.create(rhsRealIsInfFloat, rhsReal), - rhsReal); - Value rhsImagIsInfFloat = - b.create(rhsImagIsInf, one, zero); - rhsImag = b.create( - rhsIsInf, b.create(rhsImagIsInfFloat, rhsImag), - rhsImag); - Value rhsIsInfAndLhsRealIsNan = - b.create(rhsIsInf, lhsRealIsNan); - lhsReal = b.create( - rhsIsInfAndLhsRealIsNan, b.create(zero, lhsReal), - lhsReal); - Value rhsIsInfAndLhsImagIsNan = - b.create(rhsIsInf, lhsImagIsNan); - lhsImag = b.create( - rhsIsInfAndLhsImagIsNan, b.create(zero, lhsImag), - lhsImag); - Value recalc = b.create(lhsIsInf, rhsIsInf); - - // Case 3. One of the pairwise products of left hand side with right hand - // side is infinite. 
- Value lhsRealTimesRhsRealIsInf = b.create( - arith::CmpFPredicate::OEQ, lhsRealTimesRhsRealAbs, inf); - Value lhsImagTimesRhsImagIsInf = b.create( - arith::CmpFPredicate::OEQ, lhsImagTimesRhsImagAbs, inf); - Value isSpecialCase = b.create(lhsRealTimesRhsRealIsInf, - lhsImagTimesRhsImagIsInf); - Value lhsRealTimesRhsImagIsInf = b.create( - arith::CmpFPredicate::OEQ, lhsRealTimesRhsImagAbs, inf); - isSpecialCase = - b.create(isSpecialCase, lhsRealTimesRhsImagIsInf); - Value lhsImagTimesRhsRealIsInf = b.create( - arith::CmpFPredicate::OEQ, lhsImagTimesRhsRealAbs, inf); - isSpecialCase = - b.create(isSpecialCase, lhsImagTimesRhsRealIsInf); - Type i1Type = b.getI1Type(); - Value notRecalc = b.create( - recalc, - b.create(i1Type, b.getIntegerAttr(i1Type, 1))); - isSpecialCase = b.create(isSpecialCase, notRecalc); - Value isSpecialCaseAndLhsRealIsNan = - b.create(isSpecialCase, lhsRealIsNan); - lhsReal = b.create( - isSpecialCaseAndLhsRealIsNan, b.create(zero, lhsReal), - lhsReal); - Value isSpecialCaseAndLhsImagIsNan = - b.create(isSpecialCase, lhsImagIsNan); - lhsImag = b.create( - isSpecialCaseAndLhsImagIsNan, b.create(zero, lhsImag), - lhsImag); - Value isSpecialCaseAndRhsRealIsNan = - b.create(isSpecialCase, rhsRealIsNan); - rhsReal = b.create( - isSpecialCaseAndRhsRealIsNan, b.create(zero, rhsReal), - rhsReal); - Value isSpecialCaseAndRhsImagIsNan = - b.create(isSpecialCase, rhsImagIsNan); - rhsImag = b.create( - isSpecialCaseAndRhsImagIsNan, b.create(zero, rhsImag), - rhsImag); - recalc = b.create(recalc, isSpecialCase); - recalc = b.create(isNan, recalc); - - // Recalculate real part. - lhsRealTimesRhsReal = b.create(lhsReal, rhsReal, fmfValue); - lhsImagTimesRhsImag = b.create(lhsImag, rhsImag, fmfValue); - Value newReal = b.create(lhsRealTimesRhsReal, - lhsImagTimesRhsImag, fmfValue); - real = b.create( - recalc, b.create(inf, newReal, fmfValue), real); - - // Recalculate imag part. 
- lhsImagTimesRhsReal = b.create(lhsImag, rhsReal, fmfValue); - lhsRealTimesRhsImag = b.create(lhsReal, rhsImag, fmfValue); - Value newImag = b.create(lhsImagTimesRhsReal, - lhsRealTimesRhsImag, fmfValue); - imag = b.create( - recalc, b.create(inf, newImag, fmfValue), imag); - rewriter.replaceOpWithNewOp(op, type, real, imag); return success(); } diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 03745f4537e99..a68c0153df443 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -409,9 +409,7 @@ struct GPUToLLVMSPVConversionPass final RewritePatternSet patterns(context); LowerToLLVMOptions options(context); - if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout) - options.overrideIndexBitwidth(indexBitwidth); - + options.overrideIndexBitwidth(this->use64bitIndex ? 64 : 32); LLVMTypeConverter converter(context, options); LLVMConversionTarget target(*context); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index 68c3d1cabb11c..052dee402b79e 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -746,11 +746,6 @@ FailureOr tensor::bubbleUpPadSlice(OpBuilder &b, Location loc = padOp->getLoc(); AffineExpr dim0, dim1; bindDims(b.getContext(), dim0, dim1); - // Add two integers. - auto addMap = AffineMap::get(2, 0, {dim0 + dim1}); - auto add = [&](OpFoldResult v1, OpFoldResult v2) { - return affine::makeComposedFoldedAffineApply(b, loc, addMap, {v1, v2}); - }; // Subtract two integers. auto subMap = AffineMap::get(2, 0, {dim0 - dim1}); auto sub = [&](OpFoldResult v1, OpFoldResult v2) { @@ -825,16 +820,20 @@ FailureOr tensor::bubbleUpPadSlice(OpBuilder &b, // The original read could also have stopped in the high padding zone. 
// In that case, set the end positition of the read should be the end of // the source tensor. (Similar to newOffset.) - // - // endLoc = min(max(offset - low + length, 0), srcSize) - // - // The new ExtractSliceOp length is `endLoc - newOffset`. - // - // Optimization: If low = 0, then the formula can be simplified. - OpFoldResult endLoc = - hasLowPad ? min(max(add(sub(offset, low), length), zero), srcSize) - : min(add(offset, length), srcSize); - OpFoldResult newLength = sub(endLoc, newOffset); + // srcSize - newOffset represents how much length we have available + // and length - newLow represents how much length we want at most. + // Note that there are many ways to order this indexing math to compute + // newLength, but we want to make sure that the final affine.min ops in the + // sequence are bounding the index to as small a value as possible. If + // ValueBoundsOpInterface is used, this calculation will get upper bounds + // from the affine.min ops, so we want to use the smallest known value to + // set the bound at the end of the computation sequence. In this case, the + // index will be upper bounded by length - newLow. + OpFoldResult newLength = min(sub(srcSize, newOffset), sub(length, newLow)); + // Optimization: If low = 0, then newLow = 0. then newLength >= 0 assuming + // length >= 0. + if (hasLowPad) + newLength = max(newLength, zero); newLengths.push_back(newLength); // Check if newLength is zero. 
In that case, no SubTensorOp should be diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index 93e8cac6b84e9..893cedefc1ebd 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -524,18 +524,8 @@ bool TosaValidation::isValidElementType(Type type) { if (!isEnabledProfile(TosaProfileEnum::MainInference)) return false; return type.isF32() || type.isF16() || type.isBF16(); - } - if (auto intTy = dyn_cast(type)) { - if (intTy.isUnsigned()) { - switch (intTy.getWidth()) { - case 8: - case 16: - return true; - default: - return false; - } - } else { - // Signless - treated as signed. + } else if (auto intTy = dyn_cast(type)) { + if (intTy.isSignless()) { switch (intTy.getWidth()) { case 1: case 4: @@ -544,13 +534,10 @@ bool TosaValidation::isValidElementType(Type type) { case 32: case 48: return true; - default: - return false; } } - return false; } - return true; + return false; } void TosaValidation::runOnOperation() { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 49fe509800491..ff8606ed6b3f9 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -299,10 +299,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { if (privatizer.getDataSharingType() == omp::DataSharingClauseType::FirstPrivate) result = todo("firstprivate"); - - if (!privatizer.getDeallocRegion().empty()) - result = op.emitError("not yet implemented: privatization of " - "structures in omp.target operation"); } } checkThreadLimit(op, result); @@ -1290,6 +1286,41 @@ static LogicalResult allocAndInitializeReductionVars( isByRef, deferredStores); } +/// Return the llvm::Value * corresponding to the `privateVar` that +/// is being privatized. 
It isn't always as simple as looking up +/// moduleTranslation with privateVar. For instance, in case of +/// an allocatable, the descriptor for the allocatable is privatized. +/// This descriptor is mapped using an MapInfoOp. So, this function +/// will return a pointer to the llvm::Value corresponding to the +/// block argument for the mapped descriptor. +static llvm::Value * +findAssociatedValue(Value privateVar, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::DenseMap *mappedPrivateVars = nullptr) { + if (mappedPrivateVars == nullptr || !mappedPrivateVars->contains(privateVar)) + return moduleTranslation.lookupValue(privateVar); + + Value blockArg = (*mappedPrivateVars)[privateVar]; + Type privVarType = privateVar.getType(); + Type blockArgType = blockArg.getType(); + assert(isa(blockArgType) && + "A block argument corresponding to a mapped var should have " + "!llvm.ptr type"); + + if (privVarType == blockArgType) + return moduleTranslation.lookupValue(blockArg); + + // This typically happens when the privatized type is lowered from + // boxchar and gets lowered to !llvm.struct<(ptr, i64)>. That is the + // struct/pair is passed by value. But, mapped values are passed only as + // pointers, so before we privatize, we must load the pointer. + if (!isa(privVarType)) + return builder.CreateLoad(moduleTranslation.convertType(privVarType), + moduleTranslation.lookupValue(blockArg)); + + return moduleTranslation.lookupValue(privateVar); +} + /// Allocate delayed private variables. Returns the basic block which comes /// after all of these allocations. llvm::Value * for each of these private /// variables are populated in llvmPrivateVars. 
@@ -1300,7 +1331,8 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, MutableArrayRef privateDecls, MutableArrayRef mlirPrivateVars, llvm::SmallVectorImpl &llvmPrivateVars, - const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP) { + const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, + llvm::DenseMap *mappedPrivateVars = nullptr) { llvm::IRBuilderBase::InsertPointGuard guard(builder); // Allocate private vars llvm::BranchInst *allocaTerminator = @@ -1330,7 +1362,8 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, Region &allocRegion = privDecl.getAllocRegion(); // map allocation region block argument - llvm::Value *nonPrivateVar = moduleTranslation.lookupValue(mlirPrivVar); + llvm::Value *nonPrivateVar = findAssociatedValue( + mlirPrivVar, builder, moduleTranslation, mappedPrivateVars); assert(nonPrivateVar); moduleTranslation.mapValue(privDecl.getAllocMoldArg(), nonPrivateVar); @@ -1345,6 +1378,7 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, } else { builder.SetInsertPoint(privAllocBlock->getTerminator()); } + if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc", builder, moduleTranslation, &phis))) return llvm::createStringError( @@ -3829,6 +3863,17 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, bool isTargetDevice = ompBuilder->Config.isTargetDevice(); auto parentFn = opInst.getParentOfType(); auto &targetRegion = targetOp.getRegion(); + // Holds the private vars that have been mapped along with the block argument + // that corresponds to the MapInfoOp corresponding to the private var in + // question. So, for instance: + // + // %10 = omp.map.info var_ptr(%6#0 : !fir.ref>>, ..) + // omp.target map_entries(%10 -> %arg0) private(@box.privatizer %6#0-> %arg1) + // + // Then, %10 has been created so that the descriptor can be used by the + // privatizer @box.privatizer on the device side. Here we'd record {%6#0, + // %arg0} in the mappedPrivateVars map. 
+ llvm::DenseMap mappedPrivateVars; DataLayout dl = DataLayout(opInst.getParentOfType()); SmallVector mapVars = targetOp.getMapVars(); ArrayRef mapBlockArgs = @@ -3840,6 +3885,57 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, bool isOffloadEntry = isTargetDevice || !ompBuilder->Config.TargetTriples.empty(); + // For some private variables, the MapsForPrivatizedVariablesPass + // creates MapInfoOp instances. Go through the private variables and + // the mapped variables so that during codegeneration we are able + // to quickly look up the corresponding map variable, if any for each + // private variable. + if (!targetOp.getPrivateVars().empty() && !targetOp.getMapVars().empty()) { + auto argIface = llvm::cast(*targetOp); + OperandRange privateVars = targetOp.getPrivateVars(); + std::optional privateSyms = targetOp.getPrivateSyms(); + std::optional privateMapIndices = + targetOp.getPrivateMapsAttr(); + + for (auto [privVarIdx, privVarSymPair] : + llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) { + auto privVar = std::get<0>(privVarSymPair); + auto privSym = std::get<1>(privVarSymPair); + + SymbolRefAttr privatizerName = llvm::cast(privSym); + omp::PrivateClauseOp privatizer = + findPrivatizer(targetOp, privatizerName); + + if (!privatizer.needsMap()) + continue; + + mlir::Value mappedValue = + targetOp.getMappedValueForPrivateVar(privVarIdx); + assert(mappedValue && "Expected to find mapped value for a privatized " + "variable that needs mapping"); + + // The MapInfoOp defining the map var isn't really needed later. + // So, we don't store it in any datastructure. Instead, we just + // do some sanity checks on it right now. + auto mapInfoOp = mappedValue.getDefiningOp(); + [[maybe_unused]] Type varType = mapInfoOp.getVarType(); + + // Check #1: Check that the type of the private variable matches + // the type of the variable being mapped. 
+ if (!isa(privVar.getType())) + assert( + varType == privVar.getType() && + "Type of private var doesn't match the type of the mapped value"); + + // Ok, only 1 sanity check for now. + // Record the block argument corresponding to this mapvar. + mappedPrivateVars.insert( + {privVar, + targetRegion.getArgument(argIface.getMapBlockArgsStart() + + (*privateMapIndices)[privVarIdx])}); + } + } + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { @@ -3859,7 +3955,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, attr.isStringAttribute()) llvmOutlinedFn->addFnAttr(attr); - builder.restoreIP(codeGenIP); for (auto [arg, mapOp] : llvm::zip_equal(mapBlockArgs, mapVars)) { auto mapInfoOp = cast(mapOp.getDefiningOp()); llvm::Value *mapOpValue = @@ -3869,50 +3964,52 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, // Do privatization after moduleTranslation has already recorded // mapped values. 
- if (!targetOp.getPrivateVars().empty()) { - builder.restoreIP(allocaIP); - - OperandRange privateVars = targetOp.getPrivateVars(); - std::optional privateSyms = targetOp.getPrivateSyms(); - MutableArrayRef privateBlockArgs = - cast(opInst).getPrivateBlockArgs(); - - for (auto [privVar, privatizerNameAttr, privBlockArg] : - llvm::zip_equal(privateVars, *privateSyms, privateBlockArgs)) { - - SymbolRefAttr privSym = cast(privatizerNameAttr); - omp::PrivateClauseOp privatizer = findPrivatizer(&opInst, privSym); - assert(privatizer.getDataSharingType() != - omp::DataSharingClauseType::FirstPrivate && - privatizer.getDeallocRegion().empty() && - "unsupported privatizer"); - moduleTranslation.mapValue(privatizer.getAllocMoldArg(), - moduleTranslation.lookupValue(privVar)); - Region &allocRegion = privatizer.getAllocRegion(); - SmallVector yieldedValues; - if (failed(inlineConvertOmpRegions( - allocRegion, "omp.targetop.privatizer", builder, - moduleTranslation, &yieldedValues))) { - return llvm::createStringError( - "failed to inline `alloc` region of `omp.private`"); - } - assert(yieldedValues.size() == 1); - moduleTranslation.mapValue(privBlockArg, yieldedValues.front()); - moduleTranslation.forgetMapping(allocRegion); - builder.restoreIP(builder.saveIP()); - } - } + MutableArrayRef privateBlockArgs = + cast(opInst).getPrivateBlockArgs(); + SmallVector mlirPrivateVars; + SmallVector llvmPrivateVars; + SmallVector privateDecls; + mlirPrivateVars.reserve(privateBlockArgs.size()); + llvmPrivateVars.reserve(privateBlockArgs.size()); + collectPrivatizationDecls(targetOp, privateDecls); + for (mlir::Value privateVar : targetOp.getPrivateVars()) + mlirPrivateVars.push_back(privateVar); + + llvm::Expected afterAllocas = allocatePrivateVars( + builder, moduleTranslation, privateBlockArgs, privateDecls, + mlirPrivateVars, llvmPrivateVars, allocaIP, &mappedPrivateVars); + if (failed(handleError(afterAllocas, *targetOp))) + return llvm::make_error(); + + SmallVector 
privateCleanupRegions; + llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions), + [](omp::PrivateClauseOp privatizer) { + return &privatizer.getDeallocRegion(); + }); + + builder.restoreIP(codeGenIP); llvm::Expected exitBlock = convertOmpOpRegions( targetRegion, "omp.target", builder, moduleTranslation); + if (!exitBlock) return exitBlock.takeError(); builder.SetInsertPoint(*exitBlock); - return builder.saveIP(); + if (!privateCleanupRegions.empty()) { + if (failed(inlineOmpRegionCleanup( + privateCleanupRegions, llvmPrivateVars, moduleTranslation, + builder, "omp.targetop.private.cleanup", + /*shouldLoadCleanupRegionArg=*/false))) { + return llvm::createStringError( + "failed to inline `dealloc` region of `omp.private` " + "op in the target region"); + } + } + + return InsertPointTy(exitBlock.get(), exitBlock.get()->end()); }; - llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); StringRef parentName = parentFn.getName(); llvm::TargetRegionEntryInfo entryInfo; @@ -3923,9 +4020,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, int32_t defaultValTeams = -1; int32_t defaultValThreads = 0; - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); - MapInfoData mapData; collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, dl, builder); @@ -3973,6 +4067,10 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, buildDependData(targetOp.getDependKinds(), targetOp.getDependVars(), moduleTranslation, dds); + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = + findAllocaInsertPoint(builder, moduleTranslation); + llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTarget( ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), entryInfo, diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir 
b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir index 3d73292e6b886..a4ddabbd0821a 100644 --- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir +++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir @@ -339,115 +339,19 @@ func.func @complex_mul(%lhs: complex, %rhs: complex) -> complex { return %mul : complex } // CHECK: %[[LHS_REAL:.*]] = complex.re %[[LHS]] : complex -// CHECK: %[[LHS_REAL_ABS:.*]] = math.absf %[[LHS_REAL]] : f32 // CHECK: %[[LHS_IMAG:.*]] = complex.im %[[LHS]] : complex -// CHECK: %[[LHS_IMAG_ABS:.*]] = math.absf %[[LHS_IMAG]] : f32 // CHECK: %[[RHS_REAL:.*]] = complex.re %[[RHS]] : complex -// CHECK: %[[RHS_REAL_ABS:.*]] = math.absf %[[RHS_REAL]] : f32 // CHECK: %[[RHS_IMAG:.*]] = complex.im %[[RHS]] : complex -// CHECK: %[[RHS_IMAG_ABS:.*]] = math.absf %[[RHS_IMAG]] : f32 // CHECK: %[[LHS_REAL_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_REAL]] : f32 -// CHECK: %[[LHS_REAL_TIMES_RHS_REAL_ABS:.*]] = math.absf %[[LHS_REAL_TIMES_RHS_REAL]] : f32 // CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_IMAG]] : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG_ABS:.*]] = math.absf %[[LHS_IMAG_TIMES_RHS_IMAG]] : f32 // CHECK: %[[REAL:.*]] = arith.subf %[[LHS_REAL_TIMES_RHS_REAL]], %[[LHS_IMAG_TIMES_RHS_IMAG]] : f32 // CHECK: %[[LHS_IMAG_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_REAL]] : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL_ABS:.*]] = math.absf %[[LHS_IMAG_TIMES_RHS_REAL]] : f32 // CHECK: %[[LHS_REAL_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_IMAG]] : f32 -// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG_ABS:.*]] = math.absf %[[LHS_REAL_TIMES_RHS_IMAG]] : f32 // CHECK: %[[IMAG:.*]] = arith.addf %[[LHS_IMAG_TIMES_RHS_REAL]], %[[LHS_REAL_TIMES_RHS_IMAG]] : f32 -// Handle cases where the "naive" calculation results in NaN values. 
-// CHECK: %[[REAL_IS_NAN:.*]] = arith.cmpf uno, %[[REAL]], %[[REAL]] : f32 -// CHECK: %[[IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[IMAG]], %[[IMAG]] : f32 -// CHECK: %[[IS_NAN:.*]] = arith.andi %[[REAL_IS_NAN]], %[[IMAG_IS_NAN]] : i1 -// CHECK: %[[INF:.*]] = arith.constant 0x7F800000 : f32 - -// Case 1. LHS_REAL or LHS_IMAG are infinite. -// CHECK: %[[LHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[LHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[LHS_IS_INF:.*]] = arith.ori %[[LHS_REAL_IS_INF]], %[[LHS_IMAG_IS_INF]] : i1 -// CHECK: %[[RHS_REAL_IS_NAN:.*]] = arith.cmpf uno, %[[RHS_REAL]], %[[RHS_REAL]] : f32 -// CHECK: %[[RHS_IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[RHS_IMAG]], %[[RHS_IMAG]] : f32 -// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 -// CHECK: %[[LHS_REAL_IS_INF_FLOAT:.*]] = arith.select %[[LHS_REAL_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[LHS_REAL_IS_INF_FLOAT]], %[[LHS_REAL]] : f32 -// CHECK: %[[LHS_REAL1:.*]] = arith.select %[[LHS_IS_INF]], %[[TMP]], %[[LHS_REAL]] : f32 -// CHECK: %[[LHS_IMAG_IS_INF_FLOAT:.*]] = arith.select %[[LHS_IMAG_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[LHS_IMAG_IS_INF_FLOAT]], %[[LHS_IMAG]] : f32 -// CHECK: %[[LHS_IMAG1:.*]] = arith.select %[[LHS_IS_INF]], %[[TMP]], %[[LHS_IMAG]] : f32 -// CHECK: %[[LHS_IS_INF_AND_RHS_REAL_IS_NAN:.*]] = arith.andi %[[LHS_IS_INF]], %[[RHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_REAL]] : f32 -// CHECK: %[[RHS_REAL1:.*]] = arith.select %[[LHS_IS_INF_AND_RHS_REAL_IS_NAN]], %[[TMP]], %[[RHS_REAL]] : f32 -// CHECK: %[[LHS_IS_INF_AND_RHS_IMAG_IS_NAN:.*]] = arith.andi %[[LHS_IS_INF]], %[[RHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_IMAG]] : f32 -// CHECK: %[[RHS_IMAG1:.*]] = arith.select 
%[[LHS_IS_INF_AND_RHS_IMAG_IS_NAN]], %[[TMP]], %[[RHS_IMAG]] : f32 - -// Case 2. RHS_REAL or RHS_IMAG are infinite. -// CHECK: %[[RHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[RHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[RHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[RHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[RHS_IS_INF:.*]] = arith.ori %[[RHS_REAL_IS_INF]], %[[RHS_IMAG_IS_INF]] : i1 -// CHECK: %[[LHS_REAL_IS_NAN:.*]] = arith.cmpf uno, %[[LHS_REAL1]], %[[LHS_REAL1]] : f32 -// CHECK: %[[LHS_IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[LHS_IMAG1]], %[[LHS_IMAG1]] : f32 -// CHECK: %[[RHS_REAL_IS_INF_FLOAT:.*]] = arith.select %[[RHS_REAL_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[RHS_REAL_IS_INF_FLOAT]], %[[RHS_REAL1]] : f32 -// CHECK: %[[RHS_REAL2:.*]] = arith.select %[[RHS_IS_INF]], %[[TMP]], %[[RHS_REAL1]] : f32 -// CHECK: %[[RHS_IMAG_IS_INF_FLOAT:.*]] = arith.select %[[RHS_IMAG_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[RHS_IMAG_IS_INF_FLOAT]], %[[RHS_IMAG1]] : f32 -// CHECK: %[[RHS_IMAG2:.*]] = arith.select %[[RHS_IS_INF]], %[[TMP]], %[[RHS_IMAG1]] : f32 -// CHECK: %[[RHS_IS_INF_AND_LHS_REAL_IS_NAN:.*]] = arith.andi %[[RHS_IS_INF]], %[[LHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_REAL1]] : f32 -// CHECK: %[[LHS_REAL2:.*]] = arith.select %[[RHS_IS_INF_AND_LHS_REAL_IS_NAN]], %[[TMP]], %[[LHS_REAL1]] : f32 -// CHECK: %[[RHS_IS_INF_AND_LHS_IMAG_IS_NAN:.*]] = arith.andi %[[RHS_IS_INF]], %[[LHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_IMAG1]] : f32 -// CHECK: %[[LHS_IMAG2:.*]] = arith.select %[[RHS_IS_INF_AND_LHS_IMAG_IS_NAN]], %[[TMP]], %[[LHS_IMAG1]] : f32 -// CHECK: %[[RECALC:.*]] = arith.ori %[[LHS_IS_INF]], %[[RHS_IS_INF]] : i1 - -// Case 3. One of the pairwise products of left hand side with right hand side -// is infinite. 
-// CHECK: %[[LHS_REAL_TIMES_RHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_REAL_TIMES_RHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_IMAG_TIMES_RHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[IS_SPECIAL_CASE:.*]] = arith.ori %[[LHS_REAL_TIMES_RHS_REAL_IS_INF]], %[[LHS_IMAG_TIMES_RHS_IMAG_IS_INF]] : i1 -// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_REAL_TIMES_RHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[IS_SPECIAL_CASE1:.*]] = arith.ori %[[IS_SPECIAL_CASE]], %[[LHS_REAL_TIMES_RHS_IMAG_IS_INF]] : i1 -// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_IMAG_TIMES_RHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[IS_SPECIAL_CASE2:.*]] = arith.ori %[[IS_SPECIAL_CASE1]], %[[LHS_IMAG_TIMES_RHS_REAL_IS_INF]] : i1 -// CHECK: %[[TRUE:.*]] = arith.constant true -// CHECK: %[[NOT_RECALC:.*]] = arith.xori %[[RECALC]], %[[TRUE]] : i1 -// CHECK: %[[IS_SPECIAL_CASE3:.*]] = arith.andi %[[IS_SPECIAL_CASE2]], %[[NOT_RECALC]] : i1 -// CHECK: %[[IS_SPECIAL_CASE_AND_LHS_REAL_IS_NAN:.*]] = arith.andi %[[IS_SPECIAL_CASE3]], %[[LHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_REAL2]] : f32 -// CHECK: %[[LHS_REAL3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_LHS_REAL_IS_NAN]], %[[TMP]], %[[LHS_REAL2]] : f32 -// CHECK: %[[IS_SPECIAL_CASE_AND_LHS_IMAG_IS_NAN:.*]] = arith.andi %[[IS_SPECIAL_CASE3]], %[[LHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_IMAG2]] : f32 -// CHECK: %[[LHS_IMAG3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_LHS_IMAG_IS_NAN]], %[[TMP]], %[[LHS_IMAG2]] : f32 -// CHECK: %[[IS_SPECIAL_CASE_AND_RHS_REAL_IS_NAN:.*]] = arith.andi %[[IS_SPECIAL_CASE3]], %[[RHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_REAL2]] : f32 -// CHECK: %[[RHS_REAL3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_RHS_REAL_IS_NAN]], %[[TMP]], %[[RHS_REAL2]] : f32 -// CHECK: %[[IS_SPECIAL_CASE_AND_RHS_IMAG_IS_NAN:.*]] = 
arith.andi %[[IS_SPECIAL_CASE3]], %[[RHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_IMAG2]] : f32 -// CHECK: %[[RHS_IMAG3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_RHS_IMAG_IS_NAN]], %[[TMP]], %[[RHS_IMAG2]] : f32 -// CHECK: %[[RECALC2:.*]] = arith.ori %[[RECALC]], %[[IS_SPECIAL_CASE3]] : i1 -// CHECK: %[[RECALC3:.*]] = arith.andi %[[IS_NAN]], %[[RECALC2]] : i1 - - // Recalculate real part. -// CHECK: %[[LHS_REAL_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_REAL3]], %[[RHS_REAL3]] : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG3]], %[[RHS_IMAG3]] : f32 -// CHECK: %[[NEW_REAL:.*]] = arith.subf %[[LHS_REAL_TIMES_RHS_REAL]], %[[LHS_IMAG_TIMES_RHS_IMAG]] : f32 -// CHECK: %[[NEW_REAL_TIMES_INF:.*]] = arith.mulf %[[INF]], %[[NEW_REAL]] : f32 -// CHECK: %[[FINAL_REAL:.*]] = arith.select %[[RECALC3]], %[[NEW_REAL_TIMES_INF]], %[[REAL]] : f32 - -// Recalculate imag part. -// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_IMAG3]], %[[RHS_REAL3]] : f32 -// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_REAL3]], %[[RHS_IMAG3]] : f32 -// CHECK: %[[NEW_IMAG:.*]] = arith.addf %[[LHS_IMAG_TIMES_RHS_REAL]], %[[LHS_REAL_TIMES_RHS_IMAG]] : f32 -// CHECK: %[[NEW_IMAG_TIMES_INF:.*]] = arith.mulf %[[INF]], %[[NEW_IMAG]] : f32 -// CHECK: %[[FINAL_IMAG:.*]] = arith.select %[[RECALC3]], %[[NEW_IMAG_TIMES_INF]], %[[IMAG]] : f32 - -// CHECK: %[[RESULT:.*]] = complex.create %[[FINAL_REAL]], %[[FINAL_IMAG]] : complex +// CHECK: %[[RESULT:.*]] = complex.create %[[REAL]], %[[IMAG]] : complex // CHECK: return %[[RESULT]] : complex // ----- @@ -977,115 +881,16 @@ func.func @complex_mul_with_fmf(%lhs: complex, %rhs: complex) -> compl return %mul : complex } // CHECK: %[[LHS_REAL:.*]] = complex.re %[[LHS]] : complex -// CHECK: %[[LHS_REAL_ABS:.*]] = math.absf %[[LHS_REAL]] fastmath : f32 // CHECK: %[[LHS_IMAG:.*]] = complex.im %[[LHS]] : complex -// CHECK: %[[LHS_IMAG_ABS:.*]] = math.absf %[[LHS_IMAG]] fastmath : f32 // 
CHECK: %[[RHS_REAL:.*]] = complex.re %[[RHS]] : complex -// CHECK: %[[RHS_REAL_ABS:.*]] = math.absf %[[RHS_REAL]] fastmath : f32 // CHECK: %[[RHS_IMAG:.*]] = complex.im %[[RHS]] : complex -// CHECK: %[[RHS_IMAG_ABS:.*]] = math.absf %[[RHS_IMAG]] fastmath : f32 - // CHECK: %[[LHS_REAL_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_REAL]] fastmath : f32 -// CHECK: %[[LHS_REAL_TIMES_RHS_REAL_ABS:.*]] = math.absf %[[LHS_REAL_TIMES_RHS_REAL]] fastmath : f32 // CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_IMAG]] fastmath : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG_ABS:.*]] = math.absf %[[LHS_IMAG_TIMES_RHS_IMAG]] fastmath : f32 // CHECK: %[[REAL:.*]] = arith.subf %[[LHS_REAL_TIMES_RHS_REAL]], %[[LHS_IMAG_TIMES_RHS_IMAG]] fastmath : f32 - // CHECK: %[[LHS_IMAG_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_REAL]] fastmath : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL_ABS:.*]] = math.absf %[[LHS_IMAG_TIMES_RHS_REAL]] fastmath : f32 // CHECK: %[[LHS_REAL_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_IMAG]] fastmath : f32 -// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG_ABS:.*]] = math.absf %[[LHS_REAL_TIMES_RHS_IMAG]] fastmath : f32 // CHECK: %[[IMAG:.*]] = arith.addf %[[LHS_IMAG_TIMES_RHS_REAL]], %[[LHS_REAL_TIMES_RHS_IMAG]] fastmath : f32 - -// Handle cases where the "naive" calculation results in NaN values. -// CHECK: %[[REAL_IS_NAN:.*]] = arith.cmpf uno, %[[REAL]], %[[REAL]] : f32 -// CHECK: %[[IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[IMAG]], %[[IMAG]] : f32 -// CHECK: %[[IS_NAN:.*]] = arith.andi %[[REAL_IS_NAN]], %[[IMAG_IS_NAN]] : i1 -// CHECK: %[[INF:.*]] = arith.constant 0x7F800000 : f32 - -// Case 1. LHS_REAL or LHS_IMAG are infinite. 
-// CHECK: %[[LHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[LHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[LHS_IS_INF:.*]] = arith.ori %[[LHS_REAL_IS_INF]], %[[LHS_IMAG_IS_INF]] : i1 -// CHECK: %[[RHS_REAL_IS_NAN:.*]] = arith.cmpf uno, %[[RHS_REAL]], %[[RHS_REAL]] : f32 -// CHECK: %[[RHS_IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[RHS_IMAG]], %[[RHS_IMAG]] : f32 -// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 -// CHECK: %[[LHS_REAL_IS_INF_FLOAT:.*]] = arith.select %[[LHS_REAL_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[LHS_REAL_IS_INF_FLOAT]], %[[LHS_REAL]] : f32 -// CHECK: %[[LHS_REAL1:.*]] = arith.select %[[LHS_IS_INF]], %[[TMP]], %[[LHS_REAL]] : f32 -// CHECK: %[[LHS_IMAG_IS_INF_FLOAT:.*]] = arith.select %[[LHS_IMAG_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[LHS_IMAG_IS_INF_FLOAT]], %[[LHS_IMAG]] : f32 -// CHECK: %[[LHS_IMAG1:.*]] = arith.select %[[LHS_IS_INF]], %[[TMP]], %[[LHS_IMAG]] : f32 -// CHECK: %[[LHS_IS_INF_AND_RHS_REAL_IS_NAN:.*]] = arith.andi %[[LHS_IS_INF]], %[[RHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_REAL]] : f32 -// CHECK: %[[RHS_REAL1:.*]] = arith.select %[[LHS_IS_INF_AND_RHS_REAL_IS_NAN]], %[[TMP]], %[[RHS_REAL]] : f32 -// CHECK: %[[LHS_IS_INF_AND_RHS_IMAG_IS_NAN:.*]] = arith.andi %[[LHS_IS_INF]], %[[RHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_IMAG]] : f32 -// CHECK: %[[RHS_IMAG1:.*]] = arith.select %[[LHS_IS_INF_AND_RHS_IMAG_IS_NAN]], %[[TMP]], %[[RHS_IMAG]] : f32 - -// Case 2. RHS_REAL or RHS_IMAG are infinite. 
-// CHECK: %[[RHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[RHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[RHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[RHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[RHS_IS_INF:.*]] = arith.ori %[[RHS_REAL_IS_INF]], %[[RHS_IMAG_IS_INF]] : i1 -// CHECK: %[[LHS_REAL_IS_NAN:.*]] = arith.cmpf uno, %[[LHS_REAL1]], %[[LHS_REAL1]] : f32 -// CHECK: %[[LHS_IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[LHS_IMAG1]], %[[LHS_IMAG1]] : f32 -// CHECK: %[[RHS_REAL_IS_INF_FLOAT:.*]] = arith.select %[[RHS_REAL_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[RHS_REAL_IS_INF_FLOAT]], %[[RHS_REAL1]] : f32 -// CHECK: %[[RHS_REAL2:.*]] = arith.select %[[RHS_IS_INF]], %[[TMP]], %[[RHS_REAL1]] : f32 -// CHECK: %[[RHS_IMAG_IS_INF_FLOAT:.*]] = arith.select %[[RHS_IMAG_IS_INF]], %[[ONE]], %[[ZERO]] : f32 -// CHECK: %[[TMP:.*]] = math.copysign %[[RHS_IMAG_IS_INF_FLOAT]], %[[RHS_IMAG1]] : f32 -// CHECK: %[[RHS_IMAG2:.*]] = arith.select %[[RHS_IS_INF]], %[[TMP]], %[[RHS_IMAG1]] : f32 -// CHECK: %[[RHS_IS_INF_AND_LHS_REAL_IS_NAN:.*]] = arith.andi %[[RHS_IS_INF]], %[[LHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_REAL1]] : f32 -// CHECK: %[[LHS_REAL2:.*]] = arith.select %[[RHS_IS_INF_AND_LHS_REAL_IS_NAN]], %[[TMP]], %[[LHS_REAL1]] : f32 -// CHECK: %[[RHS_IS_INF_AND_LHS_IMAG_IS_NAN:.*]] = arith.andi %[[RHS_IS_INF]], %[[LHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_IMAG1]] : f32 -// CHECK: %[[LHS_IMAG2:.*]] = arith.select %[[RHS_IS_INF_AND_LHS_IMAG_IS_NAN]], %[[TMP]], %[[LHS_IMAG1]] : f32 -// CHECK: %[[RECALC:.*]] = arith.ori %[[LHS_IS_INF]], %[[RHS_IS_INF]] : i1 - -// Case 3. One of the pairwise products of left hand side with right hand side -// is infinite. 
-// CHECK: %[[LHS_REAL_TIMES_RHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_REAL_TIMES_RHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_IMAG_TIMES_RHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[IS_SPECIAL_CASE:.*]] = arith.ori %[[LHS_REAL_TIMES_RHS_REAL_IS_INF]], %[[LHS_IMAG_TIMES_RHS_IMAG_IS_INF]] : i1 -// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_REAL_TIMES_RHS_IMAG_ABS]], %[[INF]] : f32 -// CHECK: %[[IS_SPECIAL_CASE1:.*]] = arith.ori %[[IS_SPECIAL_CASE]], %[[LHS_REAL_TIMES_RHS_IMAG_IS_INF]] : i1 -// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL_IS_INF:.*]] = arith.cmpf oeq, %[[LHS_IMAG_TIMES_RHS_REAL_ABS]], %[[INF]] : f32 -// CHECK: %[[IS_SPECIAL_CASE2:.*]] = arith.ori %[[IS_SPECIAL_CASE1]], %[[LHS_IMAG_TIMES_RHS_REAL_IS_INF]] : i1 -// CHECK: %[[TRUE:.*]] = arith.constant true -// CHECK: %[[NOT_RECALC:.*]] = arith.xori %[[RECALC]], %[[TRUE]] : i1 -// CHECK: %[[IS_SPECIAL_CASE3:.*]] = arith.andi %[[IS_SPECIAL_CASE2]], %[[NOT_RECALC]] : i1 -// CHECK: %[[IS_SPECIAL_CASE_AND_LHS_REAL_IS_NAN:.*]] = arith.andi %[[IS_SPECIAL_CASE3]], %[[LHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_REAL2]] : f32 -// CHECK: %[[LHS_REAL3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_LHS_REAL_IS_NAN]], %[[TMP]], %[[LHS_REAL2]] : f32 -// CHECK: %[[IS_SPECIAL_CASE_AND_LHS_IMAG_IS_NAN:.*]] = arith.andi %[[IS_SPECIAL_CASE3]], %[[LHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[LHS_IMAG2]] : f32 -// CHECK: %[[LHS_IMAG3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_LHS_IMAG_IS_NAN]], %[[TMP]], %[[LHS_IMAG2]] : f32 -// CHECK: %[[IS_SPECIAL_CASE_AND_RHS_REAL_IS_NAN:.*]] = arith.andi %[[IS_SPECIAL_CASE3]], %[[RHS_REAL_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_REAL2]] : f32 -// CHECK: %[[RHS_REAL3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_RHS_REAL_IS_NAN]], %[[TMP]], %[[RHS_REAL2]] : f32 -// CHECK: %[[IS_SPECIAL_CASE_AND_RHS_IMAG_IS_NAN:.*]] = 
arith.andi %[[IS_SPECIAL_CASE3]], %[[RHS_IMAG_IS_NAN]] : i1 -// CHECK: %[[TMP:.*]] = math.copysign %[[ZERO]], %[[RHS_IMAG2]] : f32 -// CHECK: %[[RHS_IMAG3:.*]] = arith.select %[[IS_SPECIAL_CASE_AND_RHS_IMAG_IS_NAN]], %[[TMP]], %[[RHS_IMAG2]] : f32 -// CHECK: %[[RECALC2:.*]] = arith.ori %[[RECALC]], %[[IS_SPECIAL_CASE3]] : i1 -// CHECK: %[[RECALC3:.*]] = arith.andi %[[IS_NAN]], %[[RECALC2]] : i1 - - // Recalculate real part. -// CHECK: %[[LHS_REAL_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_REAL3]], %[[RHS_REAL3]] fastmath : f32 -// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG3]], %[[RHS_IMAG3]] fastmath : f32 -// CHECK: %[[NEW_REAL:.*]] = arith.subf %[[LHS_REAL_TIMES_RHS_REAL]], %[[LHS_IMAG_TIMES_RHS_IMAG]] fastmath : f32 -// CHECK: %[[NEW_REAL_TIMES_INF:.*]] = arith.mulf %[[INF]], %[[NEW_REAL]] fastmath : f32 -// CHECK: %[[FINAL_REAL:.*]] = arith.select %[[RECALC3]], %[[NEW_REAL_TIMES_INF]], %[[REAL]] : f32 - -// Recalculate imag part. -// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_IMAG3]], %[[RHS_REAL3]] fastmath : f32 -// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_REAL3]], %[[RHS_IMAG3]] fastmath : f32 -// CHECK: %[[NEW_IMAG:.*]] = arith.addf %[[LHS_IMAG_TIMES_RHS_REAL]], %[[LHS_REAL_TIMES_RHS_IMAG]] fastmath : f32 -// CHECK: %[[NEW_IMAG_TIMES_INF:.*]] = arith.mulf %[[INF]], %[[NEW_IMAG]] fastmath : f32 -// CHECK: %[[FINAL_IMAG:.*]] = arith.select %[[RECALC3]], %[[NEW_IMAG_TIMES_INF]], %[[IMAG]] : f32 - -// CHECK: %[[RESULT:.*]] = complex.create %[[FINAL_REAL]], %[[FINAL_IMAG]] : complex +// CHECK: %[[RESULT:.*]] = complex.create %[[REAL]], %[[IMAG]] : complex // CHECK: return %[[RESULT]] : complex // ----- @@ -1098,193 +903,27 @@ func.func @complex_atan2_with_fmf(%lhs: complex, } // CHECK: %[[VAR0:.*]] = complex.re %arg1 : complex -// CHECK: %[[VAR1:.*]] = math.absf %[[VAR0]] fastmath : f32 // CHECK: %[[VAR2:.*]] = complex.im %arg1 : complex -// CHECK: %[[VAR3:.*]] = math.absf %[[VAR2]] fastmath : f32 // CHECK: 
%[[VAR4:.*]] = complex.re %arg1 : complex -// CHECK: %[[VAR5:.*]] = math.absf %[[VAR4]] fastmath : f32 // CHECK: %[[VAR6:.*]] = complex.im %arg1 : complex -// CHECK: %[[VAR7:.*]] = math.absf %[[VAR6]] fastmath : f32 // CHECK: %[[VAR8:.*]] = arith.mulf %[[VAR0]], %[[VAR4]] fastmath : f32 -// CHECK: %[[VAR9:.*]] = math.absf %[[VAR8]] fastmath : f32 // CHECK: %[[VAR10:.*]] = arith.mulf %[[VAR2]], %[[VAR6]] fastmath : f32 -// CHECK: %[[VAR11:.*]] = math.absf %[[VAR10]] fastmath : f32 // CHECK: %[[VAR12:.*]] = arith.subf %[[VAR8]], %[[VAR10]] fastmath : f32 // CHECK: %[[VAR13:.*]] = arith.mulf %[[VAR2]], %[[VAR4]] fastmath : f32 -// CHECK: %[[VAR14:.*]] = math.absf %[[VAR13]] fastmath : f32 // CHECK: %[[VAR15:.*]] = arith.mulf %[[VAR0]], %[[VAR6]] fastmath : f32 -// CHECK: %[[VAR16:.*]] = math.absf %[[VAR15]] fastmath : f32 // CHECK: %[[VAR17:.*]] = arith.addf %[[VAR13]], %[[VAR15]] fastmath : f32 -// CHECK: %[[VAR18:.*]] = arith.cmpf uno, %[[VAR12]], %[[VAR12]] : f32 -// CHECK: %[[VAR19:.*]] = arith.cmpf uno, %[[VAR17]], %[[VAR17]] : f32 -// CHECK: %[[VAR20:.*]] = arith.andi %[[VAR18]], %[[VAR19]] : i1 -// CHECK: %[[CST:.*]] = arith.constant 0x7F800000 : f32 -// CHECK: %[[VAR21:.*]] = arith.cmpf oeq, %[[VAR1]], %[[CST]] : f32 -// CHECK: %[[VAR22:.*]] = arith.cmpf oeq, %[[VAR3]], %[[CST]] : f32 -// CHECK: %[[VAR23:.*]] = arith.ori %[[VAR21]], %[[VAR22]] : i1 -// CHECK: %[[VAR24:.*]] = arith.cmpf uno, %[[VAR4]], %[[VAR4]] : f32 -// CHECK: %[[VAR25:.*]] = arith.cmpf uno, %[[VAR6]], %[[VAR6]] : f32 -// CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[CST_1:.*]] = arith.constant 1.000000e+00 : f32 -// CHECK: %[[VAR26:.*]] = arith.select %[[VAR21]], %[[CST_1]], %[[CST_0]] : f32 -// CHECK: %[[VAR27:.*]] = math.copysign %[[VAR26]], %[[VAR0]] : f32 -// CHECK: %[[VAR28:.*]] = arith.select %[[VAR23]], %[[VAR27]], %[[VAR0]] : f32 -// CHECK: %[[VAR29:.*]] = arith.select %[[VAR22]], %[[CST_1]], %[[CST_0]] : f32 -// CHECK: %[[VAR30:.*]] = math.copysign 
%[[VAR29]], %[[VAR2]] : f32 -// CHECK: %[[VAR31:.*]] = arith.select %[[VAR23]], %[[VAR30]], %[[VAR2]] : f32 -// CHECK: %[[VAR32:.*]] = arith.andi %[[VAR23]], %[[VAR24]] : i1 -// CHECK: %[[VAR33:.*]] = math.copysign %[[CST_0]], %[[VAR4]] : f32 -// CHECK: %[[VAR34:.*]] = arith.select %[[VAR32]], %[[VAR33]], %[[VAR4]] : f32 -// CHECK: %[[VAR35:.*]] = arith.andi %[[VAR23]], %[[VAR25]] : i1 -// CHECK: %[[VAR36:.*]] = math.copysign %[[CST_0]], %[[VAR6]] : f32 -// CHECK: %[[VAR37:.*]] = arith.select %[[VAR35]], %[[VAR36]], %[[VAR6]] : f32 -// CHECK: %[[VAR38:.*]] = arith.cmpf oeq, %[[VAR5]], %cst : f32 -// CHECK: %[[VAR39:.*]] = arith.cmpf oeq, %[[VAR7]], %cst : f32 -// CHECK: %[[VAR40:.*]] = arith.ori %[[VAR38]], %[[VAR39]] : i1 -// CHECK: %[[VAR41:.*]] = arith.cmpf uno, %[[VAR28]], %[[VAR28]] : f32 -// CHECK: %[[VAR42:.*]] = arith.cmpf uno, %[[VAR31]], %[[VAR31]] : f32 -// CHECK: %[[VAR43:.*]] = arith.select %[[VAR38]], %[[CST_1]], %[[CST_0]] : f32 -// CHECK: %[[VAR44:.*]] = math.copysign %[[VAR43]], %[[VAR34]] : f32 -// CHECK: %[[VAR45:.*]] = arith.select %[[VAR40]], %[[VAR44]], %[[VAR34]] : f32 -// CHECK: %[[VAR46:.*]] = arith.select %[[VAR39]], %[[CST_1]], %[[CST_0]] : f32 -// CHECK: %[[VAR47:.*]] = math.copysign %[[VAR46]], %[[VAR37]] : f32 -// CHECK: %[[VAR48:.*]] = arith.select %[[VAR40]], %[[VAR47]], %[[VAR37]] : f32 -// CHECK: %[[VAR49:.*]] = arith.andi %[[VAR40]], %[[VAR41]] : i1 -// CHECK: %[[VAR50:.*]] = math.copysign %[[CST_0]], %[[VAR28]] : f32 -// CHECK: %[[VAR51:.*]] = arith.select %[[VAR49]], %[[VAR50]], %[[VAR28]] : f32 -// CHECK: %[[VAR52:.*]] = arith.andi %[[VAR40]], %[[VAR42]] : i1 -// CHECK: %[[VAR53:.*]] = math.copysign %[[CST_0]], %[[VAR31]] : f32 -// CHECK: %[[VAR54:.*]] = arith.select %[[VAR52]], %[[VAR53]], %[[VAR31]] : f32 -// CHECK: %[[VAR55:.*]] = arith.ori %[[VAR23]], %[[VAR40]] : i1 -// CHECK: %[[VAR56:.*]] = arith.cmpf oeq, %[[VAR9]], %[[CST]] : f32 -// CHECK: %[[VAR57:.*]] = arith.cmpf oeq, %[[VAR11]], %[[CST]] : f32 -// CHECK: 
%[[VAR58:.*]] = arith.ori %[[VAR56]], %[[VAR57]] : i1 -// CHECK: %[[VAR59:.*]] = arith.cmpf oeq, %[[VAR16]], %[[CST]] : f32 -// CHECK: %[[VAR60:.*]] = arith.ori %[[VAR58]], %[[VAR59]] : i1 -// CHECK: %[[VAR61:.*]] = arith.cmpf oeq, %[[VAR14]], %[[CST]] : f32 -// CHECK: %[[VAR62:.*]] = arith.ori %[[VAR60]], %[[VAR61]] : i1 -// CHECK: %[[TRUE:.*]] = arith.constant true -// CHECK: %[[VAR63:.*]] = arith.xori %[[VAR55]], %[[TRUE]] : i1 -// CHECK: %[[VAR64:.*]] = arith.andi %[[VAR62]], %[[VAR63]] : i1 -// CHECK: %[[VAR65:.*]] = arith.andi %[[VAR64]], %[[VAR41]] : i1 -// CHECK: %[[VAR66:.*]] = math.copysign %[[CST_0]], %[[VAR51]] : f32 -// CHECK: %[[VAR67:.*]] = arith.select %[[VAR65]], %[[VAR66]], %[[VAR51]] : f32 -// CHECK: %[[VAR68:.*]] = arith.andi %[[VAR64]], %[[VAR42]] : i1 -// CHECK: %[[VAR69:.*]] = math.copysign %[[CST_0]], %[[VAR54]] : f32 -// CHECK: %[[VAR70:.*]] = arith.select %[[VAR68]], %[[VAR69]], %[[VAR54]] : f32 -// CHECK: %[[VAR71:.*]] = arith.andi %[[VAR64]], %[[VAR24]] : i1 -// CHECK: %[[VAR72:.*]] = math.copysign %[[CST_0]], %[[VAR45]] : f32 -// CHECK: %[[VAR73:.*]] = arith.select %[[VAR71]], %[[VAR72]], %[[VAR45]] : f32 -// CHECK: %[[VAR74:.*]] = arith.andi %[[VAR64]], %[[VAR25]] : i1 -// CHECK: %[[VAR75:.*]] = math.copysign %[[CST_0]], %[[VAR48]] : f32 -// CHECK: %[[VAR76:.*]] = arith.select %[[VAR74]], %[[VAR75]], %[[VAR48]] : f32 -// CHECK: %[[VAR77:.*]] = arith.ori %[[VAR55]], %[[VAR64]] : i1 -// CHECK: %[[VAR78:.*]] = arith.andi %[[VAR20]], %[[VAR77]] : i1 -// CHECK: %[[VAR79:.*]] = arith.mulf %[[VAR67]], %[[VAR73]] fastmath : f32 -// CHECK: %[[VAR80:.*]] = arith.mulf %[[VAR70]], %[[VAR76]] fastmath : f32 -// CHECK: %[[VAR81:.*]] = arith.subf %[[VAR79]], %[[VAR80]] fastmath : f32 -// CHECK: %[[VAR82:.*]] = arith.mulf %[[CST]], %[[VAR81]] fastmath : f32 -// CHECK: %[[VAR83:.*]] = arith.select %[[VAR78]], %[[VAR82]], %[[VAR12]] : f32 -// CHECK: %[[VAR84:.*]] = arith.mulf %[[VAR70]], %[[VAR73]] fastmath : f32 -// CHECK: %[[VAR85:.*]] = arith.mulf 
%[[VAR67]], %[[VAR76]] fastmath : f32 -// CHECK: %[[VAR86:.*]] = arith.addf %[[VAR84]], %[[VAR85]] fastmath : f32 -// CHECK: %[[VAR87:.*]] = arith.mulf %[[CST]], %[[VAR86]] fastmath : f32 -// CHECK: %[[VAR88:.*]] = arith.select %[[VAR78]], %[[VAR87]], %[[VAR17]] : f32 -// CHECK: %[[VAR89:.*]] = complex.create %[[VAR83]], %[[VAR88]] : complex +// CHECK: %[[VAR89:.*]] = complex.create %[[VAR12]], %[[VAR17]] : complex // CHECK: %[[VAR90:.*]] = complex.re %arg0 : complex -// CHECK: %[[VAR91:.*]] = math.absf %[[VAR90]] fastmath : f32 // CHECK: %[[VAR92:.*]] = complex.im %arg0 : complex -// CHECK: %[[VAR93:.*]] = math.absf %[[VAR92]] fastmath : f32 // CHECK: %[[VAR94:.*]] = complex.re %arg0 : complex -// CHECK: %[[VAR95:.*]] = math.absf %[[VAR94]] fastmath : f32 // CHECK: %[[VAR96:.*]] = complex.im %arg0 : complex -// CHECK: %[[VAR97:.*]] = math.absf %[[VAR96]] fastmath : f32 // CHECK: %[[VAR98:.*]] = arith.mulf %[[VAR90]], %[[VAR94]] fastmath : f32 -// CHECK: %[[VAR99:.*]] = math.absf %[[VAR98]] fastmath : f32 // CHECK: %[[VAR100:.*]] = arith.mulf %[[VAR92]], %[[VAR96]] fastmath : f32 -// CHECK: %[[VAR101:.*]] = math.absf %[[VAR100]] fastmath : f32 // CHECK: %[[VAR102:.*]] = arith.subf %[[VAR98]], %[[VAR100]] fastmath : f32 // CHECK: %[[VAR103:.*]] = arith.mulf %[[VAR92]], %[[VAR94]] fastmath : f32 -// CHECK: %[[VAR104:.*]] = math.absf %[[VAR103]] fastmath : f32 // CHECK: %[[VAR105:.*]] = arith.mulf %[[VAR90]], %[[VAR96]] fastmath : f32 -// CHECK: %[[VAR106:.*]] = math.absf %[[VAR105]] fastmath : f32 // CHECK: %[[VAR107:.*]] = arith.addf %[[VAR103]], %[[VAR105]] fastmath : f32 -// CHECK: %[[VAR108:.*]] = arith.cmpf uno, %[[VAR102]], %[[VAR102]] : f32 -// CHECK: %[[VAR109:.*]] = arith.cmpf uno, %[[VAR107]], %[[VAR107]] : f32 -// CHECK: %[[VAR110:.*]] = arith.andi %[[VAR108]], %[[VAR109]] : i1 -// CHECK: %[[CST_2:.*]] = arith.constant 0x7F800000 : f32 -// CHECK: %[[VAR111:.*]] = arith.cmpf oeq, %[[VAR91]], %[[CST_2]] : f32 -// CHECK: %[[VAR112:.*]] = arith.cmpf oeq, 
%[[VAR93]], %[[CST_2]] : f32 -// CHECK: %[[VAR113:.*]] = arith.ori %[[VAR111]], %[[VAR112]] : i1 -// CHECK: %[[VAR114:.*]] = arith.cmpf uno, %[[VAR94]], %[[VAR94]] : f32 -// CHECK: %[[VAR115:.*]] = arith.cmpf uno, %[[VAR96]], %[[VAR96]] : f32 -// CHECK: %[[CST_3:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[CST_4:.*]] = arith.constant 1.000000e+00 : f32 -// CHECK: %[[VAR116:.*]] = arith.select %[[VAR111]], %[[CST_4]], %[[CST_3]] : f32 -// CHECK: %[[VAR117:.*]] = math.copysign %[[VAR116]], %[[VAR90]] : f32 -// CHECK: %[[VAR118:.*]] = arith.select %[[VAR113]], %[[VAR117]], %[[VAR90]] : f32 -// CHECK: %[[VAR119:.*]] = arith.select %[[VAR112]], %[[CST_4]], %[[CST_3]] : f32 -// CHECK: %[[VAR120:.*]] = math.copysign %[[VAR119]], %[[VAR92]] : f32 -// CHECK: %[[VAR121:.*]] = arith.select %[[VAR113]], %[[VAR120]], %[[VAR92]] : f32 -// CHECK: %[[VAR122:.*]] = arith.andi %[[VAR113]], %[[VAR114]] : i1 -// CHECK: %[[VAR123:.*]] = math.copysign %[[CST_3]], %[[VAR94]] : f32 -// CHECK: %[[VAR124:.*]] = arith.select %[[VAR122]], %[[VAR123]], %[[VAR94]] : f32 -// CHECK: %[[VAR125:.*]] = arith.andi %[[VAR113]], %[[VAR115]] : i1 -// CHECK: %[[VAR126:.*]] = math.copysign %[[CST_3]], %[[VAR96]] : f32 -// CHECK: %[[VAR127:.*]] = arith.select %[[VAR125]], %[[VAR126]], %[[VAR96]] : f32 -// CHECK: %[[VAR128:.*]] = arith.cmpf oeq, %[[VAR95]], %[[CST_2]] : f32 -// CHECK: %[[VAR129:.*]] = arith.cmpf oeq, %[[VAR97]], %[[CST_2]] : f32 -// CHECK: %[[VAR130:.*]] = arith.ori %[[VAR128]], %[[VAR129]] : i1 -// CHECK: %[[VAR131:.*]] = arith.cmpf uno, %[[VAR118]], %[[VAR118]] : f32 -// CHECK: %[[VAR132:.*]] = arith.cmpf uno, %[[VAR121]], %[[VAR121]] : f32 -// CHECK: %[[VAR133:.*]] = arith.select %[[VAR128]], %[[CST_4]], %[[CST_3]] : f32 -// CHECK: %[[VAR134:.*]] = math.copysign %[[VAR133]], %[[VAR124]] : f32 -// CHECK: %[[VAR135:.*]] = arith.select %[[VAR130]], %[[VAR134]], %[[VAR124]] : f32 -// CHECK: %[[VAR136:.*]] = arith.select %[[VAR129]], %[[CST_4]], %[[CST_3]] : f32 -// CHECK: 
%[[VAR137:.*]] = math.copysign %[[VAR136]], %[[VAR127]] : f32 -// CHECK: %[[VAR138:.*]] = arith.select %[[VAR130]], %[[VAR137]], %[[VAR127]] : f32 -// CHECK: %[[VAR139:.*]] = arith.andi %[[VAR130]], %[[VAR131]] : i1 -// CHECK: %[[VAR140:.*]] = math.copysign %[[CST_3]], %[[VAR118]] : f32 -// CHECK: %[[VAR141:.*]] = arith.select %[[VAR139]], %[[VAR140]], %[[VAR118]] : f32 -// CHECK: %[[VAR142:.*]] = arith.andi %[[VAR130]], %[[VAR132]] : i1 -// CHECK: %[[VAR143:.*]] = math.copysign %[[CST_3]], %[[VAR121]] : f32 -// CHECK: %[[VAR144:.*]] = arith.select %[[VAR142]], %[[VAR143]], %[[VAR121]] : f32 -// CHECK: %[[VAR145:.*]] = arith.ori %[[VAR113]], %[[VAR130]] : i1 -// CHECK: %[[VAR146:.*]] = arith.cmpf oeq, %[[VAR99]], %[[CST_2]] : f32 -// CHECK: %[[VAR147:.*]] = arith.cmpf oeq, %[[VAR101]], %[[CST_2]] : f32 -// CHECK: %[[VAR148:.*]] = arith.ori %[[VAR146]], %[[VAR147]] : i1 -// CHECK: %[[VAR149:.*]] = arith.cmpf oeq, %[[VAR106]], %[[CST_2]] : f32 -// CHECK: %[[VAR150:.*]] = arith.ori %[[VAR148]], %[[VAR149]] : i1 -// CHECK: %[[VAR151:.*]] = arith.cmpf oeq, %[[VAR104]], %[[CST_2]] : f32 -// CHECK: %[[VAR152:.*]] = arith.ori %[[VAR150]], %[[VAR151]] : i1 -// CHECK: %[[TRUE_5:.*]] = arith.constant true -// CHECK: %[[VAR153:.*]] = arith.xori %[[VAR145]], %[[TRUE_5]] : i1 -// CHECK: %[[VAR154:.*]] = arith.andi %[[VAR152]], %[[VAR153]] : i1 -// CHECK: %[[VAR155:.*]] = arith.andi %[[VAR154]], %[[VAR131]] : i1 -// CHECK: %[[VAR156:.*]] = math.copysign %[[CST_3]], %[[VAR141]] : f32 -// CHECK: %[[VAR157:.*]] = arith.select %[[VAR155]], %[[VAR156]], %[[VAR141]] : f32 -// CHECK: %[[VAR158:.*]] = arith.andi %[[VAR154]], %[[VAR132]] : i1 -// CHECK: %[[VAR159:.*]] = math.copysign %[[CST_3]], %[[VAR144]] : f32 -// CHECK: %[[VAR160:.*]] = arith.select %[[VAR158]], %[[VAR159]], %[[VAR144]] : f32 -// CHECK: %[[VAR161:.*]] = arith.andi %[[VAR154]], %[[VAR114]] : i1 -// CHECK: %[[VAR162:.*]] = math.copysign %[[CST_3]], %[[VAR135]] : f32 -// CHECK: %[[VAR163:.*]] = arith.select %[[VAR161]], 
%[[VAR162]], %[[VAR135]] : f32 -// CHECK: %[[VAR164:.*]] = arith.andi %[[VAR154]], %[[VAR115]] : i1 -// CHECK: %[[VAR165:.*]] = math.copysign %[[CST_3]], %[[VAR138]] : f32 -// CHECK: %[[VAR166:.*]] = arith.select %[[VAR164]], %[[VAR165]], %[[VAR138]] : f32 -// CHECK: %[[VAR167:.*]] = arith.ori %[[VAR145]], %[[VAR154]] : i1 -// CHECK: %[[VAR168:.*]] = arith.andi %[[VAR110]], %[[VAR167]] : i1 -// CHECK: %[[VAR169:.*]] = arith.mulf %[[VAR157]], %[[VAR163]] fastmath : f32 -// CHECK: %[[VAR170:.*]] = arith.mulf %[[VAR160]], %[[VAR166]] fastmath : f32 -// CHECK: %[[VAR171:.*]] = arith.subf %[[VAR169]], %[[VAR170]] fastmath : f32 -// CHECK: %[[VAR172:.*]] = arith.mulf %[[CST_2]], %[[VAR171]] fastmath : f32 -// CHECK: %[[VAR173:.*]] = arith.select %[[VAR168]], %[[VAR172]], %[[VAR102]] : f32 -// CHECK: %[[VAR174:.*]] = arith.mulf %[[VAR160]], %[[VAR163]] fastmath : f32 -// CHECK: %[[VAR175:.*]] = arith.mulf %[[VAR157]], %[[VAR166]] fastmath : f32 -// CHECK: %[[VAR176:.*]] = arith.addf %[[VAR174]], %[[VAR175]] fastmath : f32 -// CHECK: %[[VAR177:.*]] = arith.mulf %[[CST_2]], %[[VAR176]] fastmath : f32 -// CHECK: %[[VAR178:.*]] = arith.select %[[VAR168]], %[[VAR177]], %[[VAR107]] : f32 -// CHECK: %[[VAR179:.*]] = complex.create %[[VAR173]], %[[VAR178]] : complex +// CHECK: %[[VAR179:.*]] = complex.create %[[VAR102]], %[[VAR107]] : complex // CHECK: %[[VAR180:.*]] = complex.re %[[VAR89]] : complex // CHECK: %[[VAR181:.*]] = complex.re %[[VAR179]] : complex // CHECK: %[[VAR182:.*]] = arith.addf %[[VAR180]], %[[VAR181]] fastmath : f32 @@ -1343,99 +982,16 @@ func.func @complex_atan2_with_fmf(%lhs: complex, // CHECK: %[[CST_11:.*]] = arith.constant 1.000000e+00 : f32 // CHECK: %[[VAR229:.*]] = complex.create %[[CST_10]], %[[CST_11]] : complex // CHECK: %[[VAR230:.*]] = complex.re %[[VAR229]] : complex -// CHECK: %[[VAR231:.*]] = math.absf %[[VAR230]] fastmath : f32 // CHECK: %[[VAR232:.*]] = complex.im %[[VAR229]] : complex -// CHECK: %[[VAR233:.*]] = math.absf %[[VAR232]] 
fastmath : f32 // CHECK: %[[VAR234:.*]] = complex.re %arg0 : complex -// CHECK: %[[VAR235:.*]] = math.absf %[[VAR234]] fastmath : f32 // CHECK: %[[VAR236:.*]] = complex.im %arg0 : complex -// CHECK: %[[VAR237:.*]] = math.absf %[[VAR236]] fastmath : f32 // CHECK: %[[VAR238:.*]] = arith.mulf %[[VAR230]], %[[VAR234]] fastmath : f32 -// CHECK: %[[VAR239:.*]] = math.absf %[[VAR238]] fastmath : f32 // CHECK: %[[VAR240:.*]] = arith.mulf %[[VAR232]], %[[VAR236]] fastmath : f32 -// CHECK: %[[VAR241:.*]] = math.absf %[[VAR240]] fastmath : f32 // CHECK: %[[VAR242:.*]] = arith.subf %[[VAR238]], %[[VAR240]] fastmath : f32 // CHECK: %[[VAR243:.*]] = arith.mulf %[[VAR232]], %[[VAR234]] fastmath : f32 -// CHECK: %[[VAR244:.*]] = math.absf %[[VAR243]] fastmath : f32 // CHECK: %[[VAR245:.*]] = arith.mulf %[[VAR230]], %[[VAR236]] fastmath : f32 -// CHECK: %[[VAR246:.*]] = math.absf %[[VAR245]] fastmath : f32 // CHECK: %[[VAR247:.*]] = arith.addf %[[VAR243]], %[[VAR245]] fastmath : f32 -// CHECK: %[[VAR248:.*]] = arith.cmpf uno, %[[VAR242]], %[[VAR242]] : f32 -// CHECK: %[[VAR249:.*]] = arith.cmpf uno, %[[VAR247]], %[[VAR247]] : f32 -// CHECK: %[[VAR250:.*]] = arith.andi %[[VAR248]], %[[VAR249]] : i1 -// CHECK: %[[CST_12:.*]] = arith.constant 0x7F800000 : f32 -// CHECK: %[[VAR251:.*]] = arith.cmpf oeq, %[[VAR231]], %[[CST_12]] : f32 -// CHECK: %[[VAR252:.*]] = arith.cmpf oeq, %[[VAR233]], %[[CST_12]] : f32 -// CHECK: %[[VAR253:.*]] = arith.ori %[[VAR251]], %[[VAR252]] : i1 -// CHECK: %[[VAR254:.*]] = arith.cmpf uno, %[[VAR234]], %[[VAR234]] : f32 -// CHECK: %[[VAR255:.*]] = arith.cmpf uno, %[[VAR236]], %[[VAR236]] : f32 -// CHECK: %[[CST_13:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[CST_14:.*]] = arith.constant 1.000000e+00 : f32 -// CHECK: %[[VAR256:.*]] = arith.select %[[VAR251]], %[[CST_14]], %[[CST_13]] : f32 -// CHECK: %[[VAR257:.*]] = math.copysign %[[VAR256]], %[[VAR230]] : f32 -// CHECK: %[[VAR258:.*]] = arith.select %[[VAR253]], %[[VAR257]], %[[VAR230]] : f32 -// 
CHECK: %[[VAR259:.*]] = arith.select %[[VAR252]], %[[CST_14]], %[[CST_13]] : f32 -// CHECK: %[[VAR260:.*]] = math.copysign %[[VAR259]], %[[VAR232]] : f32 -// CHECK: %[[VAR261:.*]] = arith.select %[[VAR253]], %[[VAR260]], %[[VAR232]] : f32 -// CHECK: %[[VAR262:.*]] = arith.andi %[[VAR253]], %[[VAR254]] : i1 -// CHECK: %[[VAR263:.*]] = math.copysign %[[CST_13]], %[[VAR234]] : f32 -// CHECK: %[[VAR264:.*]] = arith.select %[[VAR262]], %[[VAR263]], %[[VAR234]] : f32 -// CHECK: %[[VAR265:.*]] = arith.andi %[[VAR253]], %[[VAR255]] : i1 -// CHECK: %[[VAR266:.*]] = math.copysign %[[CST_13]], %[[VAR236]] : f32 -// CHECK: %[[VAR267:.*]] = arith.select %[[VAR265]], %[[VAR266]], %[[VAR236]] : f32 -// CHECK: %[[VAR268:.*]] = arith.cmpf oeq, %[[VAR235]], %[[CST_12]] : f32 -// CHECK: %[[VAR269:.*]] = arith.cmpf oeq, %[[VAR237]], %[[CST_12]] : f32 -// CHECK: %[[VAR270:.*]] = arith.ori %[[VAR268]], %[[VAR269]] : i1 -// CHECK: %[[VAR271:.*]] = arith.cmpf uno, %[[VAR258]], %[[VAR258]] : f32 -// CHECK: %[[VAR272:.*]] = arith.cmpf uno, %[[VAR261]], %[[VAR261]] : f32 -// CHECK: %[[VAR273:.*]] = arith.select %[[VAR268]], %[[CST_14]], %[[CST_13]] : f32 -// CHECK: %[[VAR274:.*]] = math.copysign %[[VAR273]], %[[VAR264]] : f32 -// CHECK: %[[VAR275:.*]] = arith.select %[[VAR270]], %[[VAR274]], %[[VAR264]] : f32 -// CHECK: %[[VAR276:.*]] = arith.select %[[VAR269]], %[[CST_14]], %[[CST_13]] : f32 -// CHECK: %[[VAR277:.*]] = math.copysign %[[VAR276]], %[[VAR267]] : f32 -// CHECK: %[[VAR278:.*]] = arith.select %[[VAR270]], %[[VAR277]], %[[VAR267]] : f32 -// CHECK: %[[VAR279:.*]] = arith.andi %[[VAR270]], %[[VAR271]] : i1 -// CHECK: %[[VAR280:.*]] = math.copysign %[[CST_13]], %[[VAR258]] : f32 -// CHECK: %[[VAR281:.*]] = arith.select %[[VAR279]], %[[VAR280]], %[[VAR258]] : f32 -// CHECK: %[[VAR282:.*]] = arith.andi %[[VAR270]], %[[VAR272]] : i1 -// CHECK: %[[VAR283:.*]] = math.copysign %[[CST_13]], %[[VAR261]] : f32 -// CHECK: %[[VAR284:.*]] = arith.select %[[VAR282]], %[[VAR283]], %[[VAR261]] : 
f32 -// CHECK: %[[VAR285:.*]] = arith.ori %[[VAR253]], %[[VAR270]] : i1 -// CHECK: %[[VAR286:.*]] = arith.cmpf oeq, %[[VAR239]], %[[CST_12]] : f32 -// CHECK: %[[VAR287:.*]] = arith.cmpf oeq, %[[VAR241]], %[[CST_12]] : f32 -// CHECK: %[[VAR288:.*]] = arith.ori %[[VAR286]], %[[VAR287]] : i1 -// CHECK: %[[VAR289:.*]] = arith.cmpf oeq, %[[VAR246]], %[[CST_12]] : f32 -// CHECK: %[[VAR290:.*]] = arith.ori %[[VAR288]], %[[VAR289]] : i1 -// CHECK: %[[VAR291:.*]] = arith.cmpf oeq, %[[VAR244]], %[[CST_12]] : f32 -// CHECK: %[[VAR292:.*]] = arith.ori %[[VAR290]], %[[VAR291]] : i1 -// CHECK: %[[TRUE_15:.*]] = arith.constant true -// CHECK: %[[VAR293:.*]] = arith.xori %[[VAR285]], %[[TRUE_15]] : i1 -// CHECK: %[[VAR294:.*]] = arith.andi %[[VAR292]], %[[VAR293]] : i1 -// CHECK: %[[VAR295:.*]] = arith.andi %[[VAR294]], %[[VAR271]] : i1 -// CHECK: %[[VAR296:.*]] = math.copysign %[[CST_13]], %[[VAR281]] : f32 -// CHECK: %[[VAR297:.*]] = arith.select %[[VAR295]], %[[VAR296]], %[[VAR281]] : f32 -// CHECK: %[[VAR298:.*]] = arith.andi %[[VAR294]], %[[VAR272]] : i1 -// CHECK: %[[VAR299:.*]] = math.copysign %[[CST_13]], %[[VAR284]] : f32 -// CHECK: %[[VAR300:.*]] = arith.select %[[VAR298]], %[[VAR299]], %[[VAR284]] : f32 -// CHECK: %[[VAR301:.*]] = arith.andi %[[VAR294]], %[[VAR254]] : i1 -// CHECK: %[[VAR302:.*]] = math.copysign %[[CST_13]], %[[VAR275]] : f32 -// CHECK: %[[VAR303:.*]] = arith.select %[[VAR301]], %[[VAR302]], %[[VAR275]] : f32 -// CHECK: %[[VAR304:.*]] = arith.andi %[[VAR294]], %[[VAR255]] : i1 -// CHECK: %[[VAR305:.*]] = math.copysign %[[CST_13]], %[[VAR278]] : f32 -// CHECK: %[[VAR306:.*]] = arith.select %[[VAR304]], %[[VAR305]], %[[VAR278]] : f32 -// CHECK: %[[VAR307:.*]] = arith.ori %[[VAR285]], %[[VAR294]] : i1 -// CHECK: %[[VAR308:.*]] = arith.andi %[[VAR250]], %[[VAR307]] : i1 -// CHECK: %[[VAR309:.*]] = arith.mulf %[[VAR297]], %[[VAR303]] fastmath : f32 -// CHECK: %[[VAR310:.*]] = arith.mulf %[[VAR300]], %[[VAR306]] fastmath : f32 -// CHECK: %[[VAR311:.*]] = 
arith.subf %[[VAR309]], %[[VAR310]] fastmath : f32 -// CHECK: %[[VAR312:.*]] = arith.mulf %[[CST_12]], %[[VAR311]] fastmath : f32 -// CHECK: %[[VAR313:.*]] = arith.select %[[VAR308]], %[[VAR312]], %[[VAR242]] : f32 -// CHECK: %[[VAR314:.*]] = arith.mulf %[[VAR300]], %[[VAR303]] fastmath : f32 -// CHECK: %[[VAR315:.*]] = arith.mulf %[[VAR297]], %[[VAR306]] fastmath : f32 -// CHECK: %[[VAR316:.*]] = arith.addf %[[VAR314]], %[[VAR315]] fastmath : f32 -// CHECK: %[[VAR317:.*]] = arith.mulf %[[CST_12]], %[[VAR316]] fastmath : f32 -// CHECK: %[[VAR318:.*]] = arith.select %[[VAR308]], %[[VAR317]], %[[VAR247]] : f32 -// CHECK: %[[VAR319:.*]] = complex.create %[[VAR313]], %[[VAR318]] : complex +// CHECK: %[[VAR319:.*]] = complex.create %[[VAR242]], %[[VAR247]] : complex // CHECK: %[[VAR320:.*]] = complex.re %arg1 : complex // CHECK: %[[VAR321:.*]] = complex.re %[[VAR319]] : complex // CHECK: %[[VAR322:.*]] = arith.addf %[[VAR320]], %[[VAR321]] fastmath : f32 @@ -1557,99 +1113,16 @@ func.func @complex_atan2_with_fmf(%lhs: complex, // CHECK: %[[CST_21:.*]] = arith.constant -1.000000e+00 : f32 // CHECK: %[[VAR441:.*]] = complex.create %[[CST_10]], %[[CST_21]] : complex // CHECK: %[[VAR442:.*]] = complex.re %[[VAR441]] : complex -// CHECK: %[[VAR443:.*]] = math.absf %[[VAR442]] fastmath : f32 // CHECK: %[[VAR444:.*]] = complex.im %[[VAR441]] : complex -// CHECK: %[[VAR445:.*]] = math.absf %[[VAR444]] fastmath : f32 // CHECK: %[[VAR446:.*]] = complex.re %[[VAR440]] : complex -// CHECK: %[[VAR447:.*]] = math.absf %[[VAR446]] fastmath : f32 // CHECK: %[[VAR448:.*]] = complex.im %[[VAR440]] : complex -// CHECK: %[[VAR449:.*]] = math.absf %[[VAR448]] fastmath : f32 // CHECK: %[[VAR450:.*]] = arith.mulf %[[VAR442]], %[[VAR446]] fastmath : f32 -// CHECK: %[[VAR451:.*]] = math.absf %[[VAR450]] fastmath : f32 // CHECK: %[[VAR452:.*]] = arith.mulf %[[VAR444]], %[[VAR448]] fastmath : f32 -// CHECK: %[[VAR453:.*]] = math.absf %[[VAR452]] fastmath : f32 // CHECK: %[[VAR454:.*]] = arith.subf 
%[[VAR450]], %[[VAR452]] fastmath : f32 // CHECK: %[[VAR455:.*]] = arith.mulf %[[VAR444]], %[[VAR446]] fastmath : f32 -// CHECK: %[[VAR456:.*]] = math.absf %[[VAR455]] fastmath : f32 // CHECK: %[[VAR457:.*]] = arith.mulf %[[VAR442]], %[[VAR448]] fastmath : f32 -// CHECK: %[[VAR458:.*]] = math.absf %[[VAR457]] fastmath : f32 // CHECK: %[[VAR459:.*]] = arith.addf %[[VAR455]], %[[VAR457]] fastmath : f32 -// CHECK: %[[VAR460:.*]] = arith.cmpf uno, %[[VAR454]], %[[VAR454]] : f32 -// CHECK: %[[VAR461:.*]] = arith.cmpf uno, %[[VAR459]], %[[VAR459]] : f32 -// CHECK: %[[VAR462:.*]] = arith.andi %[[VAR460]], %[[VAR461]] : i1 -// CHECK: %[[CST_22:.*]] = arith.constant 0x7F800000 : f32 -// CHECK: %[[VAR463:.*]] = arith.cmpf oeq, %[[VAR443]], %[[CST_22]] : f32 -// CHECK: %[[VAR464:.*]] = arith.cmpf oeq, %[[VAR445]], %[[CST_22]] : f32 -// CHECK: %[[VAR465:.*]] = arith.ori %[[VAR463]], %[[VAR464]] : i1 -// CHECK: %[[VAR466:.*]] = arith.cmpf uno, %[[VAR446]], %[[VAR446]] : f32 -// CHECK: %[[VAR467:.*]] = arith.cmpf uno, %[[VAR448]], %[[VAR448]] : f32 -// CHECK: %[[CST_23:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[CST_24:.*]] = arith.constant 1.000000e+00 : f32 -// CHECK: %[[VAR468:.*]] = arith.select %[[VAR463]], %[[CST_24]], %[[CST_23]] : f32 -// CHECK: %[[VAR469:.*]] = math.copysign %[[VAR468]], %[[VAR442]] : f32 -// CHECK: %[[VAR470:.*]] = arith.select %[[VAR465]], %[[VAR469]], %[[VAR442]] : f32 -// CHECK: %[[VAR471:.*]] = arith.select %[[VAR464]], %[[CST_24]], %[[CST_23]] : f32 -// CHECK: %[[VAR472:.*]] = math.copysign %[[VAR471]], %[[VAR444]] : f32 -// CHECK: %[[VAR473:.*]] = arith.select %[[VAR465]], %[[VAR472]], %[[VAR444]] : f32 -// CHECK: %[[VAR474:.*]] = arith.andi %[[VAR465]], %[[VAR466]] : i1 -// CHECK: %[[VAR475:.*]] = math.copysign %[[CST_23]], %[[VAR446]] : f32 -// CHECK: %[[VAR476:.*]] = arith.select %[[VAR474]], %[[VAR475]], %[[VAR446]] : f32 -// CHECK: %[[VAR477:.*]] = arith.andi %[[VAR465]], %[[VAR467]] : i1 -// CHECK: %[[VAR478:.*]] = math.copysign 
%[[CST_23]], %[[VAR448]] : f32 -// CHECK: %[[VAR479:.*]] = arith.select %[[VAR477]], %[[VAR478]], %[[VAR448]] : f32 -// CHECK: %[[VAR480:.*]] = arith.cmpf oeq, %[[VAR447]], %[[CST_22]] : f32 -// CHECK: %[[VAR481:.*]] = arith.cmpf oeq, %[[VAR449]], %[[CST_22]] : f32 -// CHECK: %[[VAR482:.*]] = arith.ori %[[VAR480]], %[[VAR481]] : i1 -// CHECK: %[[VAR483:.*]] = arith.cmpf uno, %[[VAR470]], %[[VAR470]] : f32 -// CHECK: %[[VAR484:.*]] = arith.cmpf uno, %[[VAR473]], %[[VAR473]] : f32 -// CHECK: %[[VAR485:.*]] = arith.select %[[VAR480]], %[[CST_24]], %[[CST_23]] : f32 -// CHECK: %[[VAR486:.*]] = math.copysign %[[VAR485]], %[[VAR476]] : f32 -// CHECK: %[[VAR487:.*]] = arith.select %[[VAR482]], %[[VAR486]], %[[VAR476]] : f32 -// CHECK: %[[VAR488:.*]] = arith.select %[[VAR481]], %[[CST_24]], %[[CST_23]] : f32 -// CHECK: %[[VAR489:.*]] = math.copysign %[[VAR488]], %[[VAR479]] : f32 -// CHECK: %[[VAR490:.*]] = arith.select %[[VAR482]], %[[VAR489]], %[[VAR479]] : f32 -// CHECK: %[[VAR491:.*]] = arith.andi %[[VAR482]], %[[VAR483]] : i1 -// CHECK: %[[VAR492:.*]] = math.copysign %[[CST_23]], %[[VAR470]] : f32 -// CHECK: %[[VAR493:.*]] = arith.select %[[VAR491]], %[[VAR492]], %[[VAR470]] : f32 -// CHECK: %[[VAR494:.*]] = arith.andi %[[VAR482]], %[[VAR484]] : i1 -// CHECK: %[[VAR495:.*]] = math.copysign %[[CST_23]], %[[VAR473]] : f32 -// CHECK: %[[VAR496:.*]] = arith.select %[[VAR494]], %[[VAR495]], %[[VAR473]] : f32 -// CHECK: %[[VAR497:.*]] = arith.ori %[[VAR465]], %[[VAR482]] : i1 -// CHECK: %[[VAR498:.*]] = arith.cmpf oeq, %[[VAR451]], %[[CST_22]] : f32 -// CHECK: %[[VAR499:.*]] = arith.cmpf oeq, %[[VAR453]], %[[CST_22]] : f32 -// CHECK: %[[VAR500:.*]] = arith.ori %[[VAR498]], %[[VAR499]] : i1 -// CHECK: %[[VAR501:.*]] = arith.cmpf oeq, %[[VAR458]], %[[CST_22]] : f32 -// CHECK: %[[VAR502:.*]] = arith.ori %[[VAR500]], %[[VAR501]] : i1 -// CHECK: %[[VAR503:.*]] = arith.cmpf oeq, %[[VAR456]], %[[CST_22]] : f32 -// CHECK: %[[VAR504:.*]] = arith.ori %[[VAR502]], %[[VAR503]] : i1 -// 
CHECK: %[[TRUE_25:.*]] = arith.constant true -// CHECK: %[[VAR505:.*]] = arith.xori %[[VAR497]], %[[TRUE_25]] : i1 -// CHECK: %[[VAR506:.*]] = arith.andi %[[VAR504]], %[[VAR505]] : i1 -// CHECK: %[[VAR507:.*]] = arith.andi %[[VAR506]], %[[VAR483]] : i1 -// CHECK: %[[VAR508:.*]] = math.copysign %[[CST_23]], %[[VAR493]] : f32 -// CHECK: %[[VAR509:.*]] = arith.select %[[VAR507]], %[[VAR508]], %[[VAR493]] : f32 -// CHECK: %[[VAR510:.*]] = arith.andi %[[VAR506]], %[[VAR484]] : i1 -// CHECK: %[[VAR511:.*]] = math.copysign %[[CST_23]], %[[VAR496]] : f32 -// CHECK: %[[VAR512:.*]] = arith.select %[[VAR510]], %[[VAR511]], %[[VAR496]] : f32 -// CHECK: %[[VAR513:.*]] = arith.andi %[[VAR506]], %[[VAR466]] : i1 -// CHECK: %[[VAR514:.*]] = math.copysign %[[CST_23]], %[[VAR487]] : f32 -// CHECK: %[[VAR515:.*]] = arith.select %[[VAR513]], %[[VAR514]], %[[VAR487]] : f32 -// CHECK: %[[VAR516:.*]] = arith.andi %[[VAR506]], %[[VAR467]] : i1 -// CHECK: %[[VAR517:.*]] = math.copysign %[[CST_23]], %[[VAR490]] : f32 -// CHECK: %[[VAR518:.*]] = arith.select %[[VAR516]], %[[VAR517]], %[[VAR490]] : f32 -// CHECK: %[[VAR519:.*]] = arith.ori %[[VAR497]], %[[VAR506]] : i1 -// CHECK: %[[VAR520:.*]] = arith.andi %[[VAR462]], %[[VAR519]] : i1 -// CHECK: %[[VAR521:.*]] = arith.mulf %[[VAR509]], %[[VAR515]] fastmath : f32 -// CHECK: %[[VAR522:.*]] = arith.mulf %[[VAR512]], %[[VAR518]] fastmath : f32 -// CHECK: %[[VAR523:.*]] = arith.subf %[[VAR521]], %[[VAR522]] fastmath : f32 -// CHECK: %[[VAR524:.*]] = arith.mulf %[[CST_22]], %[[VAR523]] fastmath : f32 -// CHECK: %[[VAR525:.*]] = arith.select %[[VAR520]], %[[VAR524]], %[[VAR454]] : f32 -// CHECK: %[[VAR526:.*]] = arith.mulf %[[VAR512]], %[[VAR515]] fastmath : f32 -// CHECK: %[[VAR527:.*]] = arith.mulf %[[VAR509]], %[[VAR518]] fastmath : f32 -// CHECK: %[[VAR528:.*]] = arith.addf %[[VAR526]], %[[VAR527]] fastmath : f32 -// CHECK: %[[VAR529:.*]] = arith.mulf %[[CST_22]], %[[VAR528]] fastmath : f32 -// CHECK: %[[VAR530:.*]] = arith.select %[[VAR520]], 
%[[VAR529]], %[[VAR459]] : f32 -// CHECK: %[[VAR531:.*]] = complex.create %[[VAR525]], %[[VAR530]] : complex +// CHECK: %[[VAR531:.*]] = complex.create %[[VAR454]], %[[VAR459]] : complex // CHECK: return %[[VAR531]] : complex // ----- diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir index 16b692b968939..e75225d6d54f5 100644 --- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir +++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir @@ -1,6 +1,8 @@ -// RUN: mlir-opt -pass-pipeline="builtin.module(gpu.module(convert-gpu-to-llvm-spv))" -split-input-file -verify-diagnostics %s \ +// RUN: mlir-opt -pass-pipeline="builtin.module(gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true}))" -split-input-file -verify-diagnostics %s \ // RUN: | FileCheck --check-prefixes=CHECK-64,CHECK %s -// RUN: mlir-opt -pass-pipeline="builtin.module(gpu.module(convert-gpu-to-llvm-spv{index-bitwidth=32}))" -split-input-file -verify-diagnostics %s \ +// RUN: mlir-opt -pass-pipeline="builtin.module(gpu.module(convert-gpu-to-llvm-spv))" -split-input-file -verify-diagnostics %s \ +// RUN: | FileCheck --check-prefixes=CHECK-32,CHECK %s +// RUN: mlir-opt -pass-pipeline="builtin.module(gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=false}))" -split-input-file -verify-diagnostics %s \ // RUN: | FileCheck --check-prefixes=CHECK-32,CHECK %s gpu.module @builtins { diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir index e851019362958..529a16ca48c7e 100644 --- a/mlir/test/Dialect/Tosa/level_check.mlir +++ b/mlir/test/Dialect/Tosa/level_check.mlir @@ -143,6 +143,14 @@ func.func @test_const_f64(%arg0 : tensor<1xf64>) { // ----- +func.func @test_const_ui8(%arg0 : tensor<1xui8>) { + // expected-error@+1 {{'tosa.const' op is not profile-aligned: element type 'ui8' is not legal}} + %0 = "tosa.const"() {value = dense<0> : tensor<1xui8>} : () -> tensor<1xui8> + return +} + 
+// ----- + func.func @test_avgpool2d_kernel_y(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> { // expected-error@+1 {{'tosa.avg_pool2d' op failed level check: kernel <= MAX_KERNEL}} %0 = "tosa.avg_pool2d"(%arg0) {kernel = array, pad = array, stride = array, acc_type = f32} : diff --git a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir index 9549de1258efc..2550730f3fdf2 100644 --- a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir @@ -33,9 +33,12 @@ module attributes {omp.is_target_device = true} { // CHECK: user_code.entry: ; preds = %entry // CHECK: %[[LOAD_BYREF:.*]] = load ptr, ptr %[[ALLOCA_BYREF]], align 8 +// CHECK: br label %outlined.body + +// CHECK: outlined.body: // CHECK: br label %omp.target -// CHECK: omp.target: ; preds = %user_code.entry +// CHECK: omp.target: // CHECK: %[[VAL_LOAD_BYCOPY:.*]] = load i32, ptr %[[ALLOCA_BYCOPY]], align 4 // CHECK: store i32 %[[VAL_LOAD_BYCOPY]], ptr %[[LOAD_BYREF]], align 4 // CHECK: br label %omp.region.cont diff --git a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir index e0c4c02e03a65..b59e03bc465a2 100644 --- a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir @@ -17,7 +17,7 @@ module attributes {omp.is_target_device = true} { llvm.func @_QQmain() attributes {} { %0 = llvm.mlir.addressof @_QMtest_0Esp : !llvm.ptr - // CHECK-DAG: omp.target: ; preds = %user_code.entry + // CHECK-DAG: omp.target: ; preds = %outlined.body // CHECK-DAG: %[[V:.*]] = load ptr, ptr @_QMtest_0Esp_decl_tgt_ref_ptr, align 8 // CHECK-DAG: store i32 1, ptr %[[V]], align 4 // CHECK-DAG: br label %omp.region.cont diff --git 
a/mlir/test/Target/LLVMIR/openmp-target-multiple-private.mlir b/mlir/test/Target/LLVMIR/openmp-target-multiple-private.mlir new file mode 100644 index 0000000000000..c632a0ee42f8a --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-multiple-private.mlir @@ -0,0 +1,80 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @dealloc_foo_0(!llvm.ptr) + +omp.private {type = private} @box.heap_privatizer0 : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %7 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i32) -> !llvm.ptr + omp.yield(%7 : !llvm.ptr) +} dealloc { +^bb0(%arg0: !llvm.ptr): + llvm.call @dealloc_foo_0(%arg0) : (!llvm.ptr) -> () + omp.yield +} + +llvm.func @alloc_foo_1(!llvm.ptr) +llvm.func @dealloc_foo_1(!llvm.ptr) + +omp.private {type = private} @box.heap_privatizer1 : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %7 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i32) -> !llvm.ptr + llvm.call @alloc_foo_1(%arg0) : (!llvm.ptr) -> () + omp.yield(%7 : !llvm.ptr) +} dealloc { +^bb0(%arg0: !llvm.ptr): + llvm.call @dealloc_foo_1(%arg0) : (!llvm.ptr) -> () + omp.yield +} + +llvm.func @target_allocatable_(%arg0: !llvm.ptr {fir.bindc_name = "lb"}, %arg1: !llvm.ptr {fir.bindc_name = "ub"}, %arg2: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_allocatable"} { + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "mapped_var"} : (i64) -> !llvm.ptr + %13 = llvm.alloca %6 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "alloc_var0"} : (i64) -> !llvm.ptr + %14 = llvm.alloca %6 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "alloc_var1"} : (i64) -> !llvm.ptr + %53 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "mapped_var"} + %54 = omp.map.info var_ptr(%13 : 
!llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(to) capture(ByRef) -> !llvm.ptr + %55 = omp.map.info var_ptr(%14 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(to) capture(ByRef) -> !llvm.ptr + omp.target map_entries(%53 -> %arg3, %54 -> %arg4, %55 ->%arg5 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@box.heap_privatizer0 %13 -> %arg6 [map_idx=1], @box.heap_privatizer1 %14 -> %arg7 [map_idx=2]: !llvm.ptr, !llvm.ptr) { + llvm.call @use_private_var0(%arg6) : (!llvm.ptr) -> () + llvm.call @use_private_var1(%arg7) : (!llvm.ptr) -> () + omp.terminator + } + llvm.return +} + + +llvm.func @use_private_var0(!llvm.ptr) -> () +llvm.func @use_private_var1(!llvm.ptr) -> () + +// The first set of checks ensure that we are calling the offloaded function +// with the right arguments, especially the second argument which needs to +// be a memory reference to the descriptor for the privatized allocatable +// CHECK: define void @target_allocatable_ +// CHECK-NOT: define internal void +// CHECK: %[[DESC_ALLOC0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1 +// CHECK: %[[DESC_ALLOC1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1 +// CHECK: call void @__omp_offloading_[[OFFLOADED_FUNCTION:.*]](ptr {{[^,]+}}, +// CHECK-SAME: ptr %[[DESC_ALLOC0]], ptr %[[DESC_ALLOC1]]) + +// CHECK: define internal void @__omp_offloading_[[OFFLOADED_FUNCTION]] +// CHECK-SAME: (ptr {{[^,]+}}, ptr %[[DESCRIPTOR_ARG0:[^,]+]], +// CHECK-SAME: ptr %[[DESCRIPTOR_ARG1:.*]]) { + +// `var0` privatrizer `alloc` +// CHECK: %[[PRIV_DESC0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } + +// `var1` privatrizer `alloc` +// CHECK: %[[PRIV_DESC1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } +// CHECK: call void @alloc_foo_1(ptr %[[DESCRIPTOR_ARG1]]) + +// target op body +// CHECK: call void @use_private_var0(ptr %[[PRIV_DESC0]] +// CHECK: call void @use_private_var1(ptr %[[PRIV_DESC1]] + +// `var0` privatrizer `dealloc` +// CHECK: call void @dealloc_foo_0(ptr 
%[[PRIV_DESC0]]) + +// `var1` privatrizer `dealloc` +// CHECK: call void @dealloc_foo_1(ptr %[[PRIV_DESC1]]) diff --git a/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir b/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir new file mode 100644 index 0000000000000..88b4a6a63c7eb --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @alloc_foo_1(!llvm.ptr) +llvm.func @dealloc_foo_1(!llvm.ptr) + +omp.private {type = private} @box.heap_privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %7 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i32) -> !llvm.ptr + llvm.call @alloc_foo_1(%arg0) : (!llvm.ptr) -> () + omp.yield(%7 : !llvm.ptr) +} dealloc { +^bb0(%arg0: !llvm.ptr): + llvm.call @dealloc_foo_1(%arg0) : (!llvm.ptr) -> () + omp.yield +} + +llvm.func @target_allocatable_(%arg0: !llvm.ptr {fir.bindc_name = "lb"}, %arg1: !llvm.ptr {fir.bindc_name = "ub"}, %arg2: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_allocatable"} { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %3 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i64) : i64 + %5 = llvm.alloca %4 x f32 {bindc_name = "real_var"} : (i64) -> !llvm.ptr + %7 = llvm.alloca %4 x i32 {bindc_name = "mapped_var"} : (i64) -> !llvm.ptr + %9 = llvm.alloca %4 x !llvm.struct<(f32, f32)> {bindc_name = "comp_var"} : (i64) -> !llvm.ptr + %11 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %13 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "alloc_var"} : (i64) -> !llvm.ptr + %39 = llvm.load %arg2 : !llvm.ptr 
-> i64 + %52 = llvm.alloca %39 x f32 {bindc_name = "real_arr"} : (i64) -> !llvm.ptr + %53 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "mapped_var"} + %54 = omp.map.info var_ptr(%13 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(to) capture(ByRef) -> !llvm.ptr + omp.target map_entries(%53 -> %arg3, %54 -> %arg4 : !llvm.ptr, !llvm.ptr) private(@box.heap_privatizer %13 -> %arg5 [map_idx=1] : !llvm.ptr) { + llvm.call @use_private_var(%arg5) : (!llvm.ptr) -> () + omp.terminator + } + llvm.return +} + +llvm.func @use_private_var(!llvm.ptr) -> () + +llvm.func @_FortranAAssign(!llvm.ptr, !llvm.ptr, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"} + +// The first set of checks ensure that we are calling the offloaded function +// with the right arguments, especially the second argument which needs to +// be a memory reference to the descriptor for the privatized allocatable +// CHECK: define void @target_allocatable_ +// CHECK-NOT: define internal void +// CHECK: %[[DESC_ALLOC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1 +// CHECK: call void @__omp_offloading_[[OFFLOADED_FUNCTION:.*]](ptr {{[^,]+}}, +// CHECK-SAME: ptr %[[DESC_ALLOC]]) + +// The second set of checks ensure that to allocate memory for the +// allocatable, we are, in fact, using the memory reference of the descriptor +// passed as the second argument to the offloaded function. +// CHECK: define internal void @__omp_offloading_[[OFFLOADED_FUNCTION]] +// CHECK-SAME: (ptr {{[^,]+}}, ptr %[[DESCRIPTOR_ARG:.*]]) { +// CHECK: %[[DESC_TO_DEALLOC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } +// CHECK: call void @alloc_foo_1(ptr %[[DESCRIPTOR_ARG]]) + + +// CHECK: call void @use_private_var(ptr %[[DESC_TO_DEALLOC]] + +// Now, check the deallocation of the private var. 
+// CHECK: call void @dealloc_foo_1(ptr %[[DESC_TO_DEALLOC]]) diff --git a/mlir/test/Target/LLVMIR/openmp-target-private.mlir b/mlir/test/Target/LLVMIR/openmp-target-private.mlir index e41b18f593efe..c9d5f37384a0b 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-private.mlir @@ -94,3 +94,92 @@ llvm.func @target_op_private_multi_block(%arg0: !llvm.ptr) { // CHECK: %[[PRIV_ALLOC:.*]] = alloca float, i32 %[[ONE]], align 4 // CHECK: %[[PHI_ALLOCA:.*]] = phi ptr [ %[[PRIV_ALLOC]], {{.*}} ] // CHECK: %[[RESULT:.*]] = load float, ptr %[[PHI_ALLOCA]], align 4 + +// Descriptors are needed for CHARACTER arrays and their type is +// !fir.boxchar. When such arrays are used in the private construct, the +// privatizer takes a !fir.boxchar as input. This type is lowered to +// !llvm.struct<(ptr, i64)>. This is unique because with other types of data, +// typically, the privatizer funtion takes a !llvm.ptr. Now, on the host side, +// we map the descriptor using the map clause of the omp.target op. Map clauses +// take only !llvm.ptr types. This means, we have a case where the descriptor is +// mapped by its pointer whereas the privatizer function expects the descriptor +// by value. So, we have this test to ensure that the compiler correctly loads +// from the mapped pointer before passing that to the privatizer function. 
+omp.private {type = private} @_QFtarget_boxcharEchar_var_private_boxchar_c8xU : !llvm.struct<(ptr, i64)> alloc { +^bb0(%arg0: !llvm.struct<(ptr, i64)>): + %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)> + %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)> + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %1 x i8 {bindc_name = "char_var", pinned} : (i64) -> !llvm.ptr + %4 = llvm.mlir.undef : !llvm.struct<(ptr, i64)> + %5 = llvm.insertvalue %3, %4[0] : !llvm.struct<(ptr, i64)> + %6 = llvm.insertvalue %1, %5[1] : !llvm.struct<(ptr, i64)> + omp.yield(%6 : !llvm.struct<(ptr, i64)>) +} +llvm.func @target_boxchar_(%arg0: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "mapped_var"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %0 x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(0 : i64) : i64 + %5 = llvm.load %arg0 : !llvm.ptr -> i64 + %6 = llvm.icmp "sgt" %5, %4 : i64 + %7 = llvm.select %6, %5, %4 : i1, i64 + %9 = llvm.alloca %7 x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr + %10 = llvm.mlir.undef : !llvm.struct<(ptr, i64)> + %11 = llvm.insertvalue %9, %10[0] : !llvm.struct<(ptr, i64)> + %12 = llvm.insertvalue %7, %11[1] : !llvm.struct<(ptr, i64)> + %13 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "mapped_var"} + llvm.store %12, %3 : !llvm.struct<(ptr, i64)>, !llvm.ptr + %14 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) -> !llvm.ptr + omp.target map_entries(%13 -> %arg1, %14 -> %arg2 : !llvm.ptr, !llvm.ptr) private(@_QFtarget_boxcharEchar_var_private_boxchar_c8xU %12 -> %arg3 [map_idx=1] : !llvm.struct<(ptr, i64)>) { + %15 = llvm.mlir.constant(0 : index) : i64 + %16 = llvm.mlir.constant(32 : i8) : i8 + %17 = llvm.mlir.constant(1 : index) : i64 + %18 = 
llvm.mlir.constant(false) : i1 + %19 = llvm.mlir.constant(5 : index) : i64 + %20 = llvm.mlir.constant(5 : i32) : i32 + %21 = llvm.extractvalue %arg3[0] : !llvm.struct<(ptr, i64)> + %22 = llvm.extractvalue %arg3[1] : !llvm.struct<(ptr, i64)> + llvm.store %20, %arg1 : i32, !llvm.ptr + %23 = llvm.mlir.addressof @_QQclX68656C6C6F : !llvm.ptr + %24 = llvm.icmp "slt" %22, %19 : i64 + %25 = llvm.select %24, %22, %19 : i1, i64 + llvm.call @llvm.memmove.p0.p0.i64(%21, %23, %25, %18) : (!llvm.ptr, !llvm.ptr, i64, i1) -> () + %26 = llvm.sub %22, %17 : i64 + %27 = llvm.mlir.undef : !llvm.array<1 x i8> + %28 = llvm.insertvalue %16, %27[0] : !llvm.array<1 x i8> + %29 = llvm.sub %26, %25 : i64 + %30 = llvm.add %29, %17 : i64 + llvm.br ^bb1(%25, %30 : i64, i64) + ^bb1(%31: i64, %32: i64): // 2 preds: ^bb0, ^bb2 + %33 = llvm.icmp "sgt" %32, %15 : i64 + llvm.cond_br %33, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %34 = llvm.getelementptr %21[%31] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<1 x i8> + llvm.store %28, %34 : !llvm.array<1 x i8>, !llvm.ptr + %35 = llvm.add %31, %17 : i64 + %36 = llvm.sub %32, %17 : i64 + llvm.br ^bb1(%35, %36 : i64, i64) + ^bb3: // pred: ^bb1 + omp.terminator + } + llvm.return +} +llvm.mlir.global linkonce constant @_QQclX68656C6C6F() comdat(@__llvm_comdat::@_QQclX68656C6C6F) {addr_space = 0 : i32} : !llvm.array<5 x i8> { + %0 = llvm.mlir.constant("hello") : !llvm.array<5 x i8> + llvm.return %0 : !llvm.array<5 x i8> +} +llvm.comdat @__llvm_comdat { + llvm.comdat_selector @_QQclX68656C6C6F any +} +llvm.func @llvm.memmove.p0.p0.i64(!llvm.ptr, !llvm.ptr, i64, i1) attributes {sym_visibility = "private"} + + + +// CHECK: define internal void @__omp_offloading_{{.*}}(ptr %{{[^,]+}}, ptr %[[MAPPED_ARG:.*]]) { +// CHECK: %[[BOXCHAR:.*]] = load { ptr, i64 }, ptr %[[MAPPED_ARG]] +// CHECK: %[[BOXCHAR_PTR:.*]] = extractvalue { ptr, i64 } %[[BOXCHAR]], 0 +// CHECK: %[[BOXCHAR_i64:.*]] = extractvalue { ptr, i64 } %[[BOXCHAR]], 1 +// CHECK: %[[MEM_ALLOC:.*]] = alloca i8, i64 
%[[BOXCHAR_i64]] +// CHECK: %[[PRIV_BOXCHAR0:.*]] = insertvalue { ptr, i64 } undef, ptr %[[MEM_ALLOC]], 0 +// CHECK: %[[PRIV_BOXCHAR1:.*]] = insertvalue { ptr, i64 } %[[PRIV_BOXCHAR0]], i64 %[[BOXCHAR_i64]], 1 diff --git a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir index 3a71778e7d0a7..a94bbdce891f9 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir @@ -13,7 +13,11 @@ // CHECK: user_code.entry: ; preds = %[[VAL_10:.*]] // CHECK-NEXT: %[[VAL_11:.*]] = load ptr, ptr %[[VAL_3]], align 8 // CHECK-NEXT: br label %[[VAL_12:.*]] -// CHECK: omp.target: ; preds = %[[VAL_8]] + +// CHECK: [[VAL_12]]: +// CHECK-NEXT: br label %[[TARGET_REG_ENTRY:.*]] + +// CHECK: [[TARGET_REG_ENTRY]]: ; preds = %[[VAL_12]] // CHECK-NEXT: %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8 // CHECK-NEXT: store i32 999, ptr %[[VAL_13]], align 4 // CHECK-NEXT: br label %[[VAL_14:.*]] diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index c1e0014d1f571..8f3e466cfbbeb 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -346,24 +346,6 @@ llvm.func @target_firstprivate(%x : !llvm.ptr) { // ----- -omp.private {type = private} @x.privatizer : !llvm.ptr alloc { -^bb0(%arg0: !llvm.ptr): - omp.yield(%arg0 : !llvm.ptr) -} dealloc { -^bb0(%arg0: !llvm.ptr): - omp.yield -} -llvm.func @target_struct_privatization(%x : !llvm.ptr) { - // expected-error@below {{not yet implemented: privatization of structures in omp.target operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.target}} - omp.target private(@x.privatizer %x -> %arg0 : !llvm.ptr) { - omp.terminator - } - llvm.return -} - -// ----- - llvm.func @target_thread_limit(%x : i32) { // expected-error@below {{not yet implemented: Unhandled clause thread_limit in 
omp.target operation}} // expected-error@below {{LLVM Translation failed for operation: omp.target}} diff --git a/mlir/tools/mlir-cpu-runner/CMakeLists.txt b/mlir/tools/mlir-cpu-runner/CMakeLists.txt index ae6dbceca855d..811583b97bc71 100644 --- a/mlir/tools/mlir-cpu-runner/CMakeLists.txt +++ b/mlir/tools/mlir-cpu-runner/CMakeLists.txt @@ -11,12 +11,10 @@ add_mlir_tool(mlir-cpu-runner EXPORT_SYMBOLS ) llvm_update_compile_flags(mlir-cpu-runner) -target_link_libraries(mlir-cpu-runner PRIVATE +mlir_target_link_libraries(mlir-cpu-runner PRIVATE MLIRAnalysis MLIRBuiltinToLLVMIRTranslation - MLIRExecutionEngine MLIRIR - MLIRJitRunner MLIRLLVMDialect MLIRLLVMToLLVMIRTranslation MLIRToLLVMIRTranslationRegistration @@ -24,3 +22,7 @@ target_link_libraries(mlir-cpu-runner PRIVATE MLIRTargetLLVMIRExport MLIRSupport ) +target_link_libraries(mlir-cpu-runner PRIVATE + MLIRExecutionEngine + MLIRJitRunner + ) diff --git a/mlir/tools/mlir-lsp-server/CMakeLists.txt b/mlir/tools/mlir-lsp-server/CMakeLists.txt index 8ff9cc2f07e8e..6932e0f397795 100644 --- a/mlir/tools/mlir-lsp-server/CMakeLists.txt +++ b/mlir/tools/mlir-lsp-server/CMakeLists.txt @@ -38,7 +38,6 @@ set(LIBS ${conversion_libs} ${dialect_libs} ${extension_libs} - ${test_libs} MLIRAffineAnalysis MLIRAnalysis @@ -56,11 +55,9 @@ set(LIBS add_mlir_tool(mlir-lsp-server mlir-lsp-server.cpp - - DEPENDS - ${LIBS} ) -target_link_libraries(mlir-lsp-server PRIVATE ${LIBS}) +mlir_target_link_libraries(mlir-lsp-server PRIVATE ${LIBS}) +target_link_libraries(mlir-lsp-server PRIVATE ${test_libs}) llvm_update_compile_flags(mlir-lsp-server) mlir_check_all_link_libraries(mlir-lsp-server) diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 8b79de58fa102..3563d66fa9e79 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -45,6 +45,7 @@ if(MLIR_INCLUDE_TESTS) MLIRTestReducer MLIRTestTransforms MLIRTilingInterfaceTestPasses + MLIRTosaTestPasses MLIRVectorTestPasses 
MLIRTestVectorToSPIRV MLIRLLVMTestPasses @@ -66,7 +67,6 @@ set(LIBS ${dialect_libs} ${conversion_libs} ${extension_libs} - ${test_libs} MLIRAffineAnalysis MLIRAnalysis @@ -94,16 +94,16 @@ add_mlir_library(MLIRMlirOptMain LINK_LIBS PUBLIC ${LIBS} + ${test_libs} ) add_mlir_tool(mlir-opt mlir-opt.cpp - DEPENDS - ${LIBS} SUPPORT_PLUGINS ) -target_link_libraries(mlir-opt PRIVATE ${LIBS}) +mlir_target_link_libraries(mlir-opt PRIVATE ${LIBS}) +target_link_libraries(mlir-opt PRIVATE ${test_libs}) llvm_update_compile_flags(mlir-opt) mlir_check_all_link_libraries(mlir-opt) diff --git a/mlir/tools/mlir-parser-fuzzer/bytecode/CMakeLists.txt b/mlir/tools/mlir-parser-fuzzer/bytecode/CMakeLists.txt index 7d922656ad12f..e17b158851989 100644 --- a/mlir/tools/mlir-parser-fuzzer/bytecode/CMakeLists.txt +++ b/mlir/tools/mlir-parser-fuzzer/bytecode/CMakeLists.txt @@ -6,7 +6,7 @@ add_llvm_fuzzer(mlir-bytecode-parser-fuzzer mlir-bytecode-parser-fuzzer.cpp DUMMY_MAIN DummyParserFuzzer.cpp ) -target_link_libraries(mlir-bytecode-parser-fuzzer +mlir_target_link_libraries(mlir-bytecode-parser-fuzzer PUBLIC MLIRIR MLIRParser diff --git a/mlir/tools/mlir-parser-fuzzer/text/CMakeLists.txt b/mlir/tools/mlir-parser-fuzzer/text/CMakeLists.txt index a9c9e1047b54e..b4a2bacc0ee0b 100644 --- a/mlir/tools/mlir-parser-fuzzer/text/CMakeLists.txt +++ b/mlir/tools/mlir-parser-fuzzer/text/CMakeLists.txt @@ -6,7 +6,7 @@ add_llvm_fuzzer(mlir-text-parser-fuzzer mlir-text-parser-fuzzer.cpp DUMMY_MAIN DummyParserFuzzer.cpp ) -target_link_libraries(mlir-text-parser-fuzzer +mlir_target_link_libraries(mlir-text-parser-fuzzer PUBLIC MLIRIR MLIRParser diff --git a/mlir/tools/mlir-query/CMakeLists.txt b/mlir/tools/mlir-query/CMakeLists.txt index ef2e5a84b5569..18263970a7bbc 100644 --- a/mlir/tools/mlir-query/CMakeLists.txt +++ b/mlir/tools/mlir-query/CMakeLists.txt @@ -10,11 +10,11 @@ add_mlir_tool(mlir-query mlir-query.cpp ) llvm_update_compile_flags(mlir-query) -target_link_libraries(mlir-query 
+mlir_target_link_libraries(mlir-query PRIVATE ${dialect_libs} - ${test_libs} MLIRQueryLib ) +target_link_libraries(mlir-query PRIVATE ${test_libs}) mlir_check_link_libraries(mlir-query) diff --git a/mlir/tools/mlir-reduce/CMakeLists.txt b/mlir/tools/mlir-reduce/CMakeLists.txt index 8549dbf805f54..d71ac861a29dc 100644 --- a/mlir/tools/mlir-reduce/CMakeLists.txt +++ b/mlir/tools/mlir-reduce/CMakeLists.txt @@ -10,7 +10,6 @@ endif() set(LIBS ${conversion_libs} ${dialect_libs} - ${test_libs} MLIRDialect MLIRIR MLIRPass @@ -19,12 +18,10 @@ set(LIBS add_mlir_tool(mlir-reduce mlir-reduce.cpp - - DEPENDS - ${LIBS} ) -target_link_libraries(mlir-reduce PRIVATE ${LIBS}) +mlir_target_link_libraries(mlir-reduce PRIVATE ${LIBS}) +target_link_libraries(mlir-reduce PRIVATE ${test_libs}) llvm_update_compile_flags(mlir-reduce) mlir_check_all_link_libraries(mlir-reduce) diff --git a/mlir/tools/mlir-rewrite/CMakeLists.txt b/mlir/tools/mlir-rewrite/CMakeLists.txt index 5b8c1cd455399..216491eb432af 100644 --- a/mlir/tools/mlir-rewrite/CMakeLists.txt +++ b/mlir/tools/mlir-rewrite/CMakeLists.txt @@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS set(LIBS ${dialect_libs} - ${test_libs} MLIRAffineAnalysis MLIRAnalysis @@ -24,11 +23,9 @@ include_directories(../../../clang/include) add_mlir_tool(mlir-rewrite mlir-rewrite.cpp - DEPENDS - ${LIBS} SUPPORT_PLUGINS ) -target_link_libraries(mlir-rewrite PRIVATE ${LIBS}) +mlir_target_link_libraries(mlir-rewrite PRIVATE ${LIBS}) llvm_update_compile_flags(mlir-rewrite) mlir_check_all_link_libraries(mlir-rewrite) diff --git a/mlir/tools/mlir-translate/CMakeLists.txt b/mlir/tools/mlir-translate/CMakeLists.txt index a78131b8c356c..b356e04bb1dc4 100644 --- a/mlir/tools/mlir-translate/CMakeLists.txt +++ b/mlir/tools/mlir-translate/CMakeLists.txt @@ -9,11 +9,9 @@ add_mlir_tool(mlir-translate mlir-translate.cpp ) llvm_update_compile_flags(mlir-translate) -target_link_libraries(mlir-translate +mlir_target_link_libraries(mlir-translate PRIVATE ${dialect_libs} - 
${translation_libs} - ${test_libs} MLIRIR MLIRParser MLIRPass @@ -21,5 +19,10 @@ target_link_libraries(mlir-translate MLIRTranslateLib MLIRSupport ) +target_link_libraries(mlir-translate + PRIVATE + ${translation_libs} + ${test_libs} + ) mlir_check_link_libraries(mlir-translate) diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index 874974cc861df..7a73f9ba72877 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -26,6 +26,14 @@ enum OrderingTy { seq_cst = __ATOMIC_SEQ_CST, }; +enum ScopeTy { + system = __MEMORY_SCOPE_SYSTEM, + device_ = __MEMORY_SCOPE_DEVICE, + workgroup = __MEMORY_SCOPE_WRKGRP, + wavefront = __MEMORY_SCOPE_WVFRNT, + single = __MEMORY_SCOPE_SINGLE, +}; + enum MemScopeTy { all, // All threads on all devices device, // All threads on the device diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 9ea8d171cc830..3aee23a865d3c 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -232,50 +232,16 @@ void namedBarrier() { fence::team(atomic::release); } -// sema checking of amdgcn_fence is aggressive. Intention is to patch clang -// so that it is usable within a template environment and so that a runtime -// value of the memory order is expanded to this switch within clang/llvm. 
void fenceTeam(atomic::OrderingTy Ordering) { - switch (Ordering) { - default: - __builtin_unreachable(); - case atomic::aquire: - return __builtin_amdgcn_fence(atomic::aquire, "workgroup"); - case atomic::release: - return __builtin_amdgcn_fence(atomic::release, "workgroup"); - case atomic::acq_rel: - return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup"); - case atomic::seq_cst: - return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup"); - } + return __scoped_atomic_thread_fence(Ordering, atomic::workgroup); } + void fenceKernel(atomic::OrderingTy Ordering) { - switch (Ordering) { - default: - __builtin_unreachable(); - case atomic::aquire: - return __builtin_amdgcn_fence(atomic::aquire, "agent"); - case atomic::release: - return __builtin_amdgcn_fence(atomic::release, "agent"); - case atomic::acq_rel: - return __builtin_amdgcn_fence(atomic::acq_rel, "agent"); - case atomic::seq_cst: - return __builtin_amdgcn_fence(atomic::seq_cst, "agent"); - } + return __scoped_atomic_thread_fence(Ordering, atomic::device_); } + void fenceSystem(atomic::OrderingTy Ordering) { - switch (Ordering) { - default: - __builtin_unreachable(); - case atomic::aquire: - return __builtin_amdgcn_fence(atomic::aquire, ""); - case atomic::release: - return __builtin_amdgcn_fence(atomic::release, ""); - case atomic::acq_rel: - return __builtin_amdgcn_fence(atomic::acq_rel, ""); - case atomic::seq_cst: - return __builtin_amdgcn_fence(atomic::seq_cst, ""); - } + return __scoped_atomic_thread_fence(Ordering, atomic::system); } void syncWarp(__kmpc_impl_lanemask_t) {