From cb86359645d77688676f6e7d3806a3d052b51220 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 25 Jun 2025 14:39:58 -0700 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20change?= =?UTF-8?q?s=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.6-beta.1 [skip ci] --- bolt/include/bolt/Core/BinarySection.h | 5 - bolt/lib/Passes/PAuthGadgetScanner.cpp | 133 ++- .../AArch64/gs-pacret-autiasp.s | 31 +- .../AArch64/gs-pauth-authentication-oracles.s | 20 - .../binary-analysis/AArch64/gs-pauth-calls.s | 84 ++ .../AArch64/gs-pauth-debug-output.s | 32 +- .../AArch64/gs-pauth-signing-oracles.s | 27 - .../bugprone/SizeofExpressionCheck.cpp | 38 +- .../bugprone/SizeofExpressionCheck.h | 1 + clang-tools-extra/docs/ReleaseNotes.rst | 5 + .../checks/bugprone/sizeof-expression.rst | 9 + .../checkers/bugprone/sizeof-expression.cpp | 63 ++ .../bindings/python/tests/cindex/test_file.py | 1 + clang/docs/ReleaseNotes.rst | 5 +- clang/include/clang-c/Index.h | 125 ++- .../Analysis/FlowSensitive/StorageLocation.h | 2 - clang/include/clang/Basic/BuiltinsAArch64.def | 7 + clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 + clang/include/clang/Basic/BuiltinsPPC.def | 6 + .../clang/Basic/DiagnosticSemaKinds.td | 14 +- clang/include/clang/CIR/Dialect/IR/CIROps.td | 88 +- clang/include/clang/CIR/MissingFeatures.h | 154 +-- clang/include/clang/Lex/Preprocessor.h | 6 +- clang/include/clang/Lex/Token.h | 9 +- clang/include/clang/Serialization/ASTReader.h | 20 +- .../clang/Serialization/ASTRecordReader.h | 11 +- .../clang/Serialization/ASTRecordWriter.h | 11 +- clang/include/clang/Serialization/ASTWriter.h | 10 +- .../Serialization/SourceLocationEncoding.h | 120 +-- .../Core/PathSensitive/CallEvent.h | 2 + clang/lib/Basic/Targets/PPC.cpp | 6 + clang/lib/CIR/CodeGen/CIRGenBuilder.h | 10 + clang/lib/CIR/CodeGen/CIRGenCXX.cpp | 2 +- 
clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 53 +- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 82 +- clang/lib/CIR/CodeGen/CIRGenModule.h | 12 +- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 139 ++- .../Dialect/Transforms/CIRCanonicalize.cpp | 5 +- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 49 +- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h | 20 + clang/lib/CodeGen/CGBuiltin.cpp | 2 +- clang/lib/CodeGen/CGDebugInfo.cpp | 4 +- clang/lib/CodeGen/CGVTables.cpp | 4 +- clang/lib/CodeGen/CodeGenFunction.h | 3 + clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 32 +- clang/lib/CodeGen/Targets/SPIR.cpp | 4 + clang/lib/Driver/Driver.cpp | 123 ++- clang/lib/Format/FormatTokenLexer.cpp | 30 +- clang/lib/Format/QualifierAlignmentFixer.cpp | 25 +- clang/lib/Format/TokenAnnotator.cpp | 5 +- clang/lib/Headers/intrin.h | 10 +- clang/lib/Lex/PPDirectives.cpp | 4 +- clang/lib/Lex/Preprocessor.cpp | 4 +- clang/lib/Sema/SemaPPC.cpp | 4 + clang/lib/Sema/SemaTypeTraits.cpp | 271 +++++- clang/lib/Serialization/ASTReader.cpp | 37 +- clang/lib/Serialization/ASTWriter.cpp | 40 +- clang/lib/Serialization/ASTWriterStmt.cpp | 2 +- .../Serialization/TemplateArgumentHasher.cpp | 2 + clang/lib/StaticAnalyzer/Core/CallEvent.cpp | 12 + .../Analysis/lambda-convert-to-func-ptr.cpp | 21 + clang/test/CIR/CodeGen/align-load.c | 6 +- clang/test/CIR/CodeGen/align-store.c | 2 +- clang/test/CIR/CodeGen/array.cpp | 26 +- clang/test/CIR/CodeGen/basic.c | 48 +- clang/test/CIR/CodeGen/basic.cpp | 18 +- clang/test/CIR/CodeGen/binassign.c | 2 +- clang/test/CIR/CodeGen/binop.cpp | 56 +- clang/test/CIR/CodeGen/builtin_call.cpp | 24 +- clang/test/CIR/CodeGen/builtin_printf.cpp | 8 +- clang/test/CIR/CodeGen/call.c | 36 +- clang/test/CIR/CodeGen/call.cpp | 32 +- clang/test/CIR/CodeGen/cast.cpp | 14 +- clang/test/CIR/CodeGen/class.cpp | 6 +- clang/test/CIR/CodeGen/cmp.cpp | 30 +- clang/test/CIR/CodeGen/comma.c | 2 +- clang/test/CIR/CodeGen/complex.cpp | 113 +++ clang/test/CIR/CodeGen/compound_assign.cpp | 2 +- 
clang/test/CIR/CodeGen/ctor.cpp | 38 +- clang/test/CIR/CodeGen/dso-local.c | 32 +- clang/test/CIR/CodeGen/forrange.cpp | 10 +- clang/test/CIR/CodeGen/if.cpp | 24 +- clang/test/CIR/CodeGen/inline-cxx-func.cpp | 4 +- clang/test/CIR/CodeGen/int-to-bool.cpp | 16 +- clang/test/CIR/CodeGen/linkage-spec.cpp | 28 +- clang/test/CIR/CodeGen/local-vars.cpp | 2 +- clang/test/CIR/CodeGen/loop.cpp | 40 +- clang/test/CIR/CodeGen/member-functions.cpp | 6 +- clang/test/CIR/CodeGen/namespace.cpp | 14 +- clang/test/CIR/CodeGen/nullptr-init.cpp | 2 +- clang/test/CIR/CodeGen/string-literals.c | 16 +- clang/test/CIR/CodeGen/struct.c | 20 +- clang/test/CIR/CodeGen/struct.cpp | 10 +- clang/test/CIR/CodeGen/switch.cpp | 94 +- clang/test/CIR/CodeGen/switch_flat_op.cpp | 4 +- clang/test/CIR/CodeGen/ternary.cpp | 12 +- clang/test/CIR/CodeGen/typedef.c | 4 +- clang/test/CIR/CodeGen/unary.cpp | 74 +- clang/test/CIR/CodeGen/union.c | 28 +- clang/test/CIR/CodeGen/vector-ext.cpp | 15 + clang/test/CIR/CodeGen/vector.cpp | 15 + clang/test/CIR/CodeGenOpenACC/combined-copy.c | 6 +- clang/test/CIR/CodeGenOpenACC/combined.cpp | 4 +- clang/test/CIR/CodeGenOpenACC/compute-copy.c | 4 +- clang/test/CIR/CodeGenOpenACC/data.c | 2 +- clang/test/CIR/CodeGenOpenACC/host_data.c | 2 +- clang/test/CIR/CodeGenOpenACC/init.c | 2 +- clang/test/CIR/CodeGenOpenACC/kernels.c | 4 +- clang/test/CIR/CodeGenOpenACC/loop.cpp | 2 +- clang/test/CIR/CodeGenOpenACC/parallel.c | 4 +- clang/test/CIR/CodeGenOpenACC/serial.c | 4 +- clang/test/CIR/CodeGenOpenACC/set.c | 2 +- clang/test/CIR/CodeGenOpenACC/shutdown.c | 2 +- clang/test/CIR/CodeGenOpenACC/wait.c | 2 +- clang/test/CIR/IR/array.cir | 6 +- clang/test/CIR/IR/binassign.cir | 2 +- clang/test/CIR/IR/call.cir | 14 +- clang/test/CIR/IR/cast.cir | 4 +- clang/test/CIR/IR/cmp.cir | 10 +- clang/test/CIR/IR/func.cir | 14 +- clang/test/CIR/IR/invalid-call.cir | 12 +- clang/test/CIR/IR/invalid-complex.cir | 24 + clang/test/CIR/IR/ternary.cir | 2 +- clang/test/CIR/IR/unary.cir | 4 +- 
clang/test/CIR/IR/vector.cir | 16 +- clang/test/CIR/Lowering/array.cpp | 18 +- clang/test/CIR/Transforms/canonicalize.cir | 12 +- .../CIR/Transforms/complex-create-fold.cir | 2 +- .../test/CIR/Transforms/complex-imag-fold.cir | 23 + .../test/CIR/Transforms/complex-real-fold.cir | 23 + clang/test/CIR/Transforms/hoist-allocas.cir | 6 +- clang/test/CIR/Transforms/if.cir | 4 +- clang/test/CIR/Transforms/loop.cir | 6 +- clang/test/CIR/Transforms/scope.cir | 6 +- clang/test/CIR/Transforms/select.cir | 10 +- clang/test/CIR/Transforms/switch.cir | 20 +- clang/test/CIR/Transforms/ternary-fold.cir | 8 +- clang/test/CIR/Transforms/ternary.cir | 4 +- clang/test/CIR/Transforms/vector-cmp-fold.cir | 24 +- .../CIR/Transforms/vector-create-fold.cir | 2 +- .../vector-shuffle-dynamic-fold.cir | 4 +- .../CIR/Transforms/vector-shuffle-fold.cir | 6 +- .../CIR/Transforms/vector-ternary-fold.cir | 2 +- clang/test/CIR/func-linkage.cpp | 51 + clang/test/CIR/func-simple.cpp | 20 +- clang/test/CIR/mlprint.c | 2 +- .../CodeGen/PowerPC/builtins-bcd-transform.c | 79 ++ .../CodeGen/X86/ms-secure-hotpatch-bad-file.c | 2 +- .../CodeGen/X86/ms-secure-hotpatch-cpp.cpp | 2 +- .../CodeGen/X86/ms-secure-hotpatch-eh.cpp | 2 +- .../CodeGen/X86/ms-secure-hotpatch-globals.c | 2 +- .../test/CodeGen/X86/ms-secure-hotpatch-lto.c | 2 +- clang/test/CodeGen/X86/ms-secure-hotpatch.c | 2 +- .../test/CodeGen/arm64-microsoft-intrinsics.c | 60 ++ clang/test/CodeGen/builtins-overflow.c | 12 + clang/test/CodeGen/logb_scalbn.c | 873 ++++++++++++++++++ clang/test/CodeGen/pragma-comment.c | 2 +- clang/test/CodeGenCUDA/bf16.cu | 6 +- .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 44 + clang/test/Index/inline-assembly.c | 46 + clang/test/Modules/pr118137.cppm | 24 + clang/test/Modules/template-declare.cppm | 39 + clang/test/OpenMP/declare_mapper_messages.c | 63 +- clang/test/OpenMP/declare_target_messages.cpp | 106 ++- .../declare_variant_clauses_messages.cpp | 2 +- clang/test/OpenMP/target_data_ast_print.cpp | 8 +- 
clang/test/OpenMP/target_map_messages.cpp | 59 +- clang/test/Sema/builtins-bcd-transform.c | 30 + .../SemaCXX/cxx2c-trivially-relocatable.cpp | 36 + .../type-traits-unsatisfied-diags-std.cpp | 158 ++++ .../SemaCXX/type-traits-unsatisfied-diags.cpp | 279 ++++++ clang/tools/c-index-test/c-index-test.c | 50 + clang/tools/libclang/CIndex.cpp | 94 ++ clang/tools/libclang/libclang.map | 9 + clang/unittests/Format/FormatTest.cpp | 15 + clang/unittests/Format/QualifierFixerTest.cpp | 7 +- .../SourceLocationEncodingTest.cpp | 58 -- compiler-rt/cmake/Modules/AddCompilerRT.cmake | 18 + compiler-rt/lib/asan/CMakeLists.txt | 13 + .../lib/asan/asan.link_with_main_exec.txt | 115 +++ .../lib/asan/asan_cxx.link_with_main_exec.txt | 21 + compiler-rt/lib/asan/asan_descriptions.cpp | 22 +- compiler-rt/test/tsan/java_heap_init2.cpp | 34 + compiler-rt/test/tsan/munmap_clear_shadow.c | 59 ++ flang/docs/ReleaseNotes.md | 5 + .../flang/Optimizer/Dialect/FIRTypes.td | 3 +- flang/include/flang/Support/OpenMP-features.h | 14 +- flang/lib/Frontend/CompilerInvocation.cpp | 42 +- flang/lib/Lower/Bridge.cpp | 2 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 2 +- flang/lib/Optimizer/Dialect/FIRType.cpp | 6 +- .../HLFIR/Transforms/ConvertToFIR.cpp | 5 +- .../test/Driver/bbc-openmp-version-macro.f90 | 11 +- .../Driver/flang-openmp-version-macro.f90 | 8 - flang/test/Driver/fopenmp-version.F90 | 25 + flang/test/Fir/convert-to-llvm.fir | 4 +- flang/test/HLFIR/declare-codegen.fir | 18 + .../OpenMP/target-data-skip-mapper-calls.f90 | 30 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/include/wchar.yaml | 7 + .../__support/wchar/character_converter.cpp | 18 +- .../src/__support/wchar/character_converter.h | 1 + libc/src/wchar/CMakeLists.txt | 13 + libc/src/wchar/wctomb.cpp | 35 + libc/src/wchar/wctomb.h | 22 + .../src/__support/wchar/utf32_to_8_test.cpp | 42 + libc/test/src/wchar/CMakeLists.txt | 11 + libc/test/src/wchar/wctomb_test.cpp | 73 ++ libclc/clc/include/clc/clcmacro.h | 105 --- 
libclc/clc/include/clc/math/clc_pown.h | 2 +- libclc/clc/include/clc/math/clc_rootn.h | 2 +- .../binary_decl_with_int_second_arg.inc | 0 .../binary_def_with_int_second_arg.inc | 0 libclc/clc/lib/generic/math/clc_copysign.cl | 28 +- libclc/clc/lib/generic/math/clc_pow.inc | 13 +- libclc/clc/lib/generic/math/clc_pown.inc | 13 +- libclc/clc/lib/generic/math/clc_powr.inc | 13 +- libclc/clc/lib/generic/math/clc_rootn.inc | 13 +- libclc/opencl/include/clc/opencl/math/ldexp.h | 5 + .../opencl/include/clc/opencl/math/ldexp.inc | 4 +- libclc/opencl/include/clc/opencl/math/pown.h | 2 +- libclc/opencl/include/clc/opencl/math/rootn.h | 2 +- libclc/opencl/lib/clspv/math/fma.cl | 8 +- libclc/opencl/lib/generic/common/degrees.cl | 18 +- libclc/opencl/lib/generic/common/radians.cl | 18 +- libclc/opencl/lib/generic/math/fma.cl | 18 +- libclc/opencl/lib/generic/math/ldexp.cl | 21 +- libclc/opencl/lib/generic/math/mad.cl | 18 +- libclc/opencl/lib/generic/math/nextafter.cl | 24 +- libclc/opencl/lib/generic/math/pown.cl | 2 +- libclc/opencl/lib/generic/math/rootn.cl | 2 +- libclc/opencl/lib/spirv/math/fma.cl | 8 +- libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/include/variant | 31 + .../variant/variant.relops/relops.pass.cpp | 49 + .../relops_bool_conv.verify.cpp | 11 +- libcxx/test/support/test_comparisons.h | 41 + libcxxabi/src/demangle/ItaniumDemangle.h | 4 +- libcxxabi/test/DemangleTestCases.inc | 1 + lldb/include/lldb/Core/Debugger.h | 6 - lldb/include/lldb/Core/ProtocolServer.h | 5 +- lldb/include/lldb/Target/MemoryTagManager.h | 12 +- lldb/include/lldb/Utility/XcodeSDK.h | 13 - lldb/include/lldb/lldb-forward.h | 2 +- lldb/include/lldb/lldb-private-interfaces.h | 3 +- .../Python/lldbsuite/test/gdbclientutils.py | 10 + .../Commands/CommandObjectProtocolServer.cpp | 51 +- lldb/source/Core/Debugger.cpp | 23 - lldb/source/Core/ProtocolServer.cpp | 34 +- .../Clang/ClangExpressionParser.cpp | 51 +- .../Utility/MemoryTagManagerAArch64MTE.cpp | 10 +- 
.../Utility/MemoryTagManagerAArch64MTE.h | 2 +- .../gdb-remote/GDBRemoteClientBase.cpp | 9 +- .../Process/gdb-remote/GDBRemoteClientBase.h | 6 +- .../gdb-remote/GDBRemoteCommunication.cpp | 172 ++-- .../gdb-remote/GDBRemoteCommunication.h | 10 +- .../GDBRemoteCommunicationClient.cpp | 6 +- .../GDBRemoteCommunicationServerPlatform.cpp | 30 +- .../Process/gdb-remote/ProcessGDBRemote.cpp | 6 +- lldb/source/Plugins/Protocol/MCP/Protocol.h | 2 + .../Protocol/MCP/ProtocolServerMCP.cpp | 30 +- .../Plugins/Protocol/MCP/ProtocolServerMCP.h | 6 +- lldb/source/Plugins/Protocol/MCP/Tool.cpp | 109 ++- lldb/source/Plugins/Protocol/MCP/Tool.h | 24 +- lldb/source/Utility/XcodeSDK.cpp | 21 - .../script_alias/TestCommandScriptAlias.py | 1 + .../gdb_remote_client/TestGDBRemoteClient.py | 72 ++ .../TestAArch64LinuxMTEMemoryTagCoreFile.py | 23 + .../aarch64/mte_core_file/core.mte.notags | Bin 0 -> 32768 bytes .../API/linux/aarch64/mte_core_file/main.c | 7 +- .../restart/TestDAP_restart_runInTerminal.py | 4 +- .../runInTerminal/TestDAP_runInTerminal.py | 5 +- lldb/unittests/Host/CMakeLists.txt | 4 +- .../MemoryTagManagerAArch64MTETest.cpp | 51 +- .../Protocol/ProtocolMCPServerTest.cpp | 21 +- llvm/docs/CommandGuide/llvm-objdump.rst | 2 +- llvm/docs/HowToReleaseLLVM.rst | 14 +- llvm/docs/ReleaseNotes.md | 1 + llvm/docs/SourceLevelDebugging.rst | 57 +- llvm/include/llvm/ADT/ArrayRef.h | 51 +- llvm/include/llvm/Analysis/DXILResource.h | 4 +- llvm/include/llvm/Analysis/IR2Vec.h | 52 +- llvm/include/llvm/Analysis/ValueTracking.h | 4 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 9 +- .../llvm/CodeGen/GlobalISel/CallLowering.h | 9 + .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 5 + llvm/include/llvm/CodeGenTypes/LowLevelType.h | 5 +- .../llvm/CodeGenTypes/MachineValueType.h | 9 +- llvm/include/llvm/Demangle/ItaniumDemangle.h | 4 +- llvm/include/llvm/ExecutionEngine/Orc/COFF.h | 5 +- .../Orc/ExecutorProcessControl.h | 42 - .../llvm/Frontend/Directive/Spelling.h | 4 +- 
.../llvm/Frontend/Driver/CodeGenOptions.h | 2 +- .../Frontend/HLSL/HLSLRootSignatureUtils.h | 9 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 43 +- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 8 +- llvm/include/llvm/IR/DIBuilder.h | 56 ++ llvm/include/llvm/IR/DebugInfoMetadata.h | 497 +++++++--- llvm/include/llvm/IR/Intrinsics.td | 4 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 + llvm/include/llvm/IR/IntrinsicsPowerPC.td | 8 + llvm/include/llvm/IR/RuntimeLibcalls.h | 2 + llvm/include/llvm/MC/MCSchedule.h | 12 +- llvm/include/llvm/TargetParser/TargetParser.h | 2 +- llvm/include/llvm/TargetParser/Triple.h | 9 +- .../Testing/Demangle/DemangleTestCases.inc | 1 + llvm/lib/Analysis/BasicAliasAnalysis.cpp | 4 + llvm/lib/Analysis/ConstantFolding.cpp | 6 + llvm/lib/AsmParser/LLParser.cpp | 109 ++- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 61 +- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 35 +- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 24 +- .../AsmPrinter/DbgEntityHistoryCalculator.cpp | 18 +- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 199 ++-- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 60 ++ llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 1 + llvm/lib/CodeGen/IfConversion.cpp | 35 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 199 +--- llvm/lib/CodeGen/MachineDomTreeUpdater.cpp | 9 +- llvm/lib/CodeGen/MachineDominators.cpp | 30 +- llvm/lib/CodeGen/MachineLoopInfo.cpp | 7 +- llvm/lib/CodeGen/MachinePassManager.cpp | 15 +- llvm/lib/CodeGen/MachinePostDominators.cpp | 29 +- llvm/lib/CodeGen/RegAllocScore.cpp | 17 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 13 + .../Orc/ExecutorProcessControl.cpp | 1 + llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 4 + llvm/lib/IR/DIBuilder.cpp | 86 +- llvm/lib/IR/DebugInfoMetadata.cpp | 101 +- llvm/lib/IR/Instructions.cpp | 4 + llvm/lib/IR/LLVMContextImpl.h | 76 +- llvm/lib/IR/RuntimeLibcalls.cpp | 63 +- llvm/lib/IR/Verifier.cpp | 25 +- llvm/lib/MC/MCSchedule.cpp | 1 + llvm/lib/MC/MCStreamer.cpp | 8 +- llvm/lib/MCA/InstrBuilder.cpp | 
3 +- llvm/lib/Option/Arg.cpp | 17 +- .../Target/AArch64/AArch64ISelLowering.cpp | 35 +- .../Target/AArch64/AArch64PerfectShuffle.h | 27 + .../AArch64/AArch64TargetTransformInfo.cpp | 17 + .../MCTargetDesc/AArch64InstPrinter.cpp | 10 + llvm/lib/Target/AMDGPU/AMDGPU.td | 2 +- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 9 +- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 6 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/DSDIRInstructions.td | 4 +- .../Disassembler/AMDGPUDisassembler.cpp | 14 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 +- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 37 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 45 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 + llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +- .../Target/DirectX/DXILDataScalarization.cpp | 5 +- llvm/lib/Target/DirectX/DXILPrepare.cpp | 11 + .../LoongArch/LoongArchFloat64InstrInfo.td | 16 + .../LoongArch/LoongArchISelLowering.cpp | 309 ++++++- .../Target/LoongArch/LoongArchISelLowering.h | 4 + .../Target/LoongArch/LoongArchInstrInfo.td | 12 + .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 22 + .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 2 + llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 72 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 - llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 121 +-- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 32 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 20 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.h | 3 - llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 245 ++--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 + llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 15 + llvm/lib/Target/PowerPC/PPCInstrAltivec.td | 12 +- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 2 +- llvm/lib/Target/RISCV/RISCVFeatures.td | 7 - llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 45 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 10 +- 
llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 12 +- .../WebAssembly/WebAssemblyISelLowering.cpp | 91 +- .../X86/GISel/X86InstructionSelector.cpp | 33 + .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 12 +- llvm/lib/Target/X86/X86.td | 2 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 3 +- llvm/lib/TargetParser/Triple.cpp | 14 +- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 8 +- .../InstCombine/InstructionCombining.cpp | 16 +- .../Instrumentation/MemorySanitizer.cpp | 43 +- .../lib/Transforms/Scalar/LoopInterchange.cpp | 4 +- llvm/lib/Transforms/Scalar/Reassociate.cpp | 10 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 13 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 8 +- .../Transforms/Vectorize/VectorCombine.cpp | 6 +- llvm/runtimes/CMakeLists.txt | 7 + .../segmented-shufflevector-patterns.ll | 53 ++ .../AArch64/arm64-indexed-vector-ldst.ll | 117 +++ .../CodeGen/AArch64/exp10-libcall-names.ll | 16 +- .../CodeGen/AArch64/streaming-func-no-sme.ll | 2 +- .../CodeGen/AArch64/sve2p1-vector-shuffles.ll | 63 +- .../CodeGen/AMDGPU/convergence-laneops.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll | 107 +++ .../AMDGPU/shufflevector.v3bf16.v2bf16.ll | 16 +- .../AMDGPU/shufflevector.v3bf16.v3bf16.ll | 15 +- .../AMDGPU/shufflevector.v3f16.v2f16.ll | 16 +- .../AMDGPU/shufflevector.v3f16.v3f16.ll | 15 +- .../AMDGPU/unsupported-image-sample.ll | 19 +- .../ARM/ifcvt_unanalyzable_fallthrough.mir | 114 +++ llvm/test/CodeGen/ARM/special-reg.ll | 12 +- .../DirectX/issue-145408-gep-struct-fix.ll | 17 + llvm/test/CodeGen/DirectX/llc-pipeline.ll | 2 +- .../CodeGen/DirectX/strip-rootsignatures.ll | 18 + .../CodeGen/LoongArch/calling-conv-ilp32d.ll | 193 ++++ .../LoongArch/inline-asm-constraint-f.ll | 14 +- .../ir-instruction/double-convert.ll | 14 +- .../ir-instruction/load-store-atomic.ll | 40 +- llvm/test/CodeGen/Mips/msa/compare_float.ll | 624 ++++++------- llvm/test/CodeGen/NVPTX/alias.ll | 3 +- .../test/CodeGen/NVPTX/bf16x2-instructions.ll | 7 +- 
llvm/test/CodeGen/NVPTX/byval-const-global.ll | 6 +- .../CodeGen/NVPTX/call-with-alloca-buffer.ll | 3 +- llvm/test/CodeGen/NVPTX/combine-mad.ll | 7 +- .../test/CodeGen/NVPTX/convergent-mir-call.ll | 10 +- .../CodeGen/NVPTX/convert-call-to-indirect.ll | 43 +- llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll | 12 +- llvm/test/CodeGen/NVPTX/f16-instructions.ll | 24 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 21 +- llvm/test/CodeGen/NVPTX/fma.ll | 14 +- llvm/test/CodeGen/NVPTX/forward-ld-param.ll | 12 +- llvm/test/CodeGen/NVPTX/fp128-storage-type.ll | 6 +- llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 21 +- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 21 +- llvm/test/CodeGen/NVPTX/indirect_byval.ll | 16 +- llvm/test/CodeGen/NVPTX/ldparam-v4.ll | 5 +- llvm/test/CodeGen/NVPTX/local-stack-frame.ll | 36 +- .../CodeGen/NVPTX/lower-args-gridconstant.ll | 36 +- llvm/test/CodeGen/NVPTX/lower-args.ll | 14 +- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 24 +- llvm/test/CodeGen/NVPTX/misched_func_call.ll | 6 +- .../NVPTX/naked-fn-with-frame-pointer.ll | 20 +- llvm/test/CodeGen/NVPTX/param-add.ll | 6 +- llvm/test/CodeGen/NVPTX/param-load-store.ll | 168 ++-- llvm/test/CodeGen/NVPTX/param-overalign.ll | 89 +- .../CodeGen/NVPTX/param-vectorize-device.ll | 72 +- llvm/test/CodeGen/NVPTX/shift-opt.ll | 12 +- llvm/test/CodeGen/NVPTX/st-param-imm.ll | 504 ++-------- llvm/test/CodeGen/NVPTX/store-undef.ll | 12 +- llvm/test/CodeGen/NVPTX/tex-read-cuda.ll | 6 +- .../NVPTX/unaligned-param-load-store.ll | 42 +- llvm/test/CodeGen/NVPTX/unreachable.ll | 20 +- llvm/test/CodeGen/NVPTX/variadics-backend.ll | 28 +- .../CodeGen/PowerPC/PR35812-neg-cmpxchg.ll | 2 - llvm/test/CodeGen/PowerPC/all-atomics.ll | 32 +- .../CodeGen/PowerPC/atomics-regression.ll | 68 -- .../CodeGen/PowerPC/builtins-bcd-transform.ll | 91 ++ llvm/test/CodeGen/PowerPC/loop-comment.ll | 1 - llvm/test/CodeGen/RISCV/features-info.ll | 1 - .../rvv/fixed-vectors-deinterleave-load.ll | 67 -- 
.../rvv/fixed-vectors-interleave-store.ll | 34 - .../rvv/fixed-vectors-interleaved-access.ll | 14 +- ...t.ll => fixed-vectors-vp-reverse-float.ll} | 33 +- ...int.ll => fixed-vectors-vp-reverse-int.ll} | 0 .../RISCV/rvv/fixed-vectors-vp-splice.ll | 106 ++- .../RISCV/rvv/vector-deinterleave-load.ll | 67 -- .../RISCV/rvv/vector-interleave-store.ll | 34 - llvm/test/CodeGen/RISCV/rvv/vp-splice.ll | 78 +- .../RISCV/rvv/vp-vector-interleaved-access.ll | 308 ++---- .../CodeGen/SystemZ/vec-max-min-zerosplat.ll | 70 +- .../CodeGen/X86/GlobalISel/llvm.sincos.mir | 189 ++++ llvm/test/CodeGen/X86/isel-fabs-x87.ll | 46 +- llvm/test/CodeGen/X86/isel-fabs.ll | 66 +- llvm/test/CodeGen/X86/llvm.sincos.ll | 92 +- llvm/test/CodeGen/X86/shift-i512.ll | 194 +--- llvm/test/CodeGen/X86/var-permute-256.ll | 216 +++-- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 24 +- .../CodeGen/X86/x86-64-double-shifts-var.ll | 64 +- llvm/test/DebugInfo/dynamic-bitfield.ll | 62 ++ .../BoundsChecking/runtimes.ll | 2 +- .../MemorySanitizer/count-zeroes.ll | 88 +- llvm/test/MC/AMDGPU/gfx1250_asm_unsupported.s | 94 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 65 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 68 ++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 67 ++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 79 ++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 23 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 35 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s | 36 + .../gfx1250_asm_vop3_from_vop1-fake16.s | 101 ++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 104 +++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 63 ++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 67 ++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 23 + .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 27 + .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 71 ++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 64 ++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 31 + .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 79 ++ 
.../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 67 ++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 27 + .../MC/Disassembler/RISCV/c_lui_disasm.txt | 68 -- llvm/test/MC/RISCV/rv32c-invalid.s | 11 +- llvm/test/MC/X86/gotpcrel-non-globals.ll | 36 + .../TableGen/CompressWriteLatencyEntry.td | 8 +- llvm/test/TableGen/InvalidMCSchedClassDesc.td | 22 +- .../Transforms/FunctionAttrs/initializes.ll | 6 +- llvm/test/Transforms/FunctionAttrs/nosync.ll | 2 +- .../assumption-cache-invalidation.ll | 2 +- .../InstSimplify/ConstProp/atan-intrinsic.ll | 1 - .../InstSimplify/ConstProp/calls.ll | 12 + .../AArch64/sve-deinterleave4.ll | 90 +- .../AArch64/sve-interleave4.ll | 17 +- .../RISCV/interleaved-accesses.ll | 196 ---- .../RISCV/riscv-vector-reverse.ll | 80 +- ...able-info-from-assumption-variable-size.ll | 407 -------- .../early_exit_store_legality.ll | 29 - .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 26 +- .../Transforms/SLPVectorizer/X86/pr47629.ll | 26 +- ...masked-loads-consecutive-loads-same-ptr.ll | 10 +- .../SimplifyCFG/preserve-branchweights.ll | 44 +- llvm/test/Verifier/assume-bundles.ll | 3 +- llvm/test/Verifier/branch-weight.ll | 39 + .../Inputs/nvptx-basic.ll.expected | 6 +- .../symbolize-operands-executable.yaml | 67 ++ .../AArch64/symbolize-operands-relocatable.s | 79 ++ llvm/tools/llvm-exegesis/lib/Analysis.cpp | 9 +- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 +- llvm/unittests/ADT/ArrayRefTest.cpp | 19 +- .../Analysis/MemoryProfileInfoTest.cpp | 3 +- .../Orc/ObjectLinkingLayerTest.cpp | 2 + .../ExecutionEngine/Orc/OrcTestCommon.h | 41 + llvm/unittests/Frontend/CMakeLists.txt | 1 + .../Frontend/OpenMPDirectiveNameTest.cpp | 96 ++ llvm/unittests/IR/DebugInfoTest.cpp | 29 + .../unittests/IR/DebugTypeODRUniquingTest.cpp | 6 +- llvm/unittests/TargetParser/TripleTest.cpp | 38 + llvm/utils/TableGen/SubtargetEmitter.cpp | 22 +- .../include/mlir/Dialect/ArmNeon/Transforms.h | 2 +- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 14 +- 
mlir/include/mlir/Dialect/Linalg/IR/Linalg.h | 14 + .../mlir/Dialect/Linalg/IR/LinalgEnums.td | 15 + .../mlir/Dialect/Linalg/IR/LinalgOps.td | 18 +- .../Linalg/TransformOps/LinalgTransformOps.td | 4 +- .../Dialect/Linalg/Transforms/Transforms.h | 9 +- .../SCF/Transforms/TileUsingInterface.h | 22 +- .../Dialect/Tensor/Transforms/Transforms.h | 14 +- .../DebugExtension/DebugExtensionOps.td | 4 +- .../mlir/Dialect/Vector/IR/VectorOps.td | 7 +- mlir/include/mlir/IR/EnumAttr.td | 7 +- mlir/include/mlir/IR/OpDefinition.h | 2 +- mlir/include/mlir/IR/Visitors.h | 36 +- .../mlir/Interfaces/TilingInterface.td | 55 +- .../include/mlir/{IR => Support}/StateStack.h | 8 +- mlir/include/mlir/Support/WalkResult.h | 59 ++ .../mlir/Target/LLVMIR/ModuleTranslation.h | 2 +- .../PDLToPDLInterp/PDLToPDLInterp.cpp | 2 +- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 16 +- .../VectorToLLVM/ConvertVectorToLLVMPass.cpp | 2 +- .../ArmNeonVectorTransformOps.cpp | 2 +- .../Dialect/ArmNeon/Transforms/CMakeLists.txt | 2 +- ... 
=> LowerContractionToNeonI8MMPattern.cpp} | 178 +++- .../ArmSME/Transforms/VectorLegalization.cpp | 171 ++-- .../Transforms/LegalizeVectorStorage.cpp | 152 ++- .../LowerContractionToSVEI8MMPattern.cpp | 7 +- .../GPU/Transforms/DecomposeMemRefs.cpp | 3 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 55 +- .../TransformOps/LinalgTransformOps.cpp | 2 +- .../Linalg/Transforms/TilingInterfaceImpl.cpp | 191 ++-- .../Linalg/Transforms/WinogradConv2D.cpp | 152 ++- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 22 +- .../SCF/Transforms/TileUsingInterface.cpp | 216 +++-- .../SwapExtractSliceWithProducerPatterns.cpp | 57 +- .../DebugExtension/DebugExtensionOps.cpp | 13 +- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 13 +- mlir/lib/IR/CMakeLists.txt | 1 - mlir/lib/Support/CMakeLists.txt | 1 + mlir/lib/{IR => Support}/StateStack.cpp | 2 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 14 +- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 6 +- mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp | 5 +- mlir/python/CMakeLists.txt | 9 + .../dialects/TransformDebugExtensionOps.td | 19 + mlir/python/mlir/dialects/transform/debug.py | 81 ++ .../Dialect/ArmNeon/lower-to-arm-neon.mlir | 80 +- .../Dialect/ArmSME/vector-legalization.mlir | 101 +- .../ArmSVE/legalize-transfer-read.mlir | 257 ++++++ mlir/test/Dialect/Linalg/hoisting.mlir | 266 ++++-- mlir/test/Dialect/Linalg/invalid.mlir | 30 +- mlir/test/Dialect/Linalg/roundtrip.mlir | 24 +- .../transform-op-fuse-into-containing.mlir | 1 + .../transform-tile-and-winograd-rewrite.mlir | 24 +- .../Linalg/transform-tile-winograd.mlir | 36 +- .../Linalg/transform-winograd-conv2d.mlir | 24 +- .../Linalg/winograd-conv2d-rewrite.mlir | 6 +- mlir/test/Dialect/Linalg/winograd-conv2d.mlir | 42 +- mlir/test/Dialect/MemRef/invalid.mlir | 10 + mlir/test/Dialect/MemRef/ops.mlir | 3 + .../Vector/canonicalize/vector-transpose.mlir | 64 +- mlir/test/Dialect/Vector/invalid.mlir | 9 + mlir/test/Dialect/Vector/ops.mlir | 17 +- mlir/test/IR/attribute.mlir | 4 + 
.../transfer-read-scalable-non-trailing.mlir | 79 ++ .../tile-and-fuse-consumer.mlir | 293 +++++- .../Target/LLVMIR/Import/import-failure.ll | 16 - mlir/test/Target/LLVMIR/nvvm/elect.mlir | 20 + mlir/test/Target/LLVMIR/nvvmir.mlir | 9 - mlir/test/Target/LLVMIR/omptarget-llvm.mlir | 348 +++---- .../LLVMIR/omptargetdata-nowait-llvm.mlir | 42 +- .../LLVMIR/openmp-data-target-device.mlir | 2 +- .../openmp-nested-task-target-parallel.mlir | 62 ++ .../Dialect/Linalg/TestLinalgTransforms.cpp | 4 +- mlir/test/lib/Dialect/Test/TestEnumDefs.td | 8 +- mlir/test/lib/Dialect/Test/TestPatterns.cpp | 4 +- .../TestTilingInterfaceTransformOps.cpp | 50 +- .../TestTilingInterfaceTransformOps.td | 7 +- .../python/dialects/transform_debug_ext.py | 45 + mlir/tools/mlir-tblgen/EnumsGen.cpp | 6 +- offload/liboffload/API/Program.td | 4 +- offload/liboffload/src/OffloadImpl.cpp | 8 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 20 +- .../common/include/PluginInterface.h | 12 +- .../common/src/PluginInterface.cpp | 72 +- offload/plugins-nextgen/cuda/src/rtl.cpp | 27 +- offload/plugins-nextgen/host/src/rtl.cpp | 6 + openmp/runtime/src/kmp_alloc.cpp | 8 +- openmp/runtime/src/kmp_csupport.cpp | 2 +- openmp/runtime/src/kmp_lock.cpp | 2 + openmp/runtime/src/kmp_tasking.cpp | 2 +- .../test/ompt/misc/lock_double_destroy.cpp | 40 + .../llvm-project-overlay/clang/BUILD.bazel | 1 + .../llvm-project-overlay/llvm/config.bzl | 1 + .../llvm/include/llvm/Config/llvm-config.h | 3 + .../llvm-project-overlay/mlir/BUILD.bazel | 1 + utils/bazel/llvm_configs/llvm-config.h.cmake | 3 + 630 files changed, 15064 insertions(+), 7696 deletions(-) create mode 100644 clang/test/Analysis/lambda-convert-to-func-ptr.cpp create mode 100644 clang/test/CIR/Transforms/complex-imag-fold.cir create mode 100644 clang/test/CIR/Transforms/complex-real-fold.cir create mode 100644 clang/test/CIR/func-linkage.cpp create mode 100644 clang/test/CodeGen/PowerPC/builtins-bcd-transform.c create mode 100644 
clang/test/Index/inline-assembly.c create mode 100644 clang/test/Modules/pr118137.cppm create mode 100644 clang/test/Modules/template-declare.cppm create mode 100644 clang/test/Sema/builtins-bcd-transform.c create mode 100644 compiler-rt/lib/asan/asan.link_with_main_exec.txt create mode 100644 compiler-rt/lib/asan/asan_cxx.link_with_main_exec.txt create mode 100644 compiler-rt/test/tsan/java_heap_init2.cpp create mode 100644 compiler-rt/test/tsan/munmap_clear_shadow.c create mode 100644 flang/test/Driver/fopenmp-version.F90 create mode 100644 flang/test/Lower/OpenMP/target-data-skip-mapper-calls.f90 create mode 100644 libc/src/wchar/wctomb.cpp create mode 100644 libc/src/wchar/wctomb.h create mode 100644 libc/test/src/wchar/wctomb_test.cpp rename libclc/clc/include/clc/{math => shared}/binary_decl_with_int_second_arg.inc (100%) rename libclc/clc/include/clc/{math => shared}/binary_def_with_int_second_arg.inc (100%) create mode 100644 lldb/test/API/linux/aarch64/mte_core_file/core.mte.notags create mode 100644 llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll create mode 100644 llvm/test/CodeGen/ARM/ifcvt_unanalyzable_fallthrough.mir create mode 100644 llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll create mode 100644 llvm/test/CodeGen/DirectX/strip-rootsignatures.ll create mode 100644 llvm/test/CodeGen/LoongArch/calling-conv-ilp32d.ll create mode 100644 llvm/test/CodeGen/PowerPC/builtins-bcd-transform.ll rename llvm/test/CodeGen/RISCV/rvv/{fixed-vectors-vp-reverser-float.ll => fixed-vectors-vp-reverse-float.ll} (71%) rename llvm/test/CodeGen/RISCV/rvv/{fixed-vectors-vp-reverser-int.ll => fixed-vectors-vp-reverse-int.ll} (100%) create mode 100644 llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir create mode 100644 llvm/test/DebugInfo/dynamic-bitfield.ll create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s create mode 100644 
llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt create mode 100644 llvm/test/MC/X86/gotpcrel-non-globals.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll create mode 100644 llvm/test/Verifier/branch-weight.ll create mode 100644 llvm/test/tools/llvm-objdump/ELF/AArch64/symbolize-operands-executable.yaml create mode 100644 llvm/test/tools/llvm-objdump/ELF/AArch64/symbolize-operands-relocatable.s create mode 100644 llvm/unittests/Frontend/OpenMPDirectiveNameTest.cpp rename mlir/include/mlir/{IR => Support}/StateStack.h (96%) create mode 100644 mlir/include/mlir/Support/WalkResult.h rename mlir/lib/Dialect/ArmNeon/Transforms/{LowerContractionToSMMLAPattern.cpp => 
LowerContractionToNeonI8MMPattern.cpp} (59%) rename mlir/lib/{IR => Support}/StateStack.cpp (92%) create mode 100644 mlir/python/mlir/dialects/TransformDebugExtensionOps.td create mode 100644 mlir/python/mlir/dialects/transform/debug.py create mode 100644 mlir/test/Dialect/ArmSVE/legalize-transfer-read.mlir create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/transfer-read-scalable-non-trailing.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/elect.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-nested-task-target-parallel.mlir create mode 100644 mlir/test/python/dialects/transform_debug_ext.py create mode 100644 openmp/runtime/test/ompt/misc/lock_double_destroy.cpp diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h index ad2fed2cf27eb..154a8d12de5ce 100644 --- a/bolt/include/bolt/Core/BinarySection.h +++ b/bolt/include/bolt/Core/BinarySection.h @@ -523,11 +523,6 @@ inline uint8_t *copyByteArray(const uint8_t *Data, uint64_t Size) { return Array; } -inline uint8_t *copyByteArray(StringRef Buffer) { - return copyByteArray(reinterpret_cast(Buffer.data()), - Buffer.size()); -} - inline uint8_t *copyByteArray(ArrayRef Buffer) { return copyByteArray(reinterpret_cast(Buffer.data()), Buffer.size()); diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 95e831fe9c8ca..2eadaf15d3a65 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -82,6 +82,22 @@ namespace PAuthGadgetScanner { dbgs() << "\n"; } +// Iterates over BinaryFunction's instructions like a range-based for loop: +// +// iterateOverInstrs(BF, [&](MCInstReference Inst) { +// // loop body +// }); +template static void iterateOverInstrs(BinaryFunction &BF, T Fn) { + if (BF.hasCFG()) { + for (BinaryBasicBlock &BB : BF) + for (int64_t I = 0, E = BB.size(); I < E; ++I) + Fn(MCInstInBBReference(&BB, I)); + } else { + for (auto I : BF.instrs()) + 
Fn(MCInstInBFReference(&BF, I.first)); + } +} + // This class represents mapping from a set of arbitrary physical registers to // consecutive array indexes. class TrackedRegisters { @@ -342,6 +358,29 @@ class SrcSafetyAnalysis { return S; } + /// Computes a reasonably pessimistic estimation of the register state when + /// the previous instruction is not known for sure. Takes the set of registers + /// which are trusted at function entry and removes all registers that can be + /// clobbered inside this function. + SrcState computePessimisticState(BinaryFunction &BF) { + BitVector ClobberedRegs(NumRegs); + iterateOverInstrs(BF, [&](MCInstReference Inst) { + BC.MIB->getClobberedRegs(Inst, ClobberedRegs); + + // If this is a call instruction, no register is safe anymore, unless + // it is a tail call. Ignore tail calls for the purpose of estimating the + // worst-case scenario, assuming no instructions are executed in the + // caller after this point anyway. + if (BC.MIB->isCall(Inst) && !BC.MIB->isTailCall(Inst)) + ClobberedRegs.set(); + }); + + SrcState S = createEntryState(); + S.SafeToDerefRegs.reset(ClobberedRegs); + S.TrustedRegs.reset(ClobberedRegs); + return S; + } + BitVector getClobberedRegs(const MCInst &Point) const { BitVector Clobbered(NumRegs); // Assume a call can clobber all registers, including callee-saved @@ -545,6 +584,10 @@ class DataflowSrcSafetyAnalysis using SrcSafetyAnalysis::BC; using SrcSafetyAnalysis::computeNext; + // Pessimistic initial state for basic blocks without any predecessors + // (not needed for most functions, thus initialized lazily). + SrcState PessimisticState; + public: DataflowSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -585,6 +628,18 @@ class DataflowSrcSafetyAnalysis if (BB.isEntryPoint()) return createEntryState(); + // If a basic block without any predecessors is found in an optimized code, + // this likely means that some CFG edges were not detected. 
Pessimistically + // assume any register that can ever be clobbered in this function to be + // unsafe before this basic block. + // Warn about this fact in FunctionAnalysis::findUnsafeUses(), as it likely + // means imprecise CFG information. + if (BB.pred_empty()) { + if (PessimisticState.empty()) + PessimisticState = computePessimisticState(*BB.getParent()); + return PessimisticState; + } + return SrcState(); } @@ -682,19 +737,14 @@ template class CFGUnawareAnalysis { // // Then, a function can be split into a number of disjoint contiguous sequences // of instructions without labels in between. These sequences can be processed -// the same way basic blocks are processed by data-flow analysis, assuming -// pessimistically that all registers are unsafe at the start of each sequence. +// the same way basic blocks are processed by data-flow analysis, with the same +// pessimistic estimation of the initial state at the start of each sequence +// (except the first instruction of the function). class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, public CFGUnawareAnalysis { using SrcSafetyAnalysis::BC; BinaryFunction &BF; - /// Creates a state with all registers marked unsafe (not to be confused - /// with empty state). - SrcState createUnsafeState() const { - return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); - } - public: CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -704,6 +754,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, } void run() override { + const SrcState DefaultState = computePessimisticState(BF); SrcState S = createEntryState(); for (auto &I : BF.instrs()) { MCInst &Inst = I.second; @@ -718,7 +769,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, LLVM_DEBUG({ traceInst(BC, "Due to label, resetting the state before", Inst); }); - S = createUnsafeState(); + S = DefaultState; } // Attach the state *before* this instruction executes. 
@@ -1344,17 +1395,6 @@ shouldReportAuthOracle(const BinaryContext &BC, const MCInstReference &Inst, return make_gadget_report(AuthOracleKind, Inst, *AuthReg); } -template static void iterateOverInstrs(BinaryFunction &BF, T Fn) { - if (BF.hasCFG()) { - for (BinaryBasicBlock &BB : BF) - for (int64_t I = 0, E = BB.size(); I < E; ++I) - Fn(MCInstInBBReference(&BB, I)); - } else { - for (auto I : BF.instrs()) - Fn(MCInstInBFReference(&BF, I.first)); - } -} - static SmallVector collectRegsToTrack(ArrayRef> Reports) { SmallSet RegsToTrack; @@ -1375,17 +1415,60 @@ void FunctionAnalysisContext::findUnsafeUses( BF.dump(); }); + bool UnreachableBBReported = false; + if (BF.hasCFG()) { + // Warn on basic blocks being unreachable according to BOLT (at most once + // per BinaryFunction), as this likely means the CFG reconstructed by BOLT + // is imprecise. A basic block can be + // * reachable from an entry basic block - a hopefully correct non-empty + // state is propagated to that basic block sooner or later. All basic + // blocks are expected to belong to this category under normal conditions. + // * reachable from a "directly unreachable" BB (a basic block that has no + // direct predecessors and this is not because it is an entry BB) - *some* + // non-empty state is propagated to this basic block sooner or later, as + // the initial state of directly unreachable basic blocks is + // pessimistically initialized to "all registers are unsafe" + // - a warning can be printed for the "directly unreachable" basic block + // * neither reachable from an entry nor from a "directly unreachable" BB + // (such as if this BB is in an isolated loop of basic blocks) - the final + // state is computed to be empty for this basic block + // - a warning can be printed for this basic block + for (BinaryBasicBlock &BB : BF) { + MCInst *FirstInst = BB.getFirstNonPseudoInstr(); + // Skip empty basic block early for simplicity. 
+ if (!FirstInst) + continue; + + bool IsDirectlyUnreachable = BB.pred_empty() && !BB.isEntryPoint(); + bool HasNoStateComputed = Analysis->getStateBefore(*FirstInst).empty(); + if (!IsDirectlyUnreachable && !HasNoStateComputed) + continue; + + // Arbitrarily attach the report to the first instruction of BB. + // This is printed as "[message] in function [name], basic block ..., + // at address ..." when the issue is reported to the user. + Reports.push_back(make_generic_report( + MCInstReference::get(FirstInst, BF), + "Warning: possibly imprecise CFG, the analysis quality may be " + "degraded in this function. According to BOLT, unreachable code is " + "found" /* in function [name]... */)); + UnreachableBBReported = true; + break; // One warning per function. + } + } + // FIXME: Warn the user about imprecise analysis when the function has no CFG + // information at all. + iterateOverInstrs(BF, [&](MCInstReference Inst) { if (BC.MIB->isCFI(Inst)) return; const SrcState &S = Analysis->getStateBefore(Inst); - - // If non-empty state was never propagated from the entry basic block - // to Inst, assume it to be unreachable and report a warning. 
if (S.empty()) { - Reports.push_back( - make_generic_report(Inst, "Warning: unreachable instruction found")); + LLVM_DEBUG( + { traceInst(BC, "Instruction has no state, skipping", Inst); }); + assert(UnreachableBBReported && "Should be reported at least once"); + (void)UnreachableBBReported; return; } diff --git a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s index 284f0bea607a5..2dadcef095863 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s +++ b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s @@ -215,7 +215,7 @@ f_callclobbered_calleesaved: .globl f_unreachable_instruction .type f_unreachable_instruction,@function f_unreachable_instruction: -// CHECK-LABEL: GS-PAUTH: Warning: unreachable instruction found in function f_unreachable_instruction, basic block {{[0-9a-zA-Z.]+}}, at address +// CHECK-LABEL: GS-PAUTH: Warning: possibly imprecise CFG, the analysis quality may be degraded in this function. According to BOLT, unreachable code is found in function f_unreachable_instruction, basic block {{[0-9a-zA-Z.]+}}, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: add x0, x1, x2 // CHECK-NOT: instructions that write to the affected registers after any authentication are: b 1f @@ -224,20 +224,33 @@ f_unreachable_instruction: ret .size f_unreachable_instruction, .-f_unreachable_instruction -// Expected false positive: without CFG, the state is reset to all-unsafe -// after an unconditional branch. +// Without CFG, the state is reset at labels, assuming every register that can +// be clobbered in the function was actually clobbered. 
- .globl state_is_reset_after_indirect_branch_nocfg - .type state_is_reset_after_indirect_branch_nocfg,@function -state_is_reset_after_indirect_branch_nocfg: -// CHECK-LABEL: GS-PAUTH: non-protected ret found in function state_is_reset_after_indirect_branch_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: ret + .globl lr_untouched_nocfg + .type lr_untouched_nocfg,@function +lr_untouched_nocfg: +// CHECK-NOT: lr_untouched_nocfg + adr x2, 1f + br x2 +1: + ret + .size lr_untouched_nocfg, .-lr_untouched_nocfg + + .globl lr_clobbered_nocfg + .type lr_clobbered_nocfg,@function +lr_clobbered_nocfg: +// CHECK-LABEL: GS-PAUTH: non-protected ret found in function lr_clobbered_nocfg, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: ret // CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: adr x2, 1f br x2 1: + b 2f + bl g // never executed, but affects the expected worst-case scenario +2: ret - .size state_is_reset_after_indirect_branch_nocfg, .-state_is_reset_after_indirect_branch_nocfg + .size lr_clobbered_nocfg, .-lr_clobbered_nocfg /// Now do a basic sanity check on every different Authentication instruction: diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s index 717bf40df3d02..c314bc7cfe5a3 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s @@ -491,10 +491,6 @@ good_address_arith_multi_bb: ret .size good_address_arith_multi_bb, .-good_address_arith_multi_bb -// FIXME: Most *_nocfg test cases contain paciasp+autiasp instructions even if -// LR is not spilled - this is a workaround for RET instructions being -// reported as non-protected, because LR state is reset at every label. 
- .globl good_ret_nocfg .type good_ret_nocfg,@function good_ret_nocfg: @@ -541,14 +537,12 @@ good_branch_nocfg: .type good_load_other_reg_nocfg,@function good_load_other_reg_nocfg: // CHECK-NOT: good_load_other_reg_nocfg - paciasp adr x2, 1f br x2 1: autia x0, x1 ldr x2, [x0] - autiasp ret .size good_load_other_reg_nocfg, .-good_load_other_reg_nocfg @@ -556,14 +550,12 @@ good_load_other_reg_nocfg: .type good_load_same_reg_nocfg,@function good_load_same_reg_nocfg: // CHECK-NOT: good_load_same_reg_nocfg - paciasp adr x2, 1f br x2 1: autia x0, x1 ldr x0, [x0] - autiasp ret .size good_load_same_reg_nocfg, .-good_load_same_reg_nocfg @@ -575,13 +567,11 @@ bad_unchecked_nocfg: // CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unchecked_nocfg, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autia x0, x1 // CHECK-NEXT: The 0 instructions that leak the affected registers are: - paciasp adr x2, 1f br x2 1: autia x0, x1 - autiasp ret .size bad_unchecked_nocfg, .-bad_unchecked_nocfg @@ -615,7 +605,6 @@ bad_unknown_usage_read_nocfg: // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autia x0, x1 // CHECK-NEXT: The 1 instructions that leak the affected registers are: // CHECK-NEXT: 1. {{[0-9a-f]+}}: mul x3, x0, x1 - paciasp adr x2, 1f br x2 1: @@ -623,7 +612,6 @@ bad_unknown_usage_read_nocfg: mul x3, x0, x1 ldr x2, [x0] - autiasp ret .size bad_unknown_usage_read_nocfg, .-bad_unknown_usage_read_nocfg @@ -634,7 +622,6 @@ bad_unknown_usage_subreg_read_nocfg: // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autia x0, x1 // CHECK-NEXT: The 1 instructions that leak the affected registers are: // CHECK-NEXT: 1. 
{{[0-9a-f]+}}: mul w3, w0, w1 - paciasp adr x2, 1f br x2 1: @@ -642,7 +629,6 @@ bad_unknown_usage_subreg_read_nocfg: mul w3, w0, w1 ldr x2, [x0] - autiasp ret .size bad_unknown_usage_subreg_read_nocfg, .-bad_unknown_usage_subreg_read_nocfg @@ -653,7 +639,6 @@ bad_unknown_usage_update_nocfg: // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autia x0, x1 // CHECK-NEXT: The 1 instructions that leak the affected registers are: // CHECK-NEXT: 1. {{[0-9a-f]+}}: movk x0, #0x2a, lsl #16 - paciasp adr x2, 1f br x2 1: @@ -661,7 +646,6 @@ bad_unknown_usage_update_nocfg: movk x0, #42, lsl #16 // does not overwrite x0 completely ldr x2, [x0] - autiasp ret .size bad_unknown_usage_update_nocfg, .-bad_unknown_usage_update_nocfg @@ -669,14 +653,12 @@ bad_unknown_usage_update_nocfg: .type good_overwrite_with_constant_nocfg,@function good_overwrite_with_constant_nocfg: // CHECK-NOT: good_overwrite_with_constant_nocfg - paciasp adr x2, 1f br x2 1: autia x0, x1 mov x0, #42 - autiasp ret .size good_overwrite_with_constant_nocfg, .-good_overwrite_with_constant_nocfg @@ -684,7 +666,6 @@ good_overwrite_with_constant_nocfg: .type good_address_arith_nocfg,@function good_address_arith_nocfg: // CHECK-NOT: good_address_arith_nocfg - paciasp adr x2, 1f br x2 1: @@ -698,7 +679,6 @@ good_address_arith_nocfg: mov x1, #0 mov x2, #0 - autiasp ret .size good_address_arith_nocfg, .-good_address_arith_nocfg diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s b/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s index c79c5926a05cd..fb0bc7cff2377 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s @@ -1428,6 +1428,90 @@ printed_instrs_nocfg: br x0 .size printed_instrs_nocfg, .-printed_instrs_nocfg +// Test handling of unreachable basic blocks. +// +// Basic blocks without any predecessors were observed in real-world optimized +// code. 
At least sometimes they were actually reachable via jump table, which +// was not detected, but the function was processed as if its CFG was +// reconstructed successfully. +// +// As a more predictable model example, let's use really unreachable code +// for testing. + + .globl bad_unreachable_call + .type bad_unreachable_call,@function +bad_unreachable_call: +// CHECK-LABEL: GS-PAUTH: Warning: possibly imprecise CFG, the analysis quality may be degraded in this function. According to BOLT, unreachable code is found in function bad_unreachable_call, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NOT: instructions that write to the affected registers after any authentication are: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_unreachable_call, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + b 1f + // unreachable basic block: + blr x0 + +1: // reachable basic block: + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_unreachable_call, .-bad_unreachable_call + + .globl good_unreachable_call + .type good_unreachable_call,@function +good_unreachable_call: +// CHECK-NOT: non-protected call{{.*}}good_unreachable_call +// CHECK-LABEL: GS-PAUTH: Warning: possibly imprecise CFG, the analysis quality may be degraded in this function. According to BOLT, unreachable code is found in function good_unreachable_call, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autia x0, x1 +// CHECK-NOT: instructions that write to the affected registers after any authentication are: +// CHECK-NOT: non-protected call{{.*}}good_unreachable_call + paciasp + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + + b 1f + // unreachable basic block: + autia x0, x1 + blr x0 // <-- this call is definitely protected provided at least + // basic block boundaries are detected correctly + +1: // reachable basic block: + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_unreachable_call, .-good_unreachable_call + + .globl unreachable_loop_of_bbs + .type unreachable_loop_of_bbs,@function +unreachable_loop_of_bbs: +// CHECK-NOT: unreachable basic blocks{{.*}}unreachable_loop_of_bbs +// CHECK-NOT: non-protected call{{.*}}unreachable_loop_of_bbs +// CHECK-LABEL: GS-PAUTH: Warning: possibly imprecise CFG, the analysis quality may be degraded in this function. According to BOLT, unreachable code is found in function unreachable_loop_of_bbs, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NOT: unreachable basic blocks{{.*}}unreachable_loop_of_bbs +// CHECK-NOT: non-protected call{{.*}}unreachable_loop_of_bbs + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + b .Lreachable_epilogue_bb + +.Lfirst_unreachable_bb: + blr x0 // <-- this call is not analyzed + b .Lsecond_unreachable_bb +.Lsecond_unreachable_bb: + blr x1 // <-- this call is not analyzed + b .Lfirst_unreachable_bb + +.Lreachable_epilogue_bb: + ldp x29, x30, [sp], #16 + autiasp + ret + .size unreachable_loop_of_bbs, .-unreachable_loop_of_bbs + .globl main .type main,@function main: diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s index fbb96a63d41ed..b1cec7f92ad05 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s @@ -199,8 +199,8 @@ nocfg: // CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( br x0, src-state) // CHECK-NEXT: .. 
result: (src-state) // CHECK-NEXT: Due to label, resetting the state before: 00000000: ret # Offset: 8 -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) -// CHECK-NEXT: .. result: (src-state) +// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) +// CHECK-NEXT: .. result: (src-state) // CHECK-NEXT: After src register safety analysis: // CHECK-NEXT: Binary Function "nocfg" { // CHECK-NEXT: Number : 3 @@ -223,33 +223,7 @@ nocfg: // PAUTH-NEXT: SafeToDerefRegs: LR W0 W30 X0 W0_HI W30_HI{{[ \t]*$}} // CHECK-NEXT: Found RET inst: 00000000: ret # Offset: 8 # CFGUnawareSrcSafetyAnalysis: src-state // CHECK-NEXT: RetReg: LR -// CHECK-NEXT: SafeToDerefRegs:{{[ \t]*$}} -// CHECK-EMPTY: -// CHECK-NEXT: Running detailed src register safety analysis... -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( adr x0, __ENTRY_nocfg@0x[[ENTRY_ADDR]], src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( br x0, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: Due to label, resetting the state before: 00000000: ret # Offset: 8 -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: After detailed src register safety analysis: -// CHECK-NEXT: Binary Function "nocfg" { -// CHECK-NEXT: Number : 3 -// ... 
-// CHECK: Secondary Entry Points : __ENTRY_nocfg@0x[[ENTRY_ADDR]] -// CHECK-NEXT: } -// CHECK-NEXT: .{{[A-Za-z0-9]+}}: -// CHECK-NEXT: 00000000: adr x0, __ENTRY_nocfg@0x[[ENTRY_ADDR]] # CFGUnawareSrcSafetyAnalysis: src-state -// CHECK-NEXT: 00000004: br x0 # UNKNOWN CONTROL FLOW # Offset: 4 # CFGUnawareSrcSafetyAnalysis: src-state -// CHECK-NEXT: __ENTRY_nocfg@0x[[ENTRY_ADDR]] (Entry Point): -// CHECK-NEXT: .{{[A-Za-z0-9]+}}: -// CHECK-NEXT: 00000008: ret # Offset: 8 # CFGUnawareSrcSafetyAnalysis: src-state -// CHECK-NEXT: DWARF CFI Instructions: -// CHECK-NEXT: -// CHECK-NEXT: End of Function "nocfg" -// CHECK-EMPTY: -// CHECK-NEXT: Attaching clobbering info to: 00000000: ret # Offset: 8 # CFGUnawareSrcSafetyAnalysis: src-state +// CHECK-NEXT: SafeToDerefRegs: LR W30 W30_HI{{[ \t]*$}} .globl auth_oracle .type auth_oracle,@function diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s b/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s index 334a4108d8ab8..3a4d383ec5bc6 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s @@ -505,21 +505,16 @@ bad_one_auted_one_checked_multi_bb: // * untrusted: not even s-t-d - from arg and from memory // * untrusted: subreg clobbered - between address materialization and use, between auth and check, between check and use // * untrusted: first checked then auted, auted then auted, checked then checked -// -// Note that it is important to sign and authenticate LR, as it is not kept -// safe-to-dereference across unconditional branches. 
.globl good_sign_addr_mat_nocfg .type good_sign_addr_mat_nocfg,@function good_sign_addr_mat_nocfg: // CHECK-NOT: good_sign_addr_mat_nocfg - paciasp adr x3, 1f br x3 1: adr x0, sym pacda x0, x1 - autiasp ret .size good_sign_addr_mat_nocfg, .-good_sign_addr_mat_nocfg @@ -527,14 +522,12 @@ good_sign_addr_mat_nocfg: .type good_sign_auted_checked_ldr_nocfg,@function good_sign_auted_checked_ldr_nocfg: // CHECK-NOT: good_sign_auted_checked_ldr_nocfg - paciasp adr x3, 1f br x3 1: autda x0, x2 ldr x2, [x0] pacda x0, x1 - autiasp ret .size good_sign_auted_checked_ldr_nocfg, .-good_sign_auted_checked_ldr_nocfg @@ -544,13 +537,11 @@ bad_sign_authed_unchecked_nocfg: // CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_sign_authed_unchecked_nocfg, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: - paciasp adr x3, 1f br x3 1: autda x0, x2 pacda x0, x1 - autiasp ret .size bad_sign_authed_unchecked_nocfg, .-bad_sign_authed_unchecked_nocfg @@ -560,13 +551,11 @@ bad_sign_checked_not_auted_nocfg: // CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_sign_checked_not_auted_nocfg, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: - paciasp adr x3, 1f br x3 1: ldr x2, [x0] pacda x0, x1 - autiasp ret .size bad_sign_checked_not_auted_nocfg, .-bad_sign_checked_not_auted_nocfg @@ -576,12 +565,10 @@ bad_sign_plain_arg_nocfg: // CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_sign_plain_arg_nocfg, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: - paciasp adr x3, 1f br x3 1: pacda x0, x1 - autiasp ret .size bad_sign_plain_arg_nocfg, .-bad_sign_plain_arg_nocfg @@ -592,13 +579,11 @@ 
bad_sign_plain_mem_nocfg: // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: // CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x0, [x1] - paciasp adr x3, 1f br x3 1: ldr x0, [x1] pacda x0, x1 - autiasp ret .size bad_sign_plain_mem_nocfg, .-bad_sign_plain_mem_nocfg @@ -609,14 +594,12 @@ bad_clobber_between_addr_mat_and_use_nocfg: // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: // CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w0, w4 - paciasp adr x3, 1f br x3 1: adr x0, sym mov w0, w4 pacda x0, x1 - autiasp ret .size bad_clobber_between_addr_mat_and_use_nocfg, .-bad_clobber_between_addr_mat_and_use_nocfg @@ -627,7 +610,6 @@ bad_clobber_between_auted_and_checked_nocfg: // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: // CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w0, w4 - paciasp adr x3, 1f br x3 1: @@ -635,7 +617,6 @@ bad_clobber_between_auted_and_checked_nocfg: mov w0, w4 ldr x2, [x0] pacda x0, x1 - autiasp ret .size bad_clobber_between_auted_and_checked_nocfg, .-bad_clobber_between_auted_and_checked_nocfg @@ -646,7 +627,6 @@ bad_clobber_between_checked_and_used_nocfg: // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: // CHECK-NEXT: 1. 
{{[0-9a-f]+}}: mov w0, w4 - paciasp adr x3, 1f br x3 1: @@ -654,7 +634,6 @@ bad_clobber_between_checked_and_used_nocfg: ldr x2, [x0] mov w0, w4 pacda x0, x1 - autiasp ret .size bad_clobber_between_checked_and_used_nocfg, .-bad_clobber_between_checked_and_used_nocfg @@ -664,14 +643,12 @@ bad_transition_check_then_auth_nocfg: // CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_transition_check_then_auth_nocfg, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: - paciasp adr x3, 1f br x3 1: ldr x2, [x0] autda x0, x2 pacda x0, x1 - autiasp ret .size bad_transition_check_then_auth_nocfg, .-bad_transition_check_then_auth_nocfg @@ -681,14 +658,12 @@ bad_transition_auth_then_auth_nocfg: // CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_transition_auth_then_auth_nocfg, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: - paciasp adr x3, 1f br x3 1: autda x0, x2 autda x0, x2 pacda x0, x1 - autiasp ret .size bad_transition_auth_then_auth_nocfg, .-bad_transition_auth_then_auth_nocfg @@ -698,14 +673,12 @@ bad_transition_check_then_check_nocfg: // CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_transition_check_then_check_nocfg, at address // CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 // CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: - paciasp adr x3, 1f br x3 1: ldr x2, [x0] ldr x2, [x0] pacda x0, x1 - autiasp ret .size bad_transition_check_then_check_nocfg, .-bad_transition_check_then_check_nocfg diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index 9eeba867f5211..88d2f2c388d07 100644 --- 
a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -72,7 +72,9 @@ SizeofExpressionCheck::SizeofExpressionCheck(StringRef Name, Options.get("WarnOnSizeOfPointerToAggregate", true)), WarnOnSizeOfPointer(Options.get("WarnOnSizeOfPointer", false)), WarnOnOffsetDividedBySizeOf( - Options.get("WarnOnOffsetDividedBySizeOf", true)) {} + Options.get("WarnOnOffsetDividedBySizeOf", true)), + WarnOnSizeOfInLoopTermination( + Options.get("WarnOnSizeOfInLoopTermination", true)) {} void SizeofExpressionCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "WarnOnSizeOfConstant", WarnOnSizeOfConstant); @@ -86,6 +88,8 @@ void SizeofExpressionCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "WarnOnSizeOfPointer", WarnOnSizeOfPointer); Options.store(Opts, "WarnOnOffsetDividedBySizeOf", WarnOnOffsetDividedBySizeOf); + Options.store(Opts, "WarnOnSizeOfInLoopTermination", + WarnOnSizeOfInLoopTermination); } void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) { @@ -93,6 +97,13 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) { // Some of the checks should not match in template code to avoid false // positives if sizeof is applied on template argument. 
+ auto LoopCondExpr = + [](const ast_matchers::internal::Matcher &InnerMatcher) { + return stmt(anyOf(forStmt(hasCondition(InnerMatcher)), + whileStmt(hasCondition(InnerMatcher)), + doStmt(hasCondition(InnerMatcher)))); + }; + const auto IntegerExpr = ignoringParenImpCasts(integerLiteral()); const auto ConstantExpr = ignoringParenImpCasts( anyOf(integerLiteral(), unaryOperator(hasUnaryOperand(IntegerExpr)), @@ -130,6 +141,14 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) { this); } + if (WarnOnSizeOfInLoopTermination) { + auto CondExpr = binaryOperator( + allOf(has(SizeOfExpr.bind("sizeof-expr")), isComparisonOperator())); + Finder->addMatcher(LoopCondExpr(anyOf(CondExpr, hasDescendant(CondExpr))) + .bind("loop-expr"), + this); + } + // Detect sizeof(kPtr) where kPtr is 'const char* kPtr = "abc"'; const auto CharPtrType = pointerType(pointee(isAnyCharacter())); const auto ConstStrLiteralDecl = @@ -349,6 +368,23 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { diag(E->getBeginLoc(), "suspicious usage of 'sizeof(char*)'; do you mean 'strlen'?") << E->getSourceRange(); + } else if (Result.Nodes.getNodeAs("loop-expr")) { + auto *SizeofArgTy = Result.Nodes.getNodeAs("sizeof-arg-type"); + if (const auto member = dyn_cast(SizeofArgTy)) + SizeofArgTy = member->getPointeeType().getTypePtr(); + + const auto *SzOfExpr = Result.Nodes.getNodeAs("sizeof-expr"); + + if (const auto type = dyn_cast(SizeofArgTy)) { + // check if the array element size is larger than one. 
If true, + // the size of the array is higher than the number of elements + CharUnits sSize = Ctx.getTypeSizeInChars(type->getElementType()); + if (!sSize.isOne()) { + diag(SzOfExpr->getBeginLoc(), + "suspicious usage of 'sizeof' in the loop") + << SzOfExpr->getSourceRange(); + } + } } else if (const auto *E = Result.Nodes.getNodeAs("sizeof-pointer")) { diag(E->getBeginLoc(), "suspicious usage of 'sizeof()' on an expression " "of pointer type") diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h index fbd62cb80fb2d..e979b4723cf2e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h @@ -32,6 +32,7 @@ class SizeofExpressionCheck : public ClangTidyCheck { const bool WarnOnSizeOfPointerToAggregate; const bool WarnOnSizeOfPointer; const bool WarnOnOffsetDividedBySizeOf; + const bool WarnOnSizeOfInLoopTermination; }; } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index fc51f3c9329ad..934d52086b3b9 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -173,6 +173,11 @@ Changes in existing checks ` check to detect conversion in argument of ``std::make_optional``. +- Improved :doc: `bugprone-sizeof-expression + ` check by adding + `WarnOnSizeOfInLoopTermination` option to detect misuses of ``sizeof`` + expression in loop conditions. 
+ - Improved :doc:`bugprone-string-constructor ` check to find suspicious calls of ``std::string`` constructor with char pointer, start position and diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst index 29edb26ed7aa2..04824cc1fe0e4 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst @@ -316,3 +316,12 @@ Options When `true`, the check will warn on pointer arithmetic where the element count is obtained from a division with ``sizeof(...)``, e.g., ``Ptr + Bytes / sizeof(*T)``. Default is `true`. + +.. option:: WarnOnSizeOfInLoopTermination + + When `true`, the check will warn about incorrect use of sizeof expression + in loop termination condition. The warning triggers if the ``sizeof`` + expression appears to be incorrectly used to determine the number of + array/buffer elements. + e.g, ``long arr[10]; for(int i = 0; i < sizeof(arr); i++) { ... }``. Default + is `true`. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp index 5e6f394152e9d..33cf1cbea8377 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp @@ -164,6 +164,69 @@ int Test2(MyConstChar* A) { return sum; } +struct A { + int array[10]; +}; + +struct B { + struct A a; +}; + +void loop_access_elements(int num, struct B b) { + struct A arr[10]; + char buf[20]; + + // CHECK-MESSAGES: :[[@LINE+1]]:22: warning: suspicious usage of 'sizeof' in the loop [bugprone-sizeof-expression] + for(int i = 0; i < sizeof(arr); i++) { + struct A a = arr[i]; + } + + // Loop warning should not trigger here, even though this code is incorrect + // CHECK-MESSAGES: :[[@LINE+2]]:22: warning: suspicious usage of 'sizeof(K)'; did you mean 'K'? [bugprone-sizeof-expression] + // CHECK-MESSAGES: :[[@LINE+1]]:32: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator [bugprone-sizeof-expression] + for(int i = 0; i < sizeof(10)/sizeof(A); i++) { + struct A a = arr[i]; + } + + // Should not warn here + for(int i = 0; i < sizeof(arr)/sizeof(A); i++) {} + + // Should not warn here + for (int i = 0; i < 10; i++) { + if (sizeof(arr) != 0) { + + } + } + + for (int i = 0; i < 10; i++) { + // CHECK-MESSAGES: :[[@LINE+1]]:25: warning: suspicious usage of 'sizeof' in the loop [bugprone-sizeof-expression] + for (int j = 0; j < sizeof(arr); j++) { + } + } + + // CHECK-MESSAGES: :[[@LINE+1]]:22: warning: suspicious usage of 'sizeof' in the loop [bugprone-sizeof-expression] + for(int j = 0; j < sizeof(b.a.array); j++) {} + + // Should not warn here + for(int i = 0; i < sizeof(buf); i++) {} + + // Should not warn here + for(int i = 0; i < (sizeof(arr) << 3); i++) {} + + int i = 0; + // CHECK-MESSAGES: :[[@LINE+1]]:14: warning: suspicious usage of 
'sizeof' in the loop [bugprone-sizeof-expression] + while(i <= sizeof(arr)) {i++;} + + i = 0; + do { + i++; + // CHECK-MESSAGES: :[[@LINE+1]]:16: warning: suspicious usage of 'sizeof' in the loop [bugprone-sizeof-expression] + } while(i <= sizeof(arr)); + + // CHECK-MESSAGES: :[[@LINE+1]]:29: warning: suspicious usage of 'sizeof' in the loop [bugprone-sizeof-expression] + for(int i = 0, j = 0; i < sizeof(arr) && j < sizeof(buf); i++, j++) {} +} + template int Foo() { int A[T]; return sizeof(T); } // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: suspicious usage of 'sizeof(K)' diff --git a/clang/bindings/python/tests/cindex/test_file.py b/clang/bindings/python/tests/cindex/test_file.py index a8c1dbf558543..2be9b9e332611 100644 --- a/clang/bindings/python/tests/cindex/test_file.py +++ b/clang/bindings/python/tests/cindex/test_file.py @@ -9,6 +9,7 @@ inputs_dir = os.path.join(os.path.dirname(__file__), "INPUTS") + class TestFile(unittest.TestCase): def test_file(self): index = Index.create() diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e1fe22393eebb..ec1e1e7334d90 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -329,6 +329,9 @@ Non-comprehensive list of changes in this release ``__reference_constructs_from_temporary`` should be used instead. (#GH44056) - Added `__builtin_get_vtable_pointer` to directly load the primary vtable pointer from a polymorphic object. +- ``libclang`` receives a family of new bindings to query basic facts about + GCC-style inline assembly blocks, including whether the block is ``volatile`` + and its template string following the LLVM IR ``asm`` format. (#GH143424) - Clang no longer rejects reinterpret_cast conversions between indirect ARC-managed pointers and other pointer types. The prior behavior was overly strict and inconsistent with the ARC specification. 
@@ -644,7 +647,7 @@ Improvements to Clang's diagnostics #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490, #GH36703, #GH32903, #GH23312, #GH69874. - + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index e4cb4327fbaac..c35311c886413 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -36,7 +36,7 @@ #define CINDEX_VERSION_MAJOR 0 #define CINDEX_VERSION_MINOR 64 -#define CINDEX_VERSION_ENCODE(major, minor) (((major)*10000) + ((minor)*1)) +#define CINDEX_VERSION_ENCODE(major, minor) (((major) * 10000) + ((minor) * 1)) #define CINDEX_VERSION \ CINDEX_VERSION_ENCODE(CINDEX_VERSION_MAJOR, CINDEX_VERSION_MINOR) @@ -4495,6 +4495,129 @@ CINDEX_LINKAGE CXStringSet *clang_Cursor_getCXXManglings(CXCursor); */ CINDEX_LINKAGE CXStringSet *clang_Cursor_getObjCManglings(CXCursor); +/** + * @} + */ + +/** + * \defgroup CINDEX_MODULE Inline Assembly introspection + * + * The functions in this group provide access to information about GCC-style + * inline assembly statements. + * + * @{ + */ + +/** + * Given a CXCursor_GCCAsmStmt cursor, return the assembly template string. + * As per LLVM IR Assembly Template language, template placeholders for + * inputs and outputs are either of the form $N where N is a decimal number + * as an index into the input-output specification, + * or ${N:M} where N is a decimal number also as an index into the + * input-output specification and M is the template argument modifier. + * The index N in both cases points into the the total inputs and outputs, + * or more specifically, into the list of outputs followed by the inputs, + * starting from index 0 as the first available template argument. + * + * This function also returns a valid empty string if the cursor does not point + * at a GCC inline assembly block. 
+ * + * Users are responsible for releasing the allocation of returned string via + * \c clang_disposeString. + */ + +CINDEX_LINKAGE CXString clang_Cursor_getGCCAssemblyTemplate(CXCursor); + +/** + * Given a CXCursor_GCCAsmStmt cursor, check if the assembly block has goto + * labels. + * This function also returns 0 if the cursor does not point at a GCC inline + * assembly block. + */ + +CINDEX_LINKAGE unsigned clang_Cursor_isGCCAssemblyHasGoto(CXCursor); + +/** + * Given a CXCursor_GCCAsmStmt cursor, count the number of outputs. + * This function also returns 0 if the cursor does not point at a GCC inline + * assembly block. + */ + +CINDEX_LINKAGE unsigned clang_Cursor_getGCCAssemblyNumOutputs(CXCursor); + +/** + * Given a CXCursor_GCCAsmStmt cursor, count the number of inputs. + * This function also returns 0 if the cursor does not point at a GCC inline + * assembly block. + */ + +CINDEX_LINKAGE unsigned clang_Cursor_getGCCAssemblyNumInputs(CXCursor); + +/** + * Given a CXCursor_GCCAsmStmt cursor, get the constraint and expression cursor + * to the Index-th input. + * This function returns 1 when the cursor points at a GCC inline assembly + * statement, `Index` is within bounds and both the `Constraint` and `Expr` are + * not NULL. + * Otherwise, this function returns 0 but leaves `Constraint` and `Expr` + * intact. + * + * Users are responsible for releasing the allocation of `Constraint` via + * \c clang_disposeString. + */ + +CINDEX_LINKAGE unsigned clang_Cursor_getGCCAssemblyInput(CXCursor Cursor, + unsigned Index, + CXString *Constraint, + CXCursor *Expr); + +/** + * Given a CXCursor_GCCAsmStmt cursor, get the constraint and expression cursor + * to the Index-th output. + * This function returns 1 when the cursor points at a GCC inline assembly + * statement, `Index` is within bounds and both the `Constraint` and `Expr` are + * not NULL. + * Otherwise, this function returns 0 but leaves `Constraint` and `Expr` + * intact. 
+ * + * Users are responsible for releasing the allocation of `Constraint` via + * \c clang_disposeString. + */ + +CINDEX_LINKAGE unsigned clang_Cursor_getGCCAssemblyOutput(CXCursor Cursor, + unsigned Index, + CXString *Constraint, + CXCursor *Expr); + +/** + * Given a CXCursor_GCCAsmStmt cursor, count the clobbers in it. + * This function also returns 0 if the cursor does not point at a GCC inline + * assembly block. + */ + +CINDEX_LINKAGE unsigned clang_Cursor_getGCCAssemblyNumClobbers(CXCursor Cursor); + +/** + * Given a CXCursor_GCCAsmStmt cursor, get the Index-th clobber of it. + * This function returns a valid empty string if the cursor does not point + * at a GCC inline assembly block or `Index` is out of bounds. + * + * Users are responsible for releasing the allocation of returned string via + * \c clang_disposeString. + */ + +CINDEX_LINKAGE CXString clang_Cursor_getGCCAssemblyClobber(CXCursor Cursor, + unsigned Index); + +/** + * Given a CXCursor_GCCAsmStmt cursor, check if the inline assembly is + * `volatile`. + * This function returns 0 if the cursor does not point at a GCC inline + * assembly block. 
+ */ + +CINDEX_LINKAGE unsigned clang_Cursor_isGCCAssemblyVolatile(CXCursor Cursor); + /** * @} */ diff --git a/clang/include/clang/Analysis/FlowSensitive/StorageLocation.h b/clang/include/clang/Analysis/FlowSensitive/StorageLocation.h index 8b263b16d5b1e..8fcc6a44027a0 100644 --- a/clang/include/clang/Analysis/FlowSensitive/StorageLocation.h +++ b/clang/include/clang/Analysis/FlowSensitive/StorageLocation.h @@ -168,8 +168,6 @@ class RecordStorageLocation final : public StorageLocation { return {Children.begin(), Children.end()}; } - bool hasChild(const ValueDecl &D) const { return Children.contains(&D); } - private: FieldToLoc Children; SyntheticFieldMap SyntheticFields; diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def index 8867a9fe09fb9..909e35792b461 100644 --- a/clang/include/clang/Basic/BuiltinsAArch64.def +++ b/clang/include/clang/Basic/BuiltinsAArch64.def @@ -155,6 +155,13 @@ TARGET_HEADER_BUILTIN(_InterlockedIncrement64, "LLiLLiD*", "nh", INTRIN_H, TARGET_HEADER_BUILTIN(_InterlockedOr64, "LLiLLiD*LLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") TARGET_HEADER_BUILTIN(_InterlockedXor64, "LLiLLiD*LLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(_InterlockedAdd_acq, "NiNiD*Ni", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(_InterlockedAdd_rel, "NiNiD*Ni", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(_InterlockedAdd_nf, "NiNiD*Ni", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(_InterlockedAdd64_acq, "LLiLLiD*LLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(_InterlockedAdd64_rel, "LLiLLiD*LLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(_InterlockedAdd64_nf, "LLiLLiD*LLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") + TARGET_HEADER_BUILTIN(_InterlockedExchangeAdd_acq, "NiNiD*Ni", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") TARGET_HEADER_BUILTIN(_InterlockedExchangeAdd_rel, "NiNiD*Ni", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") 
TARGET_HEADER_BUILTIN(_InterlockedExchangeAdd_nf, "NiNiD*Ni", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index edb3a17ac07c6..1d1f5a4ee3f9f 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -642,5 +642,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16 TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index 099500754a0e0..7c278d6841c74 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -535,6 +535,12 @@ TARGET_BUILTIN(__builtin_ppc_bcdadd_p, "iiV16UcV16Uc", "", TARGET_BUILTIN(__builtin_ppc_bcdsub_p, "iiV16UcV16Uc", "", "isa-v207-instructions") +// P9 Binary-coded decimal (BCD) builtins. 
+TARGET_BUILTIN(__builtin_ppc_national2packed, "V16UcV16UcUc", "t", "power9-vector") +TARGET_BUILTIN(__builtin_ppc_packed2national, "V16UcV16Uc", "", "power9-vector") +TARGET_BUILTIN(__builtin_ppc_packed2zoned, "V16UcV16UcUc", "t", "power9-vector") +TARGET_BUILTIN(__builtin_ppc_zoned2packed, "V16UcV16UcUc", "t", "power9-vector") + TARGET_BUILTIN(__builtin_altivec_vclzlsbb, "SiV16Uc", "", "power9-vector") TARGET_BUILTIN(__builtin_altivec_vctzlsbb, "SiV16Uc", "", "power9-vector") TARGET_BUILTIN(__builtin_altivec_vprtybw, "V4UiV4Ui", "", "power9-vector") diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 9392cbb39c021..5062505cf3c01 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -1767,7 +1767,9 @@ def note_unsatisfied_trait : Note<"%0 is not %enum_select{" "%TriviallyRelocatable{trivially relocatable}|" "%Replaceable{replaceable}|" - "%TriviallyCopyable{trivially copyable}" + "%TriviallyCopyable{trivially copyable}|" + "%Empty{empty}|" + "%StandardLayout{standard-layout}" "}1">; def note_unsatisfied_trait_reason @@ -1787,6 +1789,16 @@ def note_unsatisfied_trait_reason "%NonReplaceableField{has a non-replaceable member %1 of type %2}|" "%NTCBase{has a non-trivially-copyable base %1}|" "%NTCField{has a non-trivially-copyable member %1 of type %2}|" + "%NonEmptyMember{has a non-static data member %1 of type %2}|" + "%VirtualFunction{has a virtual function %1}|" + "%NonEmptyBase{has a base class %1 that is not empty}|" + "%NonZeroLengthField{field %1 is a non-zero-length bit-field}|" + "%NonStandardLayoutBase{has a non-standard-layout base %1}|" + "%MixedAccess{has mixed access specifiers}|" + "%MixedAccessField{field %1 has a different access specifier than field %2}|" + "%MultipleDataBase{has multiple base classes with data members}|" + "%NonStandardLayoutMember{has a non-standard-layout member %1 of type %2}|" + 
"%IndirectBaseWithFields{has an indirect base %1 with data members}|" "%DeletedDtr{has a %select{deleted|user-provided}1 destructor}|" "%UserProvidedCtr{has a user provided %select{copy|move}1 " "constructor}|" diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index decba83251df2..ef77c46b011f7 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -1737,16 +1737,40 @@ def GetMemberOp : CIR_Op<"get_member"> { def FuncOp : CIR_Op<"func", [ AutomaticAllocationScope, CallableOpInterface, FunctionOpInterface, + DeclareOpInterfaceMethods, IsolatedFromAbove ]> { let summary = "Declare or define a function"; let description = [{ The `cir.func` operation defines a function, similar to the `mlir::FuncOp` built-in. + + The function linkage information is specified by `linkage`, as defined by + `GlobalLinkageKind` attribute. + + Example: + + ```mlir + // External function definitions. + cir.func @abort() + + // A function with internal linkage. + cir.func internal @count(%x: i64) -> (i64) + return %x : i64 + + // Linkage information + cir.func linkonce_odr @some_method(...) + ``` }]; let arguments = (ins SymbolNameAttr:$sym_name, + CIR_VisibilityAttr:$global_visibility, TypeAttrOf:$function_type, + UnitAttr:$dso_local, + DefaultValuedAttr:$linkage, + OptionalAttr:$sym_visibility, + UnitAttr:$comdat, OptionalAttr:$arg_attrs, OptionalAttr:$res_attrs); @@ -1754,8 +1778,10 @@ def FuncOp : CIR_Op<"func", [ let skipDefaultBuilders = 1; - let builders = [OpBuilder<(ins "llvm::StringRef":$sym_name, - "FuncType":$type)>]; + let builders = [OpBuilder<(ins + "llvm::StringRef":$sym_name, "FuncType":$type, + CArg<"cir::GlobalLinkageKind", "cir::GlobalLinkageKind::ExternalLinkage">:$linkage) + >]; let extraClassDeclaration = [{ /// Returns the region on the current operation that is callable. 
This may @@ -2371,6 +2397,64 @@ def ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> { let hasFolder = 1; } +//===----------------------------------------------------------------------===// +// ComplexRealOp +//===----------------------------------------------------------------------===// + +def ComplexRealOp : CIR_Op<"complex.real", [Pure]> { + let summary = "Extract the real part of a complex value"; + let description = [{ + `cir.complex.real` operation takes an operand of `!cir.complex` type and + yields the real part of it. + + Example: + + ```mlir + %1 = cir.complex.real %0 : !cir.complex -> !cir.float + ``` + }]; + + let results = (outs CIR_AnyIntOrFloatType:$result); + let arguments = (ins CIR_ComplexType:$operand); + + let assemblyFormat = [{ + $operand `:` qualified(type($operand)) `->` qualified(type($result)) + attr-dict + }]; + + let hasVerifier = 1; + let hasFolder = 1; +} + +//===----------------------------------------------------------------------===// +// ComplexImagOp +//===----------------------------------------------------------------------===// + +def ComplexImagOp : CIR_Op<"complex.imag", [Pure]> { + let summary = "Extract the imaginary part of a complex value"; + let description = [{ + `cir.complex.imag` operation takes an operand of `!cir.complex` type and + yields the imaginary part of it. 
+ + Example: + + ```mlir + %1 = cir.complex.imag %0 : !cir.complex -> !cir.float + ``` + }]; + + let results = (outs CIR_AnyIntOrFloatType:$result); + let arguments = (ins CIR_ComplexType:$operand); + + let assemblyFormat = [{ + $operand `:` qualified(type($operand)) `->` qualified(type($result)) + attr-dict + }]; + + let hasVerifier = 1; + let hasFolder = 1; +} + //===----------------------------------------------------------------------===// // Assume Operations //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index fb5014a877151..9e8944d1114b8 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -72,16 +72,18 @@ struct MissingFeatures { // FuncOp handling static bool opFuncOpenCLKernelMetadata() { return false; } + static bool opFuncAstDeclAttr() { return false; } static bool opFuncCallingConv() { return false; } static bool opFuncExtraAttrs() { return false; } - static bool opFuncDsoLocal() { return false; } - static bool opFuncLinkage() { return false; } - static bool opFuncVisibility() { return false; } static bool opFuncNoProto() { return false; } static bool opFuncCPUAndFeaturesAttributes() { return false; } static bool opFuncSection() { return false; } - static bool opFuncSetComdat() { return false; } + static bool opFuncMultipleReturnVals() { return false; } static bool opFuncAttributesForDefinition() { return false; } + static bool opFuncMaybeHandleStaticInExternC() { return false; } + static bool opFuncGlobalAliases() { return false; } + static bool setLLVMFunctionFEnvAttributes() { return false; } + static bool setFunctionAttributes() { return false; } // CallOp handling static bool opCallPseudoDtor() { return false; } @@ -157,92 +159,92 @@ struct MissingFeatures { static bool addressPointerAuthInfo() { return false; } // Misc - static bool cirgenABIInfo() { return false; 
} static bool abiArgInfo() { return false; } - static bool tryEmitAsConstant() { return false; } - static bool constructABIArgDirectExtend() { return false; } - static bool opGlobalViewAttr() { return false; } - static bool lowerModeOptLevel() { return false; } - static bool opTBAA() { return false; } - static bool objCLifetime() { return false; } - static bool objCBlocks() { return false; } - static bool emitNullabilityCheck() { return false; } - static bool emitLValueAlignmentAssumption() { return false; } - static bool emitLifetimeMarkers() { return false; } - static bool astVarDeclInterface() { return false; } - static bool stackSaveOp() { return false; } + static bool addHeapAllocSiteMetadata() { return false; } static bool aggValueSlot() { return false; } - static bool aggValueSlotMayOverlap() { return false; } - static bool aggValueSlotVolatile() { return false; } - static bool aggValueSlotDestructedFlag() { return false; } static bool aggValueSlotAlias() { return false; } + static bool aggValueSlotDestructedFlag() { return false; } static bool aggValueSlotGC() { return false; } - static bool generateDebugInfo() { return false; } - static bool pointerOverflowSanitizer() { return false; } - static bool fpConstraints() { return false; } - static bool sanitizers() { return false; } - static bool addHeapAllocSiteMetadata() { return false; } - static bool targetCIRGenInfoArch() { return false; } - static bool targetCIRGenInfoOS() { return false; } - static bool targetCodeGenInfoGetNullPointer() { return false; } - static bool loopInfoStack() { return false; } - static bool requiresCleanups() { return false; } - static bool createProfileWeightsForLoop() { return false; } - static bool emitCondLikelihoodViaExpectIntrinsic() { return false; } - static bool pgoUse() { return false; } - static bool cgFPOptionsRAII() { return false; } - static bool metaDataNode() { return false; } - static bool fastMathFlags() { return false; } + static bool aggValueSlotMayOverlap() { 
return false; } + static bool aggValueSlotVolatile() { return false; } static bool alignCXXRecordDecl() { return false; } - static bool setNonGC() { return false; } - static bool incrementProfileCounter() { return false; } - static bool insertBuiltinUnpredictable() { return false; } - static bool objCGC() { return false; } - static bool weakRefReference() { return false; } - static bool hip() { return false; } - static bool setObjCGCLValueClass() { return false; } - static bool setDLLStorageClass() { return false; } - static bool openMP() { return false; } - static bool emitCheckedInBoundsGEP() { return false; } - static bool preservedAccessIndexRegion() { return false; } + static bool armComputeVolatileBitfields() { return false; } + static bool asmLabelAttr() { return false; } + static bool astVarDeclInterface() { return false; } + static bool attributeNoBuiltin() { return false; } static bool bitfields() { return false; } - static bool msabi() { return false; } - static bool typeChecks() { return false; } - static bool lambdaFieldToName() { return false; } - static bool moduleNameHash() { return false; } - static bool constantFoldSwitchStatement() { return false; } - static bool cudaSupport() { return false; } - static bool maybeHandleStaticInExternC() { return false; } + static bool builtinCall() { return false; } + static bool builtinCallF128() { return false; } + static bool builtinCallMathErrno() { return false; } + static bool cgFPOptionsRAII() { return false; } + static bool cirgenABIInfo() { return false; } + static bool cleanupAfterErrorDiags() { return false; } + static bool cleanupsToDeactivate() { return false; } static bool constEmitterArrayILE() { return false; } static bool constEmitterVectorILE() { return false; } - static bool needsGlobalCtorDtor() { return false; } - static bool emitTypeCheck() { return false; } - static bool writebacks() { return false; } - static bool cleanupsToDeactivate() { return false; } - static bool stackBase() { return 
false; } - static bool deferredCXXGlobalInit() { return false; } - static bool setTargetAttributes() { return false; } + static bool constantFoldSwitchStatement() { return false; } + static bool constructABIArgDirectExtend() { return false; } static bool coverageMapping() { return false; } - static bool peepholeProtection() { return false; } - static bool instrumentation() { return false; } - static bool cleanupAfterErrorDiags() { return false; } + static bool createProfileWeightsForLoop() { return false; } + static bool ctorMemcpyizer() { return false; } + static bool cudaSupport() { return false; } static bool cxxRecordStaticMembers() { return false; } - static bool isMemcpyEquivalentSpecialMember() { return false; } - static bool isTrivialCtorOrDtor() { return false; } + static bool dataLayoutTypeAllocSize() { return false; } + static bool deferredCXXGlobalInit() { return false; } + static bool emitCheckedInBoundsGEP() { return false; } + static bool emitCondLikelihoodViaExpectIntrinsic() { return false; } + static bool emitLifetimeMarkers() { return false; } + static bool emitLValueAlignmentAssumption() { return false; } + static bool emitNullabilityCheck() { return false; } + static bool emitTypeCheck() { return false; } + static bool fastMathFlags() { return false; } + static bool fpConstraints() { return false; } + static bool generateDebugInfo() { return false; } + static bool hip() { return false; } static bool implicitConstructorArgs() { return false; } + static bool incrementProfileCounter() { return false; } + static bool insertBuiltinUnpredictable() { return false; } + static bool instrumentation() { return false; } static bool intrinsics() { return false; } - static bool attributeNoBuiltin() { return false; } - static bool thunks() { return false; } - static bool runCleanupsScope() { return false; } + static bool isMemcpyEquivalentSpecialMember() { return false; } + static bool isTrivialCtorOrDtor() { return false; } + static bool lambdaFieldToName() 
{ return false; } + static bool loopInfoStack() { return false; } static bool lowerAggregateLoadStore() { return false; } - static bool dataLayoutTypeAllocSize() { return false; } - static bool asmLabelAttr() { return false; } - static bool builtinCall() { return false; } - static bool builtinCallF128() { return false; } - static bool builtinCallMathErrno() { return false; } + static bool lowerModeOptLevel() { return false; } + static bool maybeHandleStaticInExternC() { return false; } + static bool metaDataNode() { return false; } + static bool moduleNameHash() { return false; } + static bool msabi() { return false; } + static bool needsGlobalCtorDtor() { return false; } static bool nonFineGrainedBitfields() { return false; } - static bool armComputeVolatileBitfields() { return false; } - static bool ctorMemcpyizer() { return false; } + static bool objCBlocks() { return false; } + static bool objCGC() { return false; } + static bool objCLifetime() { return false; } + static bool openMP() { return false; } + static bool opGlobalViewAttr() { return false; } + static bool opTBAA() { return false; } + static bool peepholeProtection() { return false; } + static bool pgoUse() { return false; } + static bool pointerOverflowSanitizer() { return false; } + static bool preservedAccessIndexRegion() { return false; } + static bool requiresCleanups() { return false; } + static bool runCleanupsScope() { return false; } + static bool sanitizers() { return false; } + static bool setDLLStorageClass() { return false; } + static bool setNonGC() { return false; } + static bool setObjCGCLValueClass() { return false; } + static bool setTargetAttributes() { return false; } + static bool stackBase() { return false; } + static bool stackSaveOp() { return false; } + static bool targetCIRGenInfoArch() { return false; } + static bool targetCIRGenInfoOS() { return false; } + static bool targetCodeGenInfoGetNullPointer() { return false; } + static bool thunks() { return false; } + static bool 
tryEmitAsConstant() { return false; } + static bool typeChecks() { return false; } + static bool weakRefReference() { return false; } + static bool writebacks() { return false; } // Missing types static bool dataMemberType() { return false; } diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 0ec1cb4d0c5d8..dae12a6015439 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -2304,7 +2304,9 @@ class Preprocessor { /// Check whether the next pp-token is one of the specificed token kind. this /// method should have no observable side-effect on the lexed tokens. - template bool isNextPPTokenOneOf() { + template bool isNextPPTokenOneOf(Ts... Ks) { + static_assert(sizeof...(Ts) > 0, + "requires at least one tok::TokenKind specified"); // Do some quick tests for rejection cases. std::optional Val; if (CurLexer) @@ -2335,7 +2337,7 @@ class Preprocessor { // Okay, we found the token and return. Otherwise we found the end of the // translation unit. - return Val->is(K) || (... || Val->is(Ks)); + return Val->isOneOf(Ks...); } private: diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h index d4dfd7b44d9af..fc43e72593b94 100644 --- a/clang/include/clang/Lex/Token.h +++ b/clang/include/clang/Lex/Token.h @@ -101,11 +101,10 @@ class Token { /// "if (Tok.is(tok::l_brace)) {...}". bool is(tok::TokenKind K) const { return Kind == K; } bool isNot(tok::TokenKind K) const { return Kind != K; } - bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { - return is(K1) || is(K2); - } - template bool isOneOf(tok::TokenKind K1, Ts... Ks) const { - return is(K1) || isOneOf(Ks...); + template bool isOneOf(Ts... 
Ks) const { + static_assert(sizeof...(Ts) > 0, + "requires at least one tok::TokenKind specified"); + return (is(Ks) || ...); } /// Return true if this is a raw identifier (when lexing diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 7a4b7d21bb20e..7d4b4467eb97d 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -464,8 +464,6 @@ class ASTReader using ModuleReverseIterator = ModuleManager::ModuleReverseIterator; private: - using LocSeq = SourceLocationSequence; - /// The receiver of some callbacks invoked by ASTReader. std::unique_ptr Listener; @@ -2445,18 +2443,16 @@ class ASTReader /// Read a source location from raw form and return it in its /// originating module file's source location space. std::pair - ReadUntranslatedSourceLocation(RawLocEncoding Raw, - LocSeq *Seq = nullptr) const { - return SourceLocationEncoding::decode(Raw, Seq); + ReadUntranslatedSourceLocation(RawLocEncoding Raw) const { + return SourceLocationEncoding::decode(Raw); } /// Read a source location from raw form. - SourceLocation ReadSourceLocation(ModuleFile &MF, RawLocEncoding Raw, - LocSeq *Seq = nullptr) const { + SourceLocation ReadSourceLocation(ModuleFile &MF, RawLocEncoding Raw) const { if (!MF.ModuleOffsetMap.empty()) ReadModuleOffsetMap(MF); - auto [Loc, ModuleFileIndex] = ReadUntranslatedSourceLocation(Raw, Seq); + auto [Loc, ModuleFileIndex] = ReadUntranslatedSourceLocation(Raw); ModuleFile *OwningModuleFile = ModuleFileIndex == 0 ? &MF : MF.TransitiveImports[ModuleFileIndex - 1]; @@ -2484,9 +2480,9 @@ class ASTReader /// Read a source location. 
SourceLocation ReadSourceLocation(ModuleFile &ModuleFile, - const RecordDataImpl &Record, unsigned &Idx, - LocSeq *Seq = nullptr) { - return ReadSourceLocation(ModuleFile, Record[Idx++], Seq); + const RecordDataImpl &Record, + unsigned &Idx) { + return ReadSourceLocation(ModuleFile, Record[Idx++]); } /// Read a FileID. @@ -2505,7 +2501,7 @@ class ASTReader /// Read a source range. SourceRange ReadSourceRange(ModuleFile &F, const RecordData &Record, - unsigned &Idx, LocSeq *Seq = nullptr); + unsigned &Idx); static llvm::BitVector ReadBitVector(const RecordData &Record, const StringRef Blob); diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index da3f504ff27df..1472497ff5e7e 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -32,7 +32,6 @@ class OMPChildren; class ASTRecordReader : public serialization::DataStreamBasicReader { using ModuleFile = serialization::ModuleFile; - using LocSeq = SourceLocationSequence; ASTReader *Reader; ModuleFile *F; @@ -160,7 +159,7 @@ class ASTRecordReader TypeSourceInfo *readTypeSourceInfo(); /// Reads the location information for a type. - void readTypeLoc(TypeLoc TL, LocSeq *Seq = nullptr); + void readTypeLoc(TypeLoc TL); /// Map a local type ID within a given AST file to a global type ID. serialization::TypeID getGlobalTypeID(serialization::TypeID LocalID) const { @@ -287,13 +286,13 @@ class ASTRecordReader void readOpenACCRoutineDeclAttr(OpenACCRoutineDeclAttr *A); /// Read a source location, advancing Idx. - SourceLocation readSourceLocation(LocSeq *Seq = nullptr) { - return Reader->ReadSourceLocation(*F, Record, Idx, Seq); + SourceLocation readSourceLocation() { + return Reader->ReadSourceLocation(*F, Record, Idx); } /// Read a source range, advancing Idx. 
- SourceRange readSourceRange(LocSeq *Seq = nullptr) { - return Reader->ReadSourceRange(*F, Record, Idx, Seq); + SourceRange readSourceRange() { + return Reader->ReadSourceRange(*F, Record, Idx); } /// Read an arbitrary constant value, advancing Idx. diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h index 964c9e6ea8a25..ee005ec287708 100644 --- a/clang/include/clang/Serialization/ASTRecordWriter.h +++ b/clang/include/clang/Serialization/ASTRecordWriter.h @@ -29,7 +29,6 @@ class TypeLoc; /// An object for streaming information to a record. class ASTRecordWriter : public serialization::DataStreamBasicWriter { - using LocSeq = SourceLocationSequence; ASTWriter *Writer; ASTWriter::RecordDataImpl *Record; @@ -147,8 +146,8 @@ class ASTRecordWriter void AddFunctionDefinition(const FunctionDecl *FD); /// Emit a source location. - void AddSourceLocation(SourceLocation Loc, LocSeq *Seq = nullptr) { - return Writer->AddSourceLocation(Loc, *Record, Seq); + void AddSourceLocation(SourceLocation Loc) { + return Writer->AddSourceLocation(Loc, *Record); } void writeSourceLocation(SourceLocation Loc) { AddSourceLocation(Loc); @@ -174,8 +173,8 @@ class ASTRecordWriter } /// Emit a source range. - void AddSourceRange(SourceRange Range, LocSeq *Seq = nullptr) { - return Writer->AddSourceRange(Range, *Record, Seq); + void AddSourceRange(SourceRange Range) { + return Writer->AddSourceRange(Range, *Record); } void writeBool(bool Value) { @@ -245,7 +244,7 @@ class ASTRecordWriter void AddTypeSourceInfo(TypeSourceInfo *TInfo); /// Emits source location information for a type. Does not emit the type. - void AddTypeLoc(TypeLoc TL, LocSeq *Seq = nullptr); + void AddTypeLoc(TypeLoc TL); /// Emits a template argument location info. 
void AddTemplateArgumentLocInfo(TemplateArgument::ArgKind Kind, diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 97679ace8b610..162be84bbda19 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -115,8 +115,6 @@ class ASTWriter : public ASTDeserializationListener, using TypeIdxMap = llvm::DenseMap; - using LocSeq = SourceLocationSequence; - /// The bitstream writer used to emit this precompiled header. llvm::BitstreamWriter &Stream; @@ -733,16 +731,14 @@ class ASTWriter : public ASTDeserializationListener, void AddFileID(FileID FID, RecordDataImpl &Record); /// Emit a source location. - void AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record, - LocSeq *Seq = nullptr); + void AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record); /// Return the raw encodings for source locations. SourceLocationEncoding::RawLocEncoding - getRawSourceLocationEncoding(SourceLocation Loc, LocSeq *Seq = nullptr); + getRawSourceLocationEncoding(SourceLocation Loc); /// Emit a source range. - void AddSourceRange(SourceRange Range, RecordDataImpl &Record, - LocSeq *Seq = nullptr); + void AddSourceRange(SourceRange Range, RecordDataImpl &Record); /// Emit a reference to an identifier. void AddIdentifierRef(const IdentifierInfo *II, RecordDataImpl &Record); diff --git a/clang/include/clang/Serialization/SourceLocationEncoding.h b/clang/include/clang/Serialization/SourceLocationEncoding.h index 33ca1728fa479..5b2485dbc719f 100644 --- a/clang/include/clang/Serialization/SourceLocationEncoding.h +++ b/clang/include/clang/Serialization/SourceLocationEncoding.h @@ -25,20 +25,17 @@ // * C: The macro bit. We rotate it to the lowest bit so that we can save some // space in case the index of the module file is 0. 
// -// Specially, if the index of the module file is 0, we allow to encode a -// sequence of locations we store only differences between successive elements. // //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H +#define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H + #include "clang/Basic/SourceLocation.h" #include "llvm/Support/MathExtras.h" #include -#ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H -#define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H - namespace clang { -class SourceLocationSequence; /// Serialized encoding of SourceLocations without context. /// Optimized to have small unsigned values (=> small after VBR encoding). @@ -54,119 +51,22 @@ class SourceLocationEncoding { static UIntTy decodeRaw(UIntTy Raw) { return (Raw >> 1) | (Raw << (UIntBits - 1)); } - friend SourceLocationSequence; public: using RawLocEncoding = uint64_t; static RawLocEncoding encode(SourceLocation Loc, UIntTy BaseOffset, - unsigned BaseModuleFileIndex, - SourceLocationSequence * = nullptr); - static std::pair - decode(RawLocEncoding, SourceLocationSequence * = nullptr); -}; - -/// Serialized encoding of a sequence of SourceLocations. -/// -/// Optimized to produce small values when locations with the sequence are -/// similar. Each element can be delta-encoded against the last nonzero element. -/// -/// Sequences should be started by creating a SourceLocationSequence::State, -/// and then passed around as SourceLocationSequence*. 
Example: -/// -/// // establishes a sequence -/// void EmitTopLevelThing() { -/// SourceLocationSequence::State Seq; -/// EmitContainedThing(Seq); -/// EmitRecursiveThing(Seq); -/// } -/// -/// // optionally part of a sequence -/// void EmitContainedThing(SourceLocationSequence *Seq = nullptr) { -/// Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq)); -/// } -/// -/// // establishes a sequence if there isn't one already -/// void EmitRecursiveThing(SourceLocationSequence *ParentSeq = nullptr) { -/// SourceLocationSequence::State Seq(ParentSeq); -/// Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq)); -/// EmitRecursiveThing(Seq); -/// } -/// -class SourceLocationSequence { - using UIntTy = SourceLocation::UIntTy; - using EncodedTy = uint64_t; - constexpr static auto UIntBits = SourceLocationEncoding::UIntBits; - static_assert(sizeof(EncodedTy) > sizeof(UIntTy), "Need one extra bit!"); - - // Prev stores the rotated last nonzero location. - UIntTy &Prev; - - // Zig-zag encoding turns small signed integers into small unsigned integers. - // 0 => 0, -1 => 1, 1 => 2, -2 => 3, ... - static UIntTy zigZag(UIntTy V) { - UIntTy Sign = (V & (1 << (UIntBits - 1))) ? UIntTy(-1) : UIntTy(0); - return Sign ^ (V << 1); - } - static UIntTy zagZig(UIntTy V) { return (V >> 1) ^ -(V & 1); } - - SourceLocationSequence(UIntTy &Prev) : Prev(Prev) {} - - EncodedTy encodeRaw(UIntTy Raw) { - if (Raw == 0) - return 0; - UIntTy Rotated = SourceLocationEncoding::encodeRaw(Raw); - if (Prev == 0) - return Prev = Rotated; - UIntTy Delta = Rotated - Prev; - Prev = Rotated; - // Exactly one 33 bit value is possible! (1 << 32). - // This is because we have two representations of zero: trivial & relative. 
- return 1 + EncodedTy{zigZag(Delta)}; - } - UIntTy decodeRaw(EncodedTy Encoded) { - if (Encoded == 0) - return 0; - if (Prev == 0) - return SourceLocationEncoding::decodeRaw(Prev = Encoded); - return SourceLocationEncoding::decodeRaw(Prev += zagZig(Encoded - 1)); - } - -public: - SourceLocation decode(EncodedTy Encoded) { - return SourceLocation::getFromRawEncoding(decodeRaw(Encoded)); - } - EncodedTy encode(SourceLocation Loc) { - return encodeRaw(Loc.getRawEncoding()); - } - - class State; -}; - -/// This object establishes a SourceLocationSequence. -class SourceLocationSequence::State { - UIntTy Prev = 0; - SourceLocationSequence Seq; - -public: - // If Parent is provided and non-null, then this root becomes part of that - // enclosing sequence instead of establishing a new one. - State(SourceLocationSequence *Parent = nullptr) - : Seq(Parent ? Parent->Prev : Prev) {} - - // Implicit conversion for uniform use of roots vs propagated sequences. - operator SourceLocationSequence *() { return &Seq; } + unsigned BaseModuleFileIndex); + static std::pair decode(RawLocEncoding); }; inline SourceLocationEncoding::RawLocEncoding SourceLocationEncoding::encode(SourceLocation Loc, UIntTy BaseOffset, - unsigned BaseModuleFileIndex, - SourceLocationSequence *Seq) { + unsigned BaseModuleFileIndex) { // If the source location is a local source location, we can try to optimize // the similar sequences to only record the differences. if (!BaseOffset) - return Seq ? Seq->encode(Loc) : encodeRaw(Loc.getRawEncoding()); - + return encodeRaw(Loc.getRawEncoding()); if (Loc.isInvalid()) return 0; @@ -183,13 +83,11 @@ SourceLocationEncoding::encode(SourceLocation Loc, UIntTy BaseOffset, return Encoded; } inline std::pair -SourceLocationEncoding::decode(RawLocEncoding Encoded, - SourceLocationSequence *Seq) { +SourceLocationEncoding::decode(RawLocEncoding Encoded) { unsigned ModuleFileIndex = Encoded >> 32; if (!ModuleFileIndex) - return {Seq ? 
Seq->decode(Encoded) - : SourceLocation::getFromRawEncoding(decodeRaw(Encoded)), + return {SourceLocation::getFromRawEncoding(decodeRaw(Encoded)), ModuleFileIndex}; Encoded &= llvm::maskTrailingOnes(32); diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h index f6a43bf5f493b..5dcf03f7a4648 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h @@ -554,6 +554,8 @@ class SimpleFunctionCall : public AnyFunctionCall { const FunctionDecl *getDecl() const override; + RuntimeDefinition getRuntimeDefinition() const override; + unsigned getNumArgs() const override { return getOriginExpr()->getNumArgs(); } const Expr *getArgExpr(unsigned Index) const override { diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 77145e2891a8a..05a5dc2d94256 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -89,6 +89,12 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector &Features, } static void defineXLCompatMacros(MacroBuilder &Builder) { + Builder.defineMacro("__builtin_national2packed", + "__builtin_ppc_national2packed"); + Builder.defineMacro("__builtin_packed2national", + "__builtin_ppc_packed2national"); + Builder.defineMacro("__builtin_packed2zoned", "__builtin_ppc_packed2zoned"); + Builder.defineMacro("__builtin_zoned2packed", "__builtin_ppc_zoned2packed"); Builder.defineMacro("__cdtbcd", "__builtin_ppc_cdtbcd"); Builder.defineMacro("__cbcdtd", "__builtin_ppc_cbcdtd"); Builder.defineMacro("__addg6s", "__builtin_ppc_addg6s"); diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index e38faba83b80c..ac62ea7c6aa16 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -366,6 +366,16 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return 
create(loc, resultComplexTy, real, imag); } + mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) { + auto operandTy = mlir::cast(operand.getType()); + return create(loc, operandTy.getElementType(), operand); + } + + mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) { + auto operandTy = mlir::cast(operand.getType()); + return create(loc, operandTy.getElementType(), operand); + } + /// Create a cir.ptr_stride operation to get access to an array element. /// \p idx is the index of the element to access, \p shouldDecay is true if /// the result should decay to a pointer to the element type. diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp index 51751483d34e9..da507d6f28335 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp @@ -25,7 +25,7 @@ cir::FuncOp CIRGenModule::codegenCXXStructor(GlobalDecl gd) { cir::FuncType funcType = getTypes().getFunctionType(fnInfo); cir::FuncOp fn = getAddrOfCXXStructor(gd, &fnInfo, /*FnType=*/nullptr, /*DontDefer=*/true, ForDefinition); - assert(!cir::MissingFeatures::opFuncLinkage()); + setFunctionLinkage(gd, fn); CIRGenFunction cgf{*this, builder}; curCGF = &cgf; { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 8d0db5cd0a1e5..7f8dcd96a6bff 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -603,6 +603,10 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitUnaryLNot(const UnaryOperator *e); + mlir::Value VisitUnaryReal(const UnaryOperator *e); + + mlir::Value VisitUnaryImag(const UnaryOperator *e); + mlir::Value VisitCXXThisExpr(CXXThisExpr *te) { return cgf.loadCXXThis(); } /// Emit a conversion from the specified type to the specified destination @@ -1891,6 +1895,48 @@ mlir::Value ScalarExprEmitter::VisitUnaryLNot(const UnaryOperator *e) { return maybePromoteBoolResult(boolVal, 
cgf.convertType(e->getType())); } +mlir::Value ScalarExprEmitter::VisitUnaryReal(const UnaryOperator *e) { + // TODO(cir): handle scalar promotion. + Expr *op = e->getSubExpr(); + if (op->getType()->isAnyComplexType()) { + // If it's an l-value, load through the appropriate subobject l-value. + // Note that we have to ask `e` because `op` might be an l-value that + // this won't work for, e.g. an Obj-C property. + if (e->isGLValue()) { + mlir::Location loc = cgf.getLoc(e->getExprLoc()); + mlir::Value complex = cgf.emitComplexExpr(op); + return cgf.builder.createComplexReal(loc, complex); + } + + // Otherwise, calculate and project. + cgf.cgm.errorNYI(e->getSourceRange(), + "VisitUnaryReal calculate and project"); + } + + return Visit(op); +} + +mlir::Value ScalarExprEmitter::VisitUnaryImag(const UnaryOperator *e) { + // TODO(cir): handle scalar promotion. + Expr *op = e->getSubExpr(); + if (op->getType()->isAnyComplexType()) { + // If it's an l-value, load through the appropriate subobject l-value. + // Note that we have to ask `e` because `op` might be an l-value that + // this won't work for, e.g. an Obj-C property. + if (e->isGLValue()) { + mlir::Location loc = cgf.getLoc(e->getExprLoc()); + mlir::Value complex = cgf.emitComplexExpr(op); + return cgf.builder.createComplexImag(loc, complex); + } + + // Otherwise, calculate and project. + cgf.cgm.errorNYI(e->getSourceRange(), + "VisitUnaryImag calculate and project"); + } + + return Visit(op); +} + /// Return the size or alignment of the type of argument of the sizeof /// expression as an integer. 
mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( @@ -1914,13 +1960,6 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( return builder.getConstant( loc, builder.getAttr( cgf.cgm.UInt64Ty, llvm::APSInt(llvm::APInt(64, 1), true))); - } else if (e->getKind() == UETT_VectorElements) { - cgf.getCIRGenModule().errorNYI(e->getSourceRange(), - "sizeof operator for VectorElements", - e->getStmtClassName()); - return builder.getConstant( - loc, builder.getAttr( - cgf.cgm.UInt64Ty, llvm::APSInt(llvm::APInt(64, 1), true))); } return builder.getConstant( diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 68ab81ed53af9..f24bee44f26a7 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -406,6 +406,16 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd, /*DontDefer=*/true, ForDefinition); } + // Already emitted. + if (!funcOp.isDeclaration()) + return; + + setFunctionLinkage(gd, funcOp); + setGVProperties(funcOp, funcDecl); + assert(!cir::MissingFeatures::opFuncMaybeHandleStaticInExternC()); + maybeSetTrivialComdat(*funcDecl, funcOp); + assert(!cir::MissingFeatures::setLLVMFunctionFEnvAttributes()); + CIRGenFunction cgf(*this, builder); curCGF = &cgf; { @@ -413,7 +423,17 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd, cgf.generateCode(gd, funcOp, funcType); } curCGF = nullptr; + + setNonAliasAttributes(gd, funcOp); assert(!cir::MissingFeatures::opFuncAttributesForDefinition()); + + if (const ConstructorAttr *ca = funcDecl->getAttr()) + errorNYI(funcDecl->getSourceRange(), "constructor attribute"); + if (const DestructorAttr *da = funcDecl->getAttr()) + errorNYI(funcDecl->getSourceRange(), "destructor attribute"); + + if (funcDecl->getAttr()) + errorNYI(funcDecl->getSourceRange(), "deferredAnnotations"); } mlir::Operation *CIRGenModule::getGlobalValue(StringRef name) { @@ -855,10 +875,12 @@ static bool 
shouldBeInCOMDAT(CIRGenModule &cgm, const Decl &d) { void CIRGenModule::maybeSetTrivialComdat(const Decl &d, mlir::Operation *op) { if (!shouldBeInCOMDAT(*this, d)) return; - if (auto globalOp = dyn_cast_or_null(op)) + if (auto globalOp = dyn_cast_or_null(op)) { globalOp.setComdat(true); - - assert(!cir::MissingFeatures::opFuncSetComdat()); + } else { + auto funcOp = cast(op); + funcOp.setComdat(true); + } } void CIRGenModule::updateCompletedType(const TagDecl *td) { @@ -1028,6 +1050,17 @@ CIRGenModule::getCIRLinkageVarDefinition(const VarDecl *vd, bool isConstant) { return getCIRLinkageForDeclarator(vd, linkage, isConstant); } +cir::GlobalLinkageKind CIRGenModule::getFunctionLinkage(GlobalDecl gd) { + const auto *fd = cast(gd.getDecl()); + + GVALinkage linkage = astContext.GetGVALinkageForFunction(fd); + + if (const auto *dtor = dyn_cast(fd)) + errorNYI(fd->getSourceRange(), "getFunctionLinkage: CXXDestructorDecl"); + + return getCIRLinkageForDeclarator(fd, linkage, /*IsConstantVariable=*/false); +} + static cir::GlobalOp generateStringLiteral(mlir::Location loc, mlir::TypedAttr c, cir::GlobalLinkageKind lt, CIRGenModule &cgm, @@ -1534,6 +1567,27 @@ void CIRGenModule::setGVPropertiesAux(mlir::Operation *op, assert(!cir::MissingFeatures::opGlobalPartition()); } +void CIRGenModule::setFunctionAttributes(GlobalDecl globalDecl, + cir::FuncOp func, + bool isIncompleteFunction, + bool isThunk) { + // NOTE(cir): Original CodeGen checks if this is an intrinsic. In CIR we + // represent them in dedicated ops. The correct attributes are ensured during + // translation to LLVM. Thus, we don't need to check for them here. + + assert(!cir::MissingFeatures::setFunctionAttributes()); + assert(!cir::MissingFeatures::setTargetAttributes()); + + // TODO(cir): This needs a lot of work to better match CodeGen. 
That + // ultimately ends up in setGlobalVisibility, which already has the linkage of + // the LLVM GV (corresponding to our FuncOp) computed, so it doesn't have to + // recompute it here. This is a minimal fix for now. + if (!isLocalLinkage(getFunctionLinkage(globalDecl))) { + const Decl *decl = globalDecl.getDecl(); + func.setGlobalVisibilityAttr(getGlobalVisibilityAttrFromDecl(decl)); + } +} + cir::FuncOp CIRGenModule::getOrCreateCIRFunction( StringRef mangledName, mlir::Type funcType, GlobalDecl gd, bool forVTable, bool dontDefer, bool isThunk, ForDefinition_t isForDefinition, @@ -1576,8 +1630,9 @@ cir::FuncOp CIRGenModule::getOrCreateCIRFunction( // If there are two attempts to define the same mangled name, issue an // error. auto fn = cast(entry); - assert((!isForDefinition || !fn || !fn.isDeclaration()) && - "Duplicate function definition"); + if (isForDefinition && fn && !fn.isDeclaration()) { + errorNYI(d->getSourceRange(), "Duplicate function definition"); + } if (fn && fn.getFunctionType() == funcType) { return fn; } @@ -1598,6 +1653,9 @@ cir::FuncOp CIRGenModule::getOrCreateCIRFunction( invalidLoc ? theModule->getLoc() : getLoc(funcDecl->getSourceRange()), mangledName, mlir::cast(funcType), funcDecl); + if (d) + setFunctionAttributes(gd, funcOp, /*isIncompleteFunction=*/false, isThunk); + // 'dontDefer' actually means don't move this to the deferredDeclsToEmit list. if (dontDefer) { // TODO(cir): This assertion will need an additional condition when we @@ -1668,6 +1726,20 @@ CIRGenModule::createCIRFunction(mlir::Location loc, StringRef name, func = builder.create(loc, name, funcType); + assert(!cir::MissingFeatures::opFuncAstDeclAttr()); + assert(!cir::MissingFeatures::opFuncNoProto()); + + assert(func.isDeclaration() && "expected empty body"); + + // A declaration gets private visibility by default, but external linkage + // as the default linkage. 
+ func.setLinkageAttr(cir::GlobalLinkageKindAttr::get( + &getMLIRContext(), cir::GlobalLinkageKind::ExternalLinkage)); + mlir::SymbolTable::setSymbolVisibility( + func, mlir::SymbolTable::Visibility::Private); + + assert(!cir::MissingFeatures::opFuncExtraAttrs()); + if (!cgf) theModule.push_back(func); } diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 71806e3c5de21..9f6a57c31d291 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -268,6 +268,10 @@ class CIRGenModule : public CIRGenTypeCache { void setGVProperties(mlir::Operation *op, const NamedDecl *d) const; void setGVPropertiesAux(mlir::Operation *op, const NamedDecl *d) const; + /// Set function attributes for a function declaration. + void setFunctionAttributes(GlobalDecl gd, cir::FuncOp f, + bool isIncompleteFunction, bool isThunk); + void emitGlobalDefinition(clang::GlobalDecl gd, mlir::Operation *op = nullptr); void emitGlobalFunctionDefinition(clang::GlobalDecl gd, mlir::Operation *op); @@ -340,10 +344,16 @@ class CIRGenModule : public CIRGenTypeCache { clang::VisibilityAttr::VisibilityType visibility); cir::VisibilityAttr getGlobalVisibilityAttrFromDecl(const Decl *decl); static mlir::SymbolTable::Visibility getMLIRVisibility(cir::GlobalOp op); - + cir::GlobalLinkageKind getFunctionLinkage(GlobalDecl gd); cir::GlobalLinkageKind getCIRLinkageForDeclarator(const DeclaratorDecl *dd, GVALinkage linkage, bool isConstantVariable); + void setFunctionLinkage(GlobalDecl gd, cir::FuncOp f) { + cir::GlobalLinkageKind l = getFunctionLinkage(gd); + f.setLinkageAttr(cir::GlobalLinkageKindAttr::get(&getMLIRContext(), l)); + mlir::SymbolTable::setSymbolVisibility(f, + getMLIRVisibilityFromCIRLinkage(l)); + } cir::GlobalLinkageKind getCIRLinkageVarDefinition(const VarDecl *vd, bool isConstant); diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 27f4ecb5ab85d..17157561357f9 100644 
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -115,9 +115,26 @@ template struct EnumTraits {}; static unsigned getMaxEnumVal() { return cir::getMaxEnumValFor##Ty(); } \ } +REGISTER_ENUM_TYPE(GlobalLinkageKind); +REGISTER_ENUM_TYPE(VisibilityKind); REGISTER_ENUM_TYPE(SideEffect); } // namespace +/// Parse an enum from the keyword, or default to the provided default value. +/// The return type is the enum type by default, unless overriden with the +/// second template argument. +template +static RetTy parseOptionalCIRKeyword(AsmParser &parser, EnumTy defaultValue) { + llvm::SmallVector names; + for (unsigned i = 0, e = EnumTraits::getMaxEnumVal(); i <= e; ++i) + names.push_back(EnumTraits::stringify(static_cast(i))); + + int index = parseOptionalKeywordAlternative(parser, names); + if (index == -1) + return static_cast(defaultValue); + return static_cast(index); +} + /// Parse an enum from the keyword, return failure if the keyword is not found. 
template static ParseResult parseCIRKeyword(AsmParser &parser, RetTy &result) { @@ -170,6 +187,26 @@ static bool omitRegionTerm(mlir::Region &r) { return singleNonEmptyBlock && yieldsNothing(); } +void printVisibilityAttr(OpAsmPrinter &printer, + cir::VisibilityAttr &visibility) { + switch (visibility.getValue()) { + case cir::VisibilityKind::Hidden: + printer << "hidden"; + break; + case cir::VisibilityKind::Protected: + printer << "protected"; + break; + case cir::VisibilityKind::Default: + break; + } +} + +void parseVisibilityAttr(OpAsmParser &parser, cir::VisibilityAttr &visibility) { + cir::VisibilityKind visibilityKind = + parseOptionalCIRKeyword(parser, cir::VisibilityKind::Default); + visibility = cir::VisibilityAttr::get(parser.getContext(), visibilityKind); +} + //===----------------------------------------------------------------------===// // CIR Custom Parsers/Printers //===----------------------------------------------------------------------===// @@ -1287,19 +1324,54 @@ cir::GetGlobalOp::verifySymbolUses(SymbolTableCollection &symbolTable) { // FuncOp //===----------------------------------------------------------------------===// +/// Returns the name used for the linkage attribute. This *must* correspond to +/// the name of the attribute in ODS. 
+static llvm::StringRef getLinkageAttrNameString() { return "linkage"; } + void cir::FuncOp::build(OpBuilder &builder, OperationState &result, - StringRef name, FuncType type) { + StringRef name, FuncType type, + GlobalLinkageKind linkage) { result.addRegion(); result.addAttribute(SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)); result.addAttribute(getFunctionTypeAttrName(result.name), TypeAttr::get(type)); + result.addAttribute( + getLinkageAttrNameString(), + GlobalLinkageKindAttr::get(builder.getContext(), linkage)); + result.addAttribute(getGlobalVisibilityAttrName(result.name), + cir::VisibilityAttr::get(builder.getContext())); } ParseResult cir::FuncOp::parse(OpAsmParser &parser, OperationState &state) { llvm::SMLoc loc = parser.getCurrentLocation(); mlir::Builder &builder = parser.getBuilder(); + mlir::StringAttr visNameAttr = getSymVisibilityAttrName(state.name); + mlir::StringAttr visibilityNameAttr = getGlobalVisibilityAttrName(state.name); + mlir::StringAttr dsoLocalNameAttr = getDsoLocalAttrName(state.name); + + // Default to external linkage if no keyword is provided. 
+ state.addAttribute(getLinkageAttrNameString(), + GlobalLinkageKindAttr::get( + parser.getContext(), + parseOptionalCIRKeyword( + parser, GlobalLinkageKind::ExternalLinkage))); + + ::llvm::StringRef visAttrStr; + if (parser.parseOptionalKeyword(&visAttrStr, {"private", "public", "nested"}) + .succeeded()) { + state.addAttribute(visNameAttr, + parser.getBuilder().getStringAttr(visAttrStr)); + } + + cir::VisibilityAttr cirVisibilityAttr; + parseVisibilityAttr(parser, cirVisibilityAttr); + state.addAttribute(visibilityNameAttr, cirVisibilityAttr); + + if (parser.parseOptionalKeyword(dsoLocalNameAttr).succeeded()) + state.addAttribute(dsoLocalNameAttr, parser.getBuilder().getUnitAttr()); + StringAttr nameAttr; if (parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(), state.attributes)) @@ -1346,10 +1418,14 @@ ParseResult cir::FuncOp::parse(OpAsmParser &parser, OperationState &state) { return success(); } +// This function corresponds to `llvm::GlobalValue::isDeclaration` and should +// have a similar implementation. We don't currently support aliases, ifuncs, +// or materializable functions, but those should be handled here as they are +// implemented. bool cir::FuncOp::isDeclaration() { - // TODO(CIR): This function will actually do something once external - // function declarations and aliases are upstreamed. 
- return false; + assert(!cir::MissingFeatures::opFuncGlobalAliases()); + assert(!cir::MissingFeatures::supportIFuncAttr()); + return getFunctionBody().empty(); } mlir::Region *cir::FuncOp::getCallableRegion() { @@ -1359,6 +1435,25 @@ mlir::Region *cir::FuncOp::getCallableRegion() { } void cir::FuncOp::print(OpAsmPrinter &p) { + if (getComdat()) + p << " comdat"; + + if (getLinkage() != GlobalLinkageKind::ExternalLinkage) + p << ' ' << stringifyGlobalLinkageKind(getLinkage()); + + mlir::SymbolTable::Visibility vis = getVisibility(); + if (vis != mlir::SymbolTable::Visibility::Public) + p << ' ' << vis; + + cir::VisibilityAttr cirVisibilityAttr = getGlobalVisibilityAttr(); + if (!cirVisibilityAttr.isDefault()) { + p << ' '; + printVisibilityAttr(p, cirVisibilityAttr); + } + + if (getDsoLocal()) + p << " dso_local"; + p << ' '; p.printSymbolName(getSymName()); cir::FuncType fnType = getFunctionType(); @@ -1914,6 +2009,42 @@ OpFoldResult cir::ComplexCreateOp::fold(FoldAdaptor adaptor) { return cir::ConstComplexAttr::get(realAttr, imagAttr); } +//===----------------------------------------------------------------------===// +// ComplexRealOp +//===----------------------------------------------------------------------===// + +LogicalResult cir::ComplexRealOp::verify() { + if (getType() != getOperand().getType().getElementType()) { + emitOpError() << ": result type does not match operand type"; + return failure(); + } + return success(); +} + +OpFoldResult cir::ComplexRealOp::fold(FoldAdaptor adaptor) { + auto complex = + mlir::cast_if_present(adaptor.getOperand()); + return complex ? 
complex.getReal() : nullptr; +} + +//===----------------------------------------------------------------------===// +// ComplexImagOp +//===----------------------------------------------------------------------===// + +LogicalResult cir::ComplexImagOp::verify() { + if (getType() != getOperand().getType().getElementType()) { + emitOpError() << ": result type does not match operand type"; + return failure(); + } + return success(); +} + +OpFoldResult cir::ComplexImagOp::fold(FoldAdaptor adaptor) { + auto complex = + mlir::cast_if_present(adaptor.getOperand()); + return complex ? complex.getImag() : nullptr; +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp index f07e234e5e84c..e505db50d3609 100644 --- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp @@ -141,8 +141,9 @@ void CIRCanonicalizePass::runOnOperation() { // Many operations are here to perform a manual `fold` in // applyOpPatternsGreedily. 
if (isa(op)) + ComplexCreateOp, ComplexImagOp, ComplexRealOp, VecCmpOp, + VecCreateOp, VecExtractOp, VecShuffleOp, VecShuffleDynamicOp, + VecTernaryOp>(op)) ops.push_back(op); }); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index a870e6c45b69d..1c13c88902d9a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -97,6 +97,18 @@ static mlir::Value createIntCast(mlir::OpBuilder &bld, mlir::Value src, return bld.create(loc, dstTy, src); } +static mlir::LLVM::Visibility +lowerCIRVisibilityToLLVMVisibility(cir::VisibilityKind visibilityKind) { + switch (visibilityKind) { + case cir::VisibilityKind::Default: + return ::mlir::LLVM::Visibility::Default; + case cir::VisibilityKind::Hidden: + return ::mlir::LLVM::Visibility::Hidden; + case cir::VisibilityKind::Protected: + return ::mlir::LLVM::Visibility::Protected; + } +} + /// Emits the value from memory as expected by its users. Should be called when /// the memory represetnation of a CIR type is not equal to its scalar /// representation. 
@@ -1014,9 +1026,12 @@ void CIRToLLVMFuncOpLowering::lowerFuncAttributes( SmallVectorImpl &result) const { assert(!cir::MissingFeatures::opFuncCallingConv()); for (mlir::NamedAttribute attr : func->getAttrs()) { + assert(!cir::MissingFeatures::opFuncCallingConv()); if (attr.getName() == mlir::SymbolTable::getSymbolAttrName() || attr.getName() == func.getFunctionTypeAttrName() || attr.getName() == getLinkageAttrNameString() || + attr.getName() == func.getGlobalVisibilityAttrName() || + attr.getName() == func.getDsoLocalAttrName() || (filterArgAndResAttrs && (attr.getName() == func.getArgAttrsAttrName() || attr.getName() == func.getResAttrsAttrName()))) @@ -1032,8 +1047,7 @@ mlir::LogicalResult CIRToLLVMFuncOpLowering::matchAndRewrite( mlir::ConversionPatternRewriter &rewriter) const { cir::FuncType fnType = op.getFunctionType(); - assert(!cir::MissingFeatures::opFuncDsoLocal()); - bool isDsoLocal = false; + bool isDsoLocal = op.getDsoLocal(); mlir::TypeConverter::SignatureConversion signatureConversion( fnType.getNumInputs()); @@ -1061,8 +1075,7 @@ mlir::LogicalResult CIRToLLVMFuncOpLowering::matchAndRewrite( mlir::isa(loc)) && "expected single location or unknown location here"); - assert(!cir::MissingFeatures::opFuncLinkage()); - mlir::LLVM::Linkage linkage = mlir::LLVM::Linkage::External; + mlir::LLVM::Linkage linkage = convertLinkage(op.getLinkage()); assert(!cir::MissingFeatures::opFuncCallingConv()); mlir::LLVM::CConv cconv = mlir::LLVM::CConv::C; SmallVector attributes; @@ -1072,7 +1085,11 @@ mlir::LogicalResult CIRToLLVMFuncOpLowering::matchAndRewrite( loc, op.getName(), llvmFnTy, linkage, isDsoLocal, cconv, mlir::SymbolRefAttr(), attributes); - assert(!cir::MissingFeatures::opFuncVisibility()); + assert(!cir::MissingFeatures::opFuncMultipleReturnVals()); + + fn.setVisibility_Attr(mlir::LLVM::VisibilityAttr::get( + getContext(), lowerCIRVisibilityToLLVMVisibility( + op.getGlobalVisibilityAttr().getValue()))); rewriter.inlineRegionBefore(op.getBody(), 
fn.getBody(), fn.end()); if (failed(rewriter.convertRegionTypes(&fn.getBody(), *typeConverter, @@ -1903,7 +1920,9 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMVecShuffleOpLowering, CIRToLLVMVecShuffleDynamicOpLowering, CIRToLLVMVecTernaryOpLowering, - CIRToLLVMComplexCreateOpLowering + CIRToLLVMComplexCreateOpLowering, + CIRToLLVMComplexRealOpLowering, + CIRToLLVMComplexImagOpLowering // clang-format on >(converter, patterns.getContext()); @@ -2207,6 +2226,24 @@ mlir::LogicalResult CIRToLLVMComplexCreateOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMComplexRealOpLowering::matchAndRewrite( + cir::ComplexRealOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resultLLVMTy = getTypeConverter()->convertType(op.getType()); + rewriter.replaceOpWithNewOp( + op, resultLLVMTy, adaptor.getOperand(), llvm::ArrayRef{0}); + return mlir::success(); +} + +mlir::LogicalResult CIRToLLVMComplexImagOpLowering::matchAndRewrite( + cir::ComplexImagOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resultLLVMTy = getTypeConverter()->convertType(op.getType()); + rewriter.replaceOpWithNewOp( + op, resultLLVMTy, adaptor.getOperand(), llvm::ArrayRef{1}); + return mlir::success(); +} + std::unique_ptr createConvertCIRToLLVMPass() { return std::make_unique(); } diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 52959d61355b0..8502cb1ae5d9f 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -443,6 +443,26 @@ class CIRToLLVMComplexCreateOpLowering mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMComplexRealOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::ComplexRealOp op, OpAdaptor, + 
mlir::ConversionPatternRewriter &) const override; +}; + +class CIRToLLVMComplexImagOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::ComplexImagOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + } // namespace direct } // namespace cir diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 2c011a9519860..2a8722221f24b 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2356,7 +2356,7 @@ EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1, llvm::Type *OpTy = Signed->getType(); llvm::Value *Zero = llvm::Constant::getNullValue(OpTy); Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg); - llvm::Type *ResTy = ResultPtr.getElementType(); + llvm::Type *ResTy = CGF.getTypes().ConvertType(ResultQTy); unsigned OpWidth = std::max(Op1Info.Width, Op2Info.Width); // Take the absolute value of the signed operand. 
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index ee5e3d68a5ffa..7ab0e2fdaa731 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -841,8 +841,8 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { auto *ISATy = DBuilder.createPointerType(ClassTy, Size); ObjTy = DBuilder.createStructType(TheCU, "objc_object", TheCU->getFile(), 0, - 0, 0, llvm::DINode::FlagZero, nullptr, - llvm::DINodeArray()); + (uint64_t)0, 0, llvm::DINode::FlagZero, + nullptr, llvm::DINodeArray()); DBuilder.replaceArrays( ObjTy, DBuilder.getOrCreateArray(&*DBuilder.createMemberType( diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index 2897ccdf88660..0b6e830e0d557 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -1138,7 +1138,9 @@ CodeGenModule::getVTableLinkage(const CXXRecordDecl *RD) { llvm::Function::InternalLinkage; case TSK_ExplicitInstantiationDeclaration: - llvm_unreachable("Should not have been asked to emit this"); + return IsExternalDefinition + ? 
llvm::GlobalVariable::AvailableExternallyLinkage + : llvm::GlobalVariable::ExternalLinkage; } } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 59f14b3e35fd0..6c32c98cec011 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -234,6 +234,9 @@ class ApplyAtomGroup { uint64_t OriginalAtom = 0; CGDebugInfo *DI = nullptr; + ApplyAtomGroup(const ApplyAtomGroup &) = delete; + void operator=(const ApplyAtomGroup &) = delete; + public: ApplyAtomGroup(CGDebugInfo *DI); ~ApplyAtomGroup(); diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 6738d4be6dd21..e30a8c6133055 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -6499,12 +6499,38 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } case clang::AArch64::BI_InterlockedAdd: - case clang::AArch64::BI_InterlockedAdd64: { + case clang::AArch64::BI_InterlockedAdd_acq: + case clang::AArch64::BI_InterlockedAdd_rel: + case clang::AArch64::BI_InterlockedAdd_nf: + case clang::AArch64::BI_InterlockedAdd64: + case clang::AArch64::BI_InterlockedAdd64_acq: + case clang::AArch64::BI_InterlockedAdd64_rel: + case clang::AArch64::BI_InterlockedAdd64_nf: { Address DestAddr = CheckAtomicAlignment(*this, E); Value *Val = EmitScalarExpr(E->getArg(1)); + llvm::AtomicOrdering Ordering; + switch (BuiltinID) { + case clang::AArch64::BI_InterlockedAdd: + case clang::AArch64::BI_InterlockedAdd64: + Ordering = llvm::AtomicOrdering::SequentiallyConsistent; + break; + case clang::AArch64::BI_InterlockedAdd_acq: + case clang::AArch64::BI_InterlockedAdd64_acq: + Ordering = llvm::AtomicOrdering::Acquire; + break; + case clang::AArch64::BI_InterlockedAdd_rel: + case clang::AArch64::BI_InterlockedAdd64_rel: + Ordering = llvm::AtomicOrdering::Release; + break; + case clang::AArch64::BI_InterlockedAdd_nf: + case clang::AArch64::BI_InterlockedAdd64_nf: + 
Ordering = llvm::AtomicOrdering::Monotonic; + break; + default: + llvm_unreachable("missing builtin ID in switch!"); + } AtomicRMWInst *RMWI = - Builder.CreateAtomicRMW(AtomicRMWInst::Add, DestAddr, Val, - llvm::AtomicOrdering::SequentiallyConsistent); + Builder.CreateAtomicRMW(AtomicRMWInst::Add, DestAddr, Val, Ordering); return Builder.CreateAdd(RMWI, Val); } } diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index afa23bffcd073..be9a5e60af358 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -75,6 +75,10 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { SyncScope Scope, llvm::AtomicOrdering Ordering, llvm::LLVMContext &Ctx) const override; + bool supportsLibCall() const override { + return getABIInfo().getTarget().getTriple().getVendor() != + llvm::Triple::AMD; + } }; inline StringRef mapClangSyncScopeToLLVM(SyncScope Scope) { diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 2d055ffa17a8f..b88f148b2f1ad 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1030,10 +1030,6 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, return; } - llvm::StringMap> DerivedArchs; - llvm::StringMap FoundNormalizedTriples; - std::multiset OpenMPTriples; - // If the user specified -fopenmp-targets= we create a toolchain for each // valid triple. Otherwise, if only --offload-arch= was specified we instead // attempt to derive the appropriate toolchains from the arguments. @@ -1044,82 +1040,77 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, << OpenMPTargets->getAsString(C.getInputArgs()); return; } + + // Make sure these show up in a deterministic order. 
+ std::multiset OpenMPTriples; for (StringRef T : OpenMPTargets->getValues()) OpenMPTriples.insert(T); + + llvm::StringMap FoundNormalizedTriples; + for (StringRef T : OpenMPTriples) { + llvm::Triple TT(ToolChain::getOpenMPTriple(T)); + std::string NormalizedName = TT.normalize(); + + // Make sure we don't have a duplicate triple. + auto [TripleIt, Inserted] = + FoundNormalizedTriples.try_emplace(NormalizedName, T); + if (!Inserted) { + Diag(clang::diag::warn_drv_omp_offload_target_duplicate) + << T << TripleIt->second; + continue; + } + + // If the specified target is invalid, emit a diagnostic. + if (TT.getArch() == llvm::Triple::UnknownArch) { + Diag(clang::diag::err_drv_invalid_omp_target) << T; + continue; + } + + auto &TC = getOffloadToolChain(C.getInputArgs(), Action::OFK_OpenMP, TT, + C.getDefaultToolChain().getTriple()); + C.addOffloadDeviceToolChain(&TC, Action::OFK_OpenMP); + } } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && ((!IsHIP && !IsCuda) || UseLLVMOffload)) { - const ToolChain *HostTC = C.getSingleOffloadToolChain(); - auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); - auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(), - HostTC->getTriple()); + llvm::Triple AMDTriple("amdgcn-amd-amdhsa"); + llvm::Triple NVPTXTriple("nvptx64-nvidia-cuda"); // Attempt to deduce the offloading triple from the set of architectures. // We can only correctly deduce NVPTX / AMDGPU triples currently. - // We need to temporarily create these toolchains so that we can access - // tools for inferring architectures. 
- llvm::DenseSet Archs; - for (const std::optional &TT : {NVPTXTriple, AMDTriple}) { - if (!TT) - continue; - - auto &TC = - getOffloadToolChain(C.getInputArgs(), Action::OFK_OpenMP, *TT, - C.getDefaultToolChain().getTriple()); - for (StringRef Arch : - getOffloadArchs(C, C.getArgs(), Action::OFK_OpenMP, &TC, true)) - Archs.insert(Arch); - } + for (const llvm::Triple &TT : {AMDTriple, NVPTXTriple}) { + auto &TC = getOffloadToolChain(C.getInputArgs(), Action::OFK_OpenMP, TT, + C.getDefaultToolChain().getTriple()); + + llvm::DenseSet Archs = + getOffloadArchs(C, C.getArgs(), Action::OFK_OpenMP, &TC, true); + llvm::DenseSet ArchsForTarget; + for (StringRef Arch : Archs) { + bool IsNVPTX = IsNVIDIAOffloadArch( + StringToOffloadArch(getProcessorFromTargetID(NVPTXTriple, Arch))); + bool IsAMDGPU = IsAMDOffloadArch( + StringToOffloadArch(getProcessorFromTargetID(AMDTriple, Arch))); + if (!IsNVPTX && !IsAMDGPU && !Arch.equals_insensitive("native")) { + Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) + << Arch; + return; + } - for (StringRef Arch : Archs) { - if (NVPTXTriple && IsNVIDIAOffloadArch(StringToOffloadArch( - getProcessorFromTargetID(*NVPTXTriple, Arch)))) { - DerivedArchs[NVPTXTriple->getTriple()].insert(Arch); - } else if (AMDTriple && - IsAMDOffloadArch(StringToOffloadArch( - getProcessorFromTargetID(*AMDTriple, Arch)))) { - DerivedArchs[AMDTriple->getTriple()].insert(Arch); - } else { - Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) << Arch; - return; + if (TT.isNVPTX() && IsNVPTX) + ArchsForTarget.insert(Arch); + else if (TT.isAMDGPU() && IsAMDGPU) + ArchsForTarget.insert(Arch); + } + if (!ArchsForTarget.empty()) { + C.addOffloadDeviceToolChain(&TC, Action::OFK_OpenMP); + KnownArchs[&TC] = ArchsForTarget; } } // If the set is empty then we failed to find a native architecture. 
- if (Archs.empty()) { + auto TCRange = C.getOffloadToolChains(Action::OFK_OpenMP); + if (TCRange.first == TCRange.second) Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) << "native"; - return; - } - - for (const auto &TripleAndArchs : DerivedArchs) - OpenMPTriples.insert(TripleAndArchs.first()); - } - - for (StringRef Val : OpenMPTriples) { - llvm::Triple TT(ToolChain::getOpenMPTriple(Val)); - std::string NormalizedName = TT.normalize(); - - // Make sure we don't have a duplicate triple. - auto [TripleIt, Inserted] = - FoundNormalizedTriples.try_emplace(NormalizedName, Val); - if (!Inserted) { - Diag(clang::diag::warn_drv_omp_offload_target_duplicate) - << Val << TripleIt->second; - continue; - } - - // If the specified target is invalid, emit a diagnostic. - if (TT.getArch() == llvm::Triple::UnknownArch) { - Diag(clang::diag::err_drv_invalid_omp_target) << Val; - continue; - } - - auto &TC = getOffloadToolChain(C.getInputArgs(), Action::OFK_OpenMP, TT, - C.getDefaultToolChain().getTriple()); - C.addOffloadDeviceToolChain(&TC, Action::OFK_OpenMP); - auto It = DerivedArchs.find(TT.getTriple()); - if (It != DerivedArchs.end()) - KnownArchs[&TC] = It->second; } } else if (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ)) { Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 4cc4f5f22db0d..06f68ec8b0fc1 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -14,6 +14,7 @@ #include "FormatTokenLexer.h" #include "FormatToken.h" +#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -1203,16 +1204,22 @@ static size_t countLeadingWhitespace(StringRef Text) { const unsigned char *const End = Text.bytes_end(); const unsigned char *Cur = Begin; while (Cur < End) { - if (isspace(Cur[0])) { + if 
(isWhitespace(Cur[0])) { ++Cur; - } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) { - // A '\' followed by a newline always escapes the newline, regardless - // of whether there is another '\' before it. + } else if (Cur[0] == '\\') { + // A backslash followed by optional horizontal whitespaces (P22232R2) and + // then a newline always escapes the newline. // The source has a null byte at the end. So the end of the entire input // isn't reached yet. Also the lexer doesn't break apart an escaped // newline. - assert(End - Cur >= 2); - Cur += 2; + const auto *Lookahead = Cur + 1; + while (isHorizontalWhitespace(*Lookahead)) + ++Lookahead; + // No line splice found; the backslash is a token. + if (!isVerticalWhitespace(*Lookahead)) + break; + // Splice found, consume it. + Cur = Lookahead + 1; } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' && (Cur[3] == '\n' || Cur[3] == '\r')) { // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the @@ -1295,13 +1302,18 @@ FormatToken *FormatTokenLexer::getNextToken() { case '/': // The text was entirely whitespace when this loop was entered. Thus // this has to be an escape sequence. 
- assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" || - Text.substr(i, 4) == "\?\?/\r" || + assert(Text.substr(i, 4) == "\?\?/\r" || Text.substr(i, 4) == "\?\?/\n" || (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" || Text.substr(i - 1, 4) == "\?\?/\n")) || (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" || - Text.substr(i - 2, 4) == "\?\?/\n"))); + Text.substr(i - 2, 4) == "\?\?/\n")) || + (Text[i] == '\\' && [&]() -> bool { + size_t j = i + 1; + while (j < Text.size() && isHorizontalWhitespace(Text[j])) + ++j; + return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r'); + }())); InEscape = true; break; default: diff --git a/clang/lib/Format/QualifierAlignmentFixer.cpp b/clang/lib/Format/QualifierAlignmentFixer.cpp index 8e55d339b2388..b0dda65adfba1 100644 --- a/clang/lib/Format/QualifierAlignmentFixer.cpp +++ b/clang/lib/Format/QualifierAlignmentFixer.cpp @@ -635,15 +635,26 @@ bool isConfiguredQualifierOrType(const FormatToken *Tok, // If a token is an identifier and it's upper case, it could // be a macro and hence we need to be able to ignore it. bool isPossibleMacro(const FormatToken *Tok) { - if (!Tok) - return false; + assert(Tok); if (Tok->isNot(tok::identifier)) return false; - if (Tok->TokenText.upper() == Tok->TokenText.str()) { - // T,K,U,V likely could be template arguments - return Tok->TokenText.size() != 1; - } - return false; + + const auto Text = Tok->TokenText; + assert(Text.size() > 0); + + // T,K,U,V likely could be template arguments + if (Text.size() == 1) + return false; + + // It's unlikely that qualified names are object-like macros. 
+ const auto *Prev = Tok->getPreviousNonComment(); + if (Prev && Prev->is(tok::coloncolon)) + return false; + const auto *Next = Tok->getNextNonComment(); + if (Next && Next->is(tok::coloncolon)) + return false; + + return Text == Text.upper(); } } // namespace format diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index d2f8b2703a9a3..6ad9a79998426 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1371,7 +1371,7 @@ class AnnotatingParser { Tok->setType(TT_InlineASMColon); } else if (Contexts.back().ColonIsDictLiteral || Style.isProto()) { Tok->setType(TT_DictLiteral); - if (Prev && Style.isTextProto()) + if (Style.isTextProto()) Prev->setType(TT_SelectorName); } else if (Contexts.back().ColonIsObjCMethodExpr || Line.startsWith(TT_ObjCMethodSpecifier)) { @@ -1408,7 +1408,6 @@ class AnnotatingParser { } } else if (Contexts.back().ContextType == Context::C11GenericSelection) { Tok->setType(TT_GenericSelectionColon); - assert(Prev); if (Prev->isPointerOrReference()) Prev->setFinalizedType(TT_PointerOrReference); } else if ((CurrentToken && CurrentToken->is(tok::numeric_constant)) || @@ -1419,8 +1418,6 @@ class AnnotatingParser { !Line.getFirstNonComment()->isOneOf(tok::kw_enum, tok::kw_case, tok::kw_default) && !Line.startsWith(tok::kw_typedef, tok::kw_enum)) { - if (!Prev) - break; if (Prev->isOneOf(tok::r_paren, tok::kw_noexcept) || Prev->ClosesRequiresClause) { Tok->setType(TT_CtorInitializerColon); diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h index 3dd1eb45817d4..39ccc97540b1e 100644 --- a/clang/lib/Headers/intrin.h +++ b/clang/lib/Headers/intrin.h @@ -370,8 +370,14 @@ static __inline__ void __DEFAULT_FN_ATTRS __nop(void) { \*----------------------------------------------------------------------------*/ #if defined(__aarch64__) || defined(__arm64ec__) unsigned __int64 __getReg(int); -long _InterlockedAdd(long volatile *Addend, long Value); -__int64 
_InterlockedAdd64(__int64 volatile *Addend, __int64 Value); +long _InterlockedAdd(long volatile *, long); +long _InterlockedAdd_acq(long volatile *, long); +long _InterlockedAdd_nf(long volatile *, long); +long _InterlockedAdd_rel(long volatile *, long); +__int64 _InterlockedAdd64(__int64 volatile *, __int64); +__int64 _InterlockedAdd64_acq(__int64 volatile *, __int64); +__int64 _InterlockedAdd64_nf(__int64 volatile *, __int64); +__int64 _InterlockedAdd64_rel(__int64 volatile *, __int64); __int64 _ReadStatusReg(int); void _WriteStatusReg(int, __int64); diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index c8974e5a3528c..b88624b22e622 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -183,9 +183,9 @@ static bool isReservedCXXAttributeName(Preprocessor &PP, IdentifierInfo *II) { AttributeCommonInfo::AttrArgsInfo AttrArgsInfo = AttributeCommonInfo::getCXX11AttrArgsInfo(II); if (AttrArgsInfo == AttributeCommonInfo::AttrArgsInfo::Required) - return PP.isNextPPTokenOneOf(); + return PP.isNextPPTokenOneOf(tok::l_paren); - return !PP.isNextPPTokenOneOf() || + return !PP.isNextPPTokenOneOf(tok::l_paren) || AttrArgsInfo == AttributeCommonInfo::AttrArgsInfo::Optional; } return false; diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 7fecbe9eee53c..500cf6f8400e0 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -813,14 +813,14 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) { if (!Identifier.isExpandDisabled() && MI->isEnabled()) { // C99 6.10.3p10: If the preprocessing token immediately after the // macro name isn't a '(', this macro should not be expanded. 
- if (!MI->isFunctionLike() || isNextPPTokenOneOf()) + if (!MI->isFunctionLike() || isNextPPTokenOneOf(tok::l_paren)) return HandleMacroExpandedIdentifier(Identifier, MD); } else { // C99 6.10.3.4p2 says that a disabled macro may never again be // expanded, even if it's in a context where it could be expanded in the // future. Identifier.setFlag(Token::DisableExpand); - if (MI->isObjectLike() || isNextPPTokenOneOf()) + if (MI->isObjectLike() || isNextPPTokenOneOf(tok::l_paren)) Diag(Identifier, diag::pp_disabled_macro_expansion); } } diff --git a/clang/lib/Sema/SemaPPC.cpp b/clang/lib/Sema/SemaPPC.cpp index 9b4d82745f881..d5c83aedb3008 100644 --- a/clang/lib/Sema/SemaPPC.cpp +++ b/clang/lib/Sema/SemaPPC.cpp @@ -106,6 +106,10 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI, switch (BuiltinID) { default: return false; + case PPC::BI__builtin_ppc_national2packed: + case PPC::BI__builtin_ppc_packed2zoned: + case PPC::BI__builtin_ppc_zoned2packed: + return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 1); case PPC::BI__builtin_altivec_crypto_vshasigmaw: case PPC::BI__builtin_altivec_crypto_vshasigmad: return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 1) || diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index 4dbb2450857e0..cb3d9b77ee4dd 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -121,7 +121,7 @@ static bool hasSuitableConstructorForRelocation(Sema &SemaRef, CXXMethodDecl *Decl = LookupSpecialMemberFromXValue(SemaRef, D, /*Assign=*/false); - return Decl && Decl->isUserProvided() == AllowUserDefined && + return Decl && (AllowUserDefined || !Decl->isUserProvided()) && !Decl->isDeleted(); } @@ -137,7 +137,7 @@ static bool hasSuitableMoveAssignmentOperatorForRelocation( if (!Decl) return false; - return Decl && Decl->isUserProvided() == AllowUserDefined && + return Decl && (AllowUserDefined || !Decl->isUserProvided()) && !Decl->isDeleted(); } @@ -1725,14 +1725,15 @@ 
static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, // Build expressions that emulate the effect of declval() and // declval(). - if (LhsT->isObjectType() || LhsT->isFunctionType()) - LhsT = Self.Context.getRValueReferenceType(LhsT); - if (RhsT->isObjectType() || RhsT->isFunctionType()) - RhsT = Self.Context.getRValueReferenceType(RhsT); - OpaqueValueExpr Lhs(KeyLoc, LhsT.getNonLValueExprType(Self.Context), - Expr::getValueKindForType(LhsT)); - OpaqueValueExpr Rhs(KeyLoc, RhsT.getNonLValueExprType(Self.Context), - Expr::getValueKindForType(RhsT)); + auto createDeclValExpr = [&](QualType Ty) -> OpaqueValueExpr { + if (Ty->isObjectType() || Ty->isFunctionType()) + Ty = Self.Context.getRValueReferenceType(Ty); + return {KeyLoc, Ty.getNonLValueExprType(Self.Context), + Expr::getValueKindForType(Ty)}; + }; + + auto Lhs = createDeclValExpr(LhsT); + auto Rhs = createDeclValExpr(RhsT); // Attempt the assignment in an unevaluated context within a SFINAE // trap at translation unit scope. 
@@ -1956,6 +1957,9 @@ static std::optional StdNameToTypeTrait(StringRef Name) { TypeTrait::UTT_IsCppTriviallyRelocatable) .Case("is_replaceable", TypeTrait::UTT_IsReplaceable) .Case("is_trivially_copyable", TypeTrait::UTT_IsTriviallyCopyable) + .Case("is_assignable", TypeTrait::BTT_IsAssignable) + .Case("is_empty", TypeTrait::UTT_IsEmpty) + .Case("is_standard_layout", TypeTrait::UTT_IsStandardLayout) .Default(std::nullopt); } @@ -2285,6 +2289,244 @@ static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef, SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D; } +static void DiagnoseNonAssignableReason(Sema &SemaRef, SourceLocation Loc, + QualType T, QualType U) { + const CXXRecordDecl *D = T->getAsCXXRecordDecl(); + + auto createDeclValExpr = [&](QualType Ty) -> OpaqueValueExpr { + if (Ty->isObjectType() || Ty->isFunctionType()) + Ty = SemaRef.Context.getRValueReferenceType(Ty); + return {Loc, Ty.getNonLValueExprType(SemaRef.Context), + Expr::getValueKindForType(Ty)}; + }; + + auto LHS = createDeclValExpr(T); + auto RHS = createDeclValExpr(U); + + EnterExpressionEvaluationContext Unevaluated( + SemaRef, Sema::ExpressionEvaluationContext::Unevaluated); + Sema::ContextRAII TUContext(SemaRef, + SemaRef.Context.getTranslationUnitDecl()); + SemaRef.BuildBinOp(/*S=*/nullptr, Loc, BO_Assign, &LHS, &RHS); + + if (!D || D->isInvalidDecl()) + return; + + SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D; +} + +static void DiagnoseIsEmptyReason(Sema &S, SourceLocation Loc, + const CXXRecordDecl *D) { + // Non-static data members (ignore zero-width bit‐fields). 
+ for (const auto *Field : D->fields()) { + if (Field->isZeroLengthBitField()) + continue; + if (Field->isBitField()) { + S.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NonZeroLengthField << Field + << Field->getSourceRange(); + continue; + } + S.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NonEmptyMember << Field + << Field->getType() << Field->getSourceRange(); + } + + // Virtual functions. + for (const auto *M : D->methods()) { + if (M->isVirtual()) { + S.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::VirtualFunction << M + << M->getSourceRange(); + break; + } + } + + // Virtual bases and non-empty bases. + for (const auto &B : D->bases()) { + const auto *BR = B.getType()->getAsCXXRecordDecl(); + if (!BR || BR->isInvalidDecl()) + continue; + if (B.isVirtual()) { + S.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::VBase << B.getType() + << B.getSourceRange(); + } + if (!BR->isEmpty()) { + S.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NonEmptyBase << B.getType() + << B.getSourceRange(); + } + } +} + +static void DiagnoseIsEmptyReason(Sema &S, SourceLocation Loc, QualType T) { + // Emit primary "not empty" diagnostic. + S.Diag(Loc, diag::note_unsatisfied_trait) << T << diag::TraitName::Empty; + + // While diagnosing is_empty, we want to look at the actual type, not a + // reference or an array of it. So we need to massage the QualType param to + // strip refs and arrays. 
+ if (T->isReferenceType()) + S.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::Ref; + T = T.getNonReferenceType(); + + if (auto *AT = S.Context.getAsArrayType(T)) + T = AT->getElementType(); + + if (auto *D = T->getAsCXXRecordDecl()) { + if (D->hasDefinition()) { + DiagnoseIsEmptyReason(S, Loc, D); + S.Diag(D->getLocation(), diag::note_defined_here) << D; + } + } +} + +static bool hasMultipleDataBaseClassesWithFields(const CXXRecordDecl *D) { + int NumBasesWithFields = 0; + for (const CXXBaseSpecifier &Base : D->bases()) { + const CXXRecordDecl *BaseRD = Base.getType()->getAsCXXRecordDecl(); + if (!BaseRD || BaseRD->isInvalidDecl()) + continue; + + for (const FieldDecl *Field : BaseRD->fields()) { + if (!Field->isUnnamedBitField()) { + if (++NumBasesWithFields > 1) + return true; // found more than one base class with fields + break; // no need to check further fields in this base class + } + } + } + return false; +} + +static void DiagnoseNonStandardLayoutReason(Sema &SemaRef, SourceLocation Loc, + const CXXRecordDecl *D) { + for (const CXXBaseSpecifier &B : D->bases()) { + assert(B.getType()->getAsCXXRecordDecl() && "invalid base?"); + if (B.isVirtual()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::VBase << B.getType() + << B.getSourceRange(); + } + if (!B.getType()->isStandardLayoutType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NonStandardLayoutBase << B.getType() + << B.getSourceRange(); + } + } + // Check for mixed access specifiers in fields. 
+ const FieldDecl *FirstField = nullptr; + AccessSpecifier FirstAccess = AS_none; + + for (const FieldDecl *Field : D->fields()) { + if (Field->isUnnamedBitField()) + continue; + + // Record the first field we see + if (!FirstField) { + FirstField = Field; + FirstAccess = Field->getAccess(); + continue; + } + + // Check if the field has a different access specifier than the first one. + if (Field->getAccess() != FirstAccess) { + // Emit a diagnostic about mixed access specifiers. + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::MixedAccess; + + SemaRef.Diag(FirstField->getLocation(), diag::note_defined_here) + << FirstField; + + SemaRef.Diag(Field->getLocation(), diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::MixedAccessField << Field + << FirstField; + + // No need to check further fields, as we already found mixed access. + break; + } + } + if (hasMultipleDataBaseClassesWithFields(D)) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::MultipleDataBase; + } + if (D->isPolymorphic()) { + // Find the best location to point “defined here” at. + const CXXMethodDecl *VirtualMD = nullptr; + // First, look for a virtual method. + for (const auto *M : D->methods()) { + if (M->isVirtual()) { + VirtualMD = M; + break; + } + } + if (VirtualMD) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::VirtualFunction << VirtualMD; + SemaRef.Diag(VirtualMD->getLocation(), diag::note_defined_here) + << VirtualMD; + } else { + // If no virtual method, point to the record declaration itself. 
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::VirtualFunction << D; + SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D; + } + } + for (const FieldDecl *Field : D->fields()) { + if (!Field->getType()->isStandardLayoutType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NonStandardLayoutMember << Field + << Field->getType() << Field->getSourceRange(); + } + } + // Find any indirect base classes that have fields. + if (D->hasDirectFields()) { + const CXXRecordDecl *Indirect = nullptr; + D->forallBases([&](const CXXRecordDecl *BaseDef) { + if (BaseDef->hasDirectFields()) { + Indirect = BaseDef; + return false; // stop traversal + } + return true; // continue to the next base + }); + if (Indirect) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::IndirectBaseWithFields << Indirect + << Indirect->getSourceRange(); + } + } +} + +static void DiagnoseNonStandardLayoutReason(Sema &SemaRef, SourceLocation Loc, + QualType T) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait) + << T << diag::TraitName::StandardLayout; + + // Check type-level exclusion first. 
+ if (T->isVariablyModifiedType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::VLA; + return; + } + + if (T->isReferenceType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::Ref; + return; + } + T = T.getNonReferenceType(); + const CXXRecordDecl *D = T->getAsCXXRecordDecl(); + if (!D || D->isInvalidDecl()) + return; + + if (D->hasDefinition()) + DiagnoseNonStandardLayoutReason(SemaRef, Loc, D); + + SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D; +} + void Sema::DiagnoseTypeTraitDetails(const Expr *E) { E = E->IgnoreParenImpCasts(); if (E->containsErrors()) @@ -2305,6 +2547,15 @@ void Sema::DiagnoseTypeTraitDetails(const Expr *E) { case UTT_IsTriviallyCopyable: DiagnoseNonTriviallyCopyableReason(*this, E->getBeginLoc(), Args[0]); break; + case BTT_IsAssignable: + DiagnoseNonAssignableReason(*this, E->getBeginLoc(), Args[0], Args[1]); + break; + case UTT_IsEmpty: + DiagnoseIsEmptyReason(*this, E->getBeginLoc(), Args[0]); + break; + case UTT_IsStandardLayout: + DiagnoseNonStandardLayoutReason(*this, E->getBeginLoc(), Args[0]); + break; default: break; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b696cb2efee3d..523165c6cab64 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1914,10 +1914,9 @@ bool ASTReader::ReadSLocEntry(int ID) { } case SM_SLOC_EXPANSION_ENTRY: { - LocSeq::State Seq; - SourceLocation SpellingLoc = ReadSourceLocation(*F, Record[1], Seq); - SourceLocation ExpansionBegin = ReadSourceLocation(*F, Record[2], Seq); - SourceLocation ExpansionEnd = ReadSourceLocation(*F, Record[3], Seq); + SourceLocation SpellingLoc = ReadSourceLocation(*F, Record[1]); + SourceLocation ExpansionBegin = ReadSourceLocation(*F, Record[2]); + SourceLocation ExpansionEnd = ReadSourceLocation(*F, Record[3]); SourceMgr.createExpansionLoc(SpellingLoc, ExpansionBegin, 
ExpansionEnd, Record[5], Record[4], ID, BaseOffset + Record[0]); @@ -7072,13 +7071,10 @@ QualType ASTReader::readTypeRecord(TypeID ID) { namespace clang { class TypeLocReader : public TypeLocVisitor<TypeLocReader> { - using LocSeq = SourceLocationSequence; - ASTRecordReader &Reader; - LocSeq *Seq; - SourceLocation readSourceLocation() { return Reader.readSourceLocation(Seq); } - SourceRange readSourceRange() { return Reader.readSourceRange(Seq); } + SourceLocation readSourceLocation() { return Reader.readSourceLocation(); } + SourceRange readSourceRange() { return Reader.readSourceRange(); } TypeSourceInfo *GetTypeSourceInfo() { return Reader.readTypeSourceInfo(); @@ -7093,8 +7089,7 @@ class TypeLocReader : public TypeLocVisitor<TypeLocReader> { } public: - TypeLocReader(ASTRecordReader &Reader, LocSeq *Seq) - : Reader(Reader), Seq(Seq) {} + TypeLocReader(ASTRecordReader &Reader) : Reader(Reader) {} // We want compile-time assurance that we've enumerated all of // these, so unfortunately we have to declare them first, then @@ -7458,9 +7453,8 @@ void TypeLocReader::VisitDependentBitIntTypeLoc( TL.setNameLoc(readSourceLocation()); } -void ASTRecordReader::readTypeLoc(TypeLoc TL, LocSeq *ParentSeq) { - LocSeq::State Seq(ParentSeq); - TypeLocReader TLR(*this, Seq); +void ASTRecordReader::readTypeLoc(TypeLoc TL) { + TypeLocReader TLR(*this); for (; !TL.isNull(); TL = TL.getNextTypeLoc()) TLR.Visit(TL); } @@ -8381,6 +8375,15 @@ bool ASTReader::LoadExternalSpecializationsImpl( if (It == SpecLookups.end()) return false; + llvm::TimeTraceScope TimeScope("Load External Specializations for ", [&] { + std::string Name; + llvm::raw_string_ostream OS(Name); + auto *ND = cast<NamedDecl>(D); + ND->getNameForDiagnostic(OS, ND->getASTContext().getPrintingPolicy(), + /*Qualified=*/true); + return Name; + }); + Deserializing LookupResults(this); auto HashValue = StableHashForTemplateArguments(TemplateArgs); @@ -10016,9 +10019,9 @@ ASTRecordReader::readNestedNameSpecifierLoc() { } SourceRange
ASTReader::ReadSourceRange(ModuleFile &F, const RecordData &Record, - unsigned &Idx, LocSeq *Seq) { - SourceLocation beg = ReadSourceLocation(F, Record, Idx, Seq); - SourceLocation end = ReadSourceLocation(F, Record, Idx, Seq); + unsigned &Idx) { + SourceLocation beg = ReadSourceLocation(F, Record, Idx); + SourceLocation end = ReadSourceLocation(F, Record, Idx); return SourceRange(beg, end); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 4cca214f8e308..04cbd1ca552b7 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -330,19 +330,13 @@ class ASTTypeWriter { }; class TypeLocWriter : public TypeLocVisitor<TypeLocWriter> { - using LocSeq = SourceLocationSequence; - ASTRecordWriter &Record; - LocSeq *Seq; - void addSourceLocation(SourceLocation Loc) { - Record.AddSourceLocation(Loc, Seq); - } - void addSourceRange(SourceRange Range) { Record.AddSourceRange(Range, Seq); } + void addSourceLocation(SourceLocation Loc) { Record.AddSourceLocation(Loc); } + void addSourceRange(SourceRange Range) { Record.AddSourceRange(Range); } public: - TypeLocWriter(ASTRecordWriter &Record, LocSeq *Seq) - : Record(Record), Seq(Seq) {} + TypeLocWriter(ASTRecordWriter &Record) : Record(Record) {} #define ABSTRACT_TYPELOC(CLASS, PARENT) #define TYPELOC(CLASS, PARENT) \ @@ -2449,13 +2443,12 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr) { SLocEntryOffsets.push_back(Offset); // Starting offset of this entry within this module, so skip the dummy. Record.push_back(getAdjustedOffset(SLoc->getOffset()) - 2); - LocSeq::State Seq; - AddSourceLocation(Expansion.getSpellingLoc(), Record, Seq); - AddSourceLocation(Expansion.getExpansionLocStart(), Record, Seq); + AddSourceLocation(Expansion.getSpellingLoc(), Record); + AddSourceLocation(Expansion.getExpansionLocStart(), Record); AddSourceLocation(Expansion.isMacroArgExpansion() ?
SourceLocation() : Expansion.getExpansionLocEnd(), - Record, Seq); + Record); Record.push_back(Expansion.isExpansionTokenRange()); // Compute the token length for this macro expansion. @@ -6653,7 +6646,7 @@ void ASTWriter::AddFileID(FileID FID, RecordDataImpl &Record) { } SourceLocationEncoding::RawLocEncoding -ASTWriter::getRawSourceLocationEncoding(SourceLocation Loc, LocSeq *Seq) { +ASTWriter::getRawSourceLocationEncoding(SourceLocation Loc) { unsigned BaseOffset = 0; unsigned ModuleFileIndex = 0; @@ -6672,19 +6665,17 @@ ASTWriter::getRawSourceLocationEncoding(SourceLocation Loc, LocSeq *Seq) { assert(&getChain()->getModuleManager()[F->Index] == F); } - return SourceLocationEncoding::encode(Loc, BaseOffset, ModuleFileIndex, Seq); + return SourceLocationEncoding::encode(Loc, BaseOffset, ModuleFileIndex); } -void ASTWriter::AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record, - SourceLocationSequence *Seq) { +void ASTWriter::AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record) { Loc = getAdjustedLocation(Loc); - Record.push_back(getRawSourceLocationEncoding(Loc, Seq)); + Record.push_back(getRawSourceLocationEncoding(Loc)); } -void ASTWriter::AddSourceRange(SourceRange Range, RecordDataImpl &Record, - SourceLocationSequence *Seq) { - AddSourceLocation(Range.getBegin(), Record, Seq); - AddSourceLocation(Range.getEnd(), Record, Seq); +void ASTWriter::AddSourceRange(SourceRange Range, RecordDataImpl &Record) { + AddSourceLocation(Range.getBegin(), Record); + AddSourceLocation(Range.getEnd(), Record); } void ASTRecordWriter::AddAPFloat(const llvm::APFloat &Value) { @@ -6804,9 +6795,8 @@ void ASTRecordWriter::AddTypeSourceInfo(TypeSourceInfo *TInfo) { AddTypeLoc(TInfo->getTypeLoc()); } -void ASTRecordWriter::AddTypeLoc(TypeLoc TL, LocSeq *OuterSeq) { - LocSeq::State Seq(OuterSeq); - TypeLocWriter TLW(*this, Seq); +void ASTRecordWriter::AddTypeLoc(TypeLoc TL) { + TypeLocWriter TLW(*this); for (; !TL.isNull(); TL = TL.getNextTypeLoc()) TLW.Visit(TL); } 
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index a6e320c7f3eb0..87536be8c8d98 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -971,7 +971,7 @@ void ASTStmtWriter::VisitCallExpr(CallExpr *E) { Record.push_back(E->getFPFeatures().getAsOpaqueInt()); if (!E->hasStoredFPFeatures() && !static_cast<bool>(E->getADLCallKind()) && - E->getStmtClass() == Stmt::CallExprClass) + !E->usesMemberSyntax() && E->getStmtClass() == Stmt::CallExprClass) AbbrevToUse = Writer.getCallExprAbbrev(); Code = serialization::EXPR_CALL; diff --git a/clang/lib/Serialization/TemplateArgumentHasher.cpp b/clang/lib/Serialization/TemplateArgumentHasher.cpp index aa61496d4aa0c..c56138e8893c1 100644 --- a/clang/lib/Serialization/TemplateArgumentHasher.cpp +++ b/clang/lib/Serialization/TemplateArgumentHasher.cpp @@ -15,6 +15,7 @@ #include "clang/AST/TypeVisitor.h" #include "clang/Basic/IdentifierTable.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/Support/TimeProfiler.h" using namespace clang; @@ -405,6 +406,7 @@ void TemplateArgumentHasher::AddType(const Type *T) { unsigned clang::serialization::StableHashForTemplateArguments( llvm::ArrayRef<TemplateArgument> Args) { + llvm::TimeTraceScope TimeScope("Stable Hash for Template Arguments"); TemplateArgumentHasher Hasher; Hasher.AddInteger(Args.size()); for (TemplateArgument Arg : Args) diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp index f78b1b84f9df6..34fcb9b64d555 100644 --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp @@ -688,6 +688,18 @@ const FunctionDecl *SimpleFunctionCall::getDecl() const { return getSVal(getOriginExpr()->getCallee()).getAsFunctionDecl(); } +RuntimeDefinition SimpleFunctionCall::getRuntimeDefinition() const { + // Clang converts lambdas to function pointers using an implicit conversion + // operator, which returns the lambda's
'__invoke' method. However, Sema + leaves the body of '__invoke' empty (it is generated later in CodeGen), so + we need to skip '__invoke' and access the lambda's operator() directly. + if (const auto *CMD = dyn_cast_if_present<CXXMethodDecl>(getDecl()); + CMD && CMD->isLambdaStaticInvoker()) + return RuntimeDefinition{CMD->getParent()->getLambdaCallOperator()}; + + return AnyFunctionCall::getRuntimeDefinition(); +} + const FunctionDecl *CXXInstanceCall::getDecl() const { const auto *CE = cast_or_null<CallExpr>(getOriginExpr()); if (!CE) diff --git a/clang/test/Analysis/lambda-convert-to-func-ptr.cpp b/clang/test/Analysis/lambda-convert-to-func-ptr.cpp new file mode 100644 index 0000000000000..c2ad7cd2de34a --- /dev/null +++ b/clang/test/Analysis/lambda-convert-to-func-ptr.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,debug.ExprInspection -analyzer-config inline-lambdas=true -verify %s + +void clang_analyzer_eval(bool); + +void basic() { + int (*ret_zero)() = []() { return 0; }; + clang_analyzer_eval(ret_zero() == 0); // expected-warning{{TRUE}} +} + +void withParam() { + int (*add_ten)(int) = [](int b) { return b + 10; }; + clang_analyzer_eval(add_ten(1) == 11); // expected-warning{{TRUE}} +} + +int callBack(int (*fp)(int), int x) { + return fp(x); +} + +void passWithFunc() { + clang_analyzer_eval(callBack([](int x) { return x; }, 5) == 5); // expected-warning{{TRUE}} +} diff --git a/clang/test/CIR/CodeGen/align-load.c b/clang/test/CIR/CodeGen/align-load.c index 06553a307f93a..17171d3607545 100644 --- a/clang/test/CIR/CodeGen/align-load.c +++ b/clang/test/CIR/CodeGen/align-load.c @@ -21,7 +21,7 @@ void accessStruct(struct S u) { u.d; } -// CIR: cir.func @accessStruct +// CIR: cir.func{{.*}} @accessStruct // CIR: cir.load align(8) // CIR: cir.load align(2) // CIR: cir.load align(4) @@ -58,7 +58,7 @@ void accessUnion(union U u) { u.d; } -// CIR: cir.func @accessUnion +// CIR: cir.func{{.*}} @accessUnion // CIR: cir.load align(8) // CIR: cir.load
align(8) // CIR: cir.load align(8) @@ -86,7 +86,7 @@ int loadAligned(myint *p) { return *p; } -// CIR: cir.func @loadAligned +// CIR: cir.func{{.*}} @loadAligned // CIR: cir.load align(1) // LLVM: @loadAligned diff --git a/clang/test/CIR/CodeGen/align-store.c b/clang/test/CIR/CodeGen/align-store.c index 9ce26fa020eeb..88686b94d8adf 100644 --- a/clang/test/CIR/CodeGen/align-store.c +++ b/clang/test/CIR/CodeGen/align-store.c @@ -12,7 +12,7 @@ void test1(myint *p) { *p = 0; } -// CIR: cir.func @test1 +// CIR: cir.func{{.*}} @test1 // CIR: cir.store align(1) // LLVM: @test1 diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp index 26e172a006451..141b67e0e63c7 100644 --- a/clang/test/CIR/CodeGen/array.cpp +++ b/clang/test/CIR/CodeGen/array.cpp @@ -101,7 +101,7 @@ void func() { // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ELE_PTR]] : !cir.ptr, !s32i // CIR" cir.store %[[TMP]], %[[INIT_2]] : !s32i, !cir.ptr -// LLVM: define void @_Z4funcv() +// LLVM: define{{.*}} void @_Z4funcv() // LLVM-NEXT: %[[ARR:.*]] = alloca [10 x i32], i64 1, align 16 // LLVM-NEXT: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[INIT_2:.*]] = alloca i32, i64 1, align 4 @@ -143,7 +143,7 @@ void func2() { // CIR: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[LOAD_1]] : !cir.ptr, %[[OFFSET_1]] : !s64i), !cir.ptr // CIR: cir.store{{.*}} %[[ELE_1_PTR]], %[[ELE_ALLOCA]] : !cir.ptr, !cir.ptr> -// LLVM: define void @_Z5func2v() +// LLVM: define{{.*}} void @_Z5func2v() // LLVM: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4 // LLVM: %[[TMP:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0 @@ -183,7 +183,7 @@ void func3() { // CIR: %[[ELE_TMP:.*]] = cir.load{{.*}} %[[ELE_PTR]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[ELE_TMP]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define void @_Z5func3v() +// LLVM: define{{.*}} void @_Z5func3v() // LLVM: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4 // LLVM: %[[IDX:.*]] = alloca i32, i64 1, 
align 4 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 @@ -235,7 +235,7 @@ void func4() { // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ELE_0]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define void @_Z5func4v() +// LLVM: define{{.*}} void @_Z5func4v() // LLVM: %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0 @@ -279,7 +279,7 @@ void func5() { // CIR: %10 = cir.ptr_stride(%7 : !cir.ptr>, %[[OFFSET_1]] : !s64i), !cir.ptr> // CIR: cir.store{{.*}} %10, %[[ARR_PTR]] : !cir.ptr>, !cir.ptr>> -// LLVM: define void @_Z5func5v() +// LLVM: define{{.*}} void @_Z5func5v() // LLVM: %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4 // LLVM: %[[TMP:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0 @@ -312,7 +312,7 @@ void func6() { // CIR: %[[V1:.*]] = cir.const #cir.int<5> : !s32i // CIR: cir.store{{.*}} %[[V1]], %[[ELE_PTR]] : !s32i, !cir.ptr -// LLVM: define void @_Z5func6v() +// LLVM: define{{.*}} void @_Z5func6v() // LLVM: %[[VAR:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4 // LLVM: store i32 4, ptr %[[VAR]], align 4 @@ -345,7 +345,7 @@ void func7() { // CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[TMP]] : !cir.ptr>, %[[OFFSET]] : !s64i), !cir.ptr> // CIR: cir.store{{.*}} %[[ELE_PTR]], %[[ARR_TMP]] : !cir.ptr>, !cir.ptr>> -// LLVM: define void @_Z5func7v() +// LLVM: define{{.*}} void @_Z5func7v() // LLVM: %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8 // LLVM: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[ELE_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0 @@ -363,7 +363,7 @@ void func8(int arr[10]) { int e2 = arr[1]; } -// CIR: cir.func @_Z5func8Pi(%[[ARG:.*]]: !cir.ptr +// CIR: cir.func{{.*}} @_Z5func8Pi(%[[ARG:.*]]: !cir.ptr // CIR: %[[ARR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arr", init] 
// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] // CIR: %[[INIT_2:.*]] = cir.alloca !s32i, !cir.ptr, ["e2", init] @@ -379,7 +379,7 @@ void func8(int arr[10]) { // CIR: %[[TMP_4:.*]] = cir.load{{.*}} %[[ELE_1]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP_4]], %[[INIT_2]] : !s32i, !cir.ptr -// LLVM: define void @_Z5func8Pi(ptr %[[ARG:.*]]) +// LLVM: define{{.*}} void @_Z5func8Pi(ptr %[[ARG:.*]]) // LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[INIT_2:.*]] = alloca i32, i64 1, align 4 @@ -410,7 +410,7 @@ void func9(int arr[10][5]) { int e = arr[1][2]; } -// CIR: cir.func @_Z5func9PA5_i(%[[ARG:.*]]: !cir.ptr> +// CIR: cir.func{{.*}} @_Z5func9PA5_i(%[[ARG:.*]]: !cir.ptr> // CIR: %[[ARR:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["arr", init] // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] // CIR: cir.store{{.*}} %[[ARG]], %[[ARR]] : !cir.ptr>, !cir.ptr>> @@ -423,7 +423,7 @@ void func9(int arr[10][5]) { // CIR: %[[TMP_2:.*]] = cir.load{{.*}} %[[ARR_1_2]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define void @_Z5func9PA5_i(ptr %[[ARG:.*]]) +// LLVM: define{{.*}} void @_Z5func9PA5_i(ptr %[[ARG:.*]]) // LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: store ptr %[[ARG]], ptr %[[ARR]], align 8 @@ -447,7 +447,7 @@ void func10(int *a) { int e = a[5]; } -// CIR: cir.func @_Z6func10Pi(%[[ARG:.*]]: !cir.ptr +// CIR: cir.func{{.*}} @_Z6func10Pi(%[[ARG:.*]]: !cir.ptr // CIR: %[[ARR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] // CIR: cir.store{{.*}} %[[ARG]], %[[ARR]] : !cir.ptr, !cir.ptr> @@ -457,7 +457,7 @@ void func10(int *a) { // CIR: %[[TMP_2:.*]] = cir.load{{.*}} %[[ELE]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define void @_Z6func10Pi(ptr %[[ARG:.*]]) { 
+// LLVM: define{{.*}} void @_Z6func10Pi(ptr %[[ARG:.*]]) { // LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: store ptr %[[ARG]], ptr %[[ARR]], align 8 diff --git a/clang/test/CIR/CodeGen/basic.c b/clang/test/CIR/CodeGen/basic.c index 7ff73ee95f799..2c3c5b0f22a5c 100644 --- a/clang/test/CIR/CodeGen/basic.c +++ b/clang/test/CIR/CodeGen/basic.c @@ -34,7 +34,7 @@ int f1(int i) { return i; } -// CIR: cir.func @f1(%arg0: !s32i loc({{.*}})) -> !s32i +// CIR: cir.func{{.*}} @f1(%arg0: !s32i loc({{.*}})) -> !s32i // CIR-NEXT: %[[I_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CIR-NEXT: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CIR-NEXT: cir.store{{.*}} %arg0, %[[I_PTR]] : !s32i, !cir.ptr @@ -44,7 +44,7 @@ int f1(int i) { // CIR-NEXT: %[[R:.*]] = cir.load{{.*}} %[[RV]] : !cir.ptr, !s32i // CIR-NEXT: cir.return %[[R]] : !s32i -// LLVM: define i32 @f1(i32 %[[IP:.*]]) +// LLVM: define{{.*}} i32 @f1(i32 %[[IP:.*]]) // LLVM-NEXT: %[[I_PTR:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: store i32 %[[IP]], ptr %[[I_PTR]], align 4 @@ -64,14 +64,14 @@ int f1(int i) { int f2(void) { return 3; } -// CIR: cir.func @f2() -> !s32i +// CIR: cir.func{{.*}} @f2() -> !s32i // CIR-NEXT: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CIR-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i // CIR-NEXT: cir.store{{.*}} %[[THREE]], %[[RV]] : !s32i, !cir.ptr // CIR-NEXT: %[[R:.*]] = cir.load{{.*}} %0 : !cir.ptr, !s32i // CIR-NEXT: cir.return %[[R]] : !s32i -// LLVM: define i32 @f2() +// LLVM: define{{.*}} i32 @f2() // LLVM-NEXT: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: store i32 3, ptr %[[RV]], align 4 // LLVM-NEXT: %[[R:.*]] = load i32, ptr %[[RV]], align 4 @@ -86,7 +86,7 @@ int f3(void) { return i; } -// CIR: cir.func @f3() -> !s32i +// CIR: cir.func{{.*}} @f3() -> !s32i // 
CIR-NEXT: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CIR-NEXT: %[[I_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CIR-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i @@ -96,7 +96,7 @@ int f3(void) { // CIR-NEXT: %[[R:.*]] = cir.load{{.*}} %[[RV]] : !cir.ptr, !s32i // CIR-NEXT: cir.return %[[R]] : !s32i -// LLVM: define i32 @f3() +// LLVM: define{{.*}} i32 @f3() // LLVM-NEXT: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[I_PTR:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: store i32 3, ptr %[[I_PTR]], align 4 @@ -117,10 +117,10 @@ void f4(void) { ; } -// CIR: cir.func @f4() +// CIR: cir.func{{.*}} @f4() // CIR-NEXT: cir.return -// LLVM: define void @f4() +// LLVM: define{{.*}} void @f4() // LLVM-NEXT: ret void // OGCG: define{{.*}} void @f4() @@ -133,7 +133,7 @@ void f5(void) { ; } -// CIR: cir.func @f5() +// CIR: cir.func{{.*}} @f5() // CIR-NEXT: cir.scope { // CIR-NEXT: cir.for : cond { // CIR-NEXT: %0 = cir.const #true @@ -147,7 +147,7 @@ void f5(void) { // CIR-NEXT: cir.return // CIR-NEXT: } -// LLVM: define void @f5() +// LLVM: define{{.*}} void @f5() // LLVM: br label %[[SCOPE:.*]] // LLVM: [[SCOPE]]: // LLVM: br label %[[LOOP:.*]] @@ -171,7 +171,7 @@ int f6(void) { return gv; } -// CIR: cir.func @f6() -> !s32i +// CIR: cir.func{{.*}} @f6() -> !s32i // CIR-NEXT: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CIR-NEXT: %[[GV_PTR:.*]] = cir.get_global @gv : !cir.ptr // CIR-NEXT: %[[GV:.*]] = cir.load{{.*}} %[[GV_PTR]] : !cir.ptr, !s32i @@ -179,7 +179,7 @@ int f6(void) { // CIR-NEXT: %[[R:.*]] = cir.load{{.*}} %[[RV]] : !cir.ptr, !s32i // CIR-NEXT: cir.return %[[R]] : !s32i -// LLVM: define i32 @f6() +// LLVM: define{{.*}} i32 @f6() // LLVM-NEXT: %[[RV_PTR:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[GV:.*]] = load i32, ptr @gv, align 4 // LLVM-NEXT: store i32 %[[GV]], ptr %[[RV_PTR]], align 4 @@ -195,7 +195,7 @@ int f7(int a, int b, int c) { 
return a + (b + c); } -// CIR: cir.func @f7 +// CIR: cir.func{{.*}} @f7 // CIR: %[[A_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] // CIR: %[[C_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["c", init] @@ -205,7 +205,7 @@ int f7(int a, int b, int c) { // CIR: %[[B_PLUS_C:.*]] = cir.binop(add, %[[B]], %[[C]]) nsw : !s32i // CIR: %[[RETVAL:.*]] = cir.binop(add, %[[A]], %[[B_PLUS_C]]) nsw : !s32i -// LLVM: define i32 @f7 +// LLVM: define{{.*}} i32 @f7 // LLVM: %[[A_PTR:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[B_PTR:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[C_PTR:.*]] = alloca i32, i64 1, align 4 @@ -231,7 +231,7 @@ int f8(int *p) { return (*p); } -// CIR: cir.func @f8 +// CIR: cir.func{{.*}} @f8 // CIR: %[[P_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] // CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i // CIR: %[[P:.*]] = cir.load deref{{.*}} %[[P_PTR]] : !cir.ptr>, !cir.ptr @@ -239,7 +239,7 @@ int f8(int *p) { // CIR: %[[P2:.*]] = cir.load deref{{.*}} %[[P_PTR]] : !cir.ptr>, !cir.ptr // CIR: %[[STAR_P:.*]] = cir.load{{.*}} %[[P2]] : !cir.ptr, !s32i -// LLVM: define i32 @f8 +// LLVM: define{{.*}} i32 @f8 // LLVM: %[[P_PTR:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[P:.*]] = load ptr, ptr %[[P_PTR]], align 8 // LLVM: store i32 2, ptr %[[P]], align 4 @@ -257,10 +257,10 @@ int f8(int *p) { void f9() {} -// CIR: cir.func @f9() +// CIR: cir.func{{.*}} @f9() // CIR-NEXT: cir.return -// LLVM: define void @f9() +// LLVM: define{{.*}} void @f9() // LLVM-NEXT: ret void // OGCG: define{{.*}} void @f9() @@ -269,12 +269,12 @@ void f9() {} void f10(int arg0, ...) {} -// CIR: cir.func @f10(%[[ARG0:.*]]: !s32i loc({{.*}}), ...) +// CIR: cir.func{{.*}} @f10(%[[ARG0:.*]]: !s32i loc({{.*}}), ...) 
// CIR-NEXT: %[[ARG0_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["arg0", init] {alignment = 4 : i64} // CIR-NEXT: cir.store{{.*}} %[[ARG0]], %[[ARG0_PTR]] : !s32i, !cir.ptr // CIR-NEXT: cir.return -// LLVM: define void @f10(i32 %[[ARG0:.*]], ...) +// LLVM: define{{.*}} void @f10(i32 %[[ARG0:.*]], ...) // LLVM-NEXT: %[[ARG0_PTR:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: store i32 %[[ARG0]], ptr %[[ARG0_PTR]], align 4 // LLVM-NEXT: ret void @@ -292,7 +292,7 @@ size_type max_size(void) { return (size_type)~0 / sizeof(_Tp); } -// CIR: cir.func @max_size() +// CIR: cir.func{{.*}} @max_size() // CIR: %0 = cir.alloca !u64i, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CIR: %1 = cir.const #cir.int<0> : !s32i // CIR: %2 = cir.unary(not, %1) : !s32i, !s32i @@ -300,7 +300,7 @@ size_type max_size(void) { // CIR: %4 = cir.const #cir.int<8> : !u64i // CIR: %5 = cir.binop(div, %3, %4) : !u64i -// LLVM: define i64 @max_size() +// LLVM: define{{.*}} i64 @max_size() // LLVM: store i64 2305843009213693951, ptr // OGCG: define{{.*}} i64 @max_size() @@ -315,10 +315,10 @@ void test_char_literal() { c = 'X'; } -// CIR: cir.func @test_char_literal +// CIR: cir.func{{.*}} @test_char_literal // CIR: cir.const #cir.int<88> -// LLVM: define void @test_char_literal() +// LLVM: define{{.*}} void @test_char_literal() // LLVM: store i8 88, ptr %{{.*}}, align 1 // OGCG: define{{.*}} void @test_char_literal() diff --git a/clang/test/CIR/CodeGen/basic.cpp b/clang/test/CIR/CodeGen/basic.cpp index ed1c6d364a0ef..fe6dd938f0faf 100644 --- a/clang/test/CIR/CodeGen/basic.cpp +++ b/clang/test/CIR/CodeGen/basic.cpp @@ -31,7 +31,7 @@ int f1() { return i; } -// CHECK: cir.func @_Z2f1v() -> !s32i +// CHECK: cir.func{{.*}} @_Z2f1v() -> !s32i // CHECK: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: %[[I_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["i"] {alignment = 4 : i64} // CHECK: %[[I:.*]] = cir.load{{.*}} %[[I_PTR]] : !cir.ptr, !s32i @@ -44,7 +44,7 @@ int f2() { return 
i; } -// CHECK: cir.func @_Z2f2v() -> !s32i +// CHECK: cir.func{{.*}} @_Z2f2v() -> !s32i // CHECK: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: %[[I_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init, const] {alignment = 4 : i64} // CHECK: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i @@ -58,7 +58,7 @@ int f3(int i) { return i; } -// CHECK: cir.func @_Z2f3i(%[[ARG:.*]]: !s32i loc({{.*}})) -> !s32i +// CHECK: cir.func{{.*}} @_Z2f3i(%[[ARG:.*]]: !s32i loc({{.*}})) -> !s32i // CHECK: %[[ARG_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CHECK: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: cir.store{{.*}} %[[ARG]], %[[ARG_ALLOCA]] : !s32i, !cir.ptr @@ -71,7 +71,7 @@ int f4(const int i) { return i; } -// CHECK: cir.func @_Z2f4i(%[[ARG:.*]]: !s32i loc({{.*}})) -> !s32i +// CHECK: cir.func{{.*}} @_Z2f4i(%[[ARG:.*]]: !s32i loc({{.*}})) -> !s32i // CHECK: %[[ARG_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init, const] {alignment = 4 : i64} // CHECK: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: cir.store{{.*}} %[[ARG]], %[[ARG_ALLOCA]] : !s32i, !cir.ptr @@ -91,7 +91,7 @@ int *f5() { return p; } -// CHECK: cir.func @_Z2f5v() -> !cir.ptr +// CHECK: cir.func{{.*}} @_Z2f5v() -> !cir.ptr // CHECK-NEXT: %[[RET_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__retval"] {alignment = 8 : i64} // CHECK-NEXT: %[[P_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} // CHECK-NEXT: %[[NULLPTR:.*]] = cir.const #cir.ptr : !cir.ptr @@ -120,7 +120,7 @@ size_type max_size() { return size_type(~0) / sizeof(_Tp); } -// CHECK: cir.func @_Z8max_sizev() -> !u64i +// CHECK: cir.func{{.*}} @_Z8max_sizev() -> !u64i // CHECK: %0 = cir.alloca !u64i, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CHECK: %1 = cir.const #cir.int<0> : !s32i // CHECK: %2 = cir.unary(not, %1) : !s32i, !s32i @@ -137,7 +137,7 @@ void ref_arg(int &x) 
{ x = 3; } -// CHECK: cir.func @_Z7ref_argRi(%[[ARG:.*]]: !cir.ptr {{.*}}) +// CHECK: cir.func{{.*}} @_Z7ref_argRi(%[[ARG:.*]]: !cir.ptr {{.*}}) // CHECK: %[[X_REF_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["x", init, const] {alignment = 8 : i64} // CHECK: %[[Y_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["y", init] {alignment = 4 : i64} // CHECK: cir.store{{.*}} %[[ARG]], %[[X_REF_ADDR]] : !cir.ptr, !cir.ptr> @@ -154,7 +154,7 @@ short &return_ref() { return gs; } -// CHECK: cir.func @_Z10return_refv() -> !cir.ptr +// CHECK: cir.func{{.*}} @_Z10return_refv() -> !cir.ptr // CHECK: %[[RETVAL_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__retval"] {alignment = 8 : i64} // CHECK: %[[GS_ADDR:.*]] = cir.get_global @gs : !cir.ptr // CHECK: cir.store{{.*}} %[[GS_ADDR]], %[[RETVAL_ADDR]] : !cir.ptr, !cir.ptr> @@ -165,7 +165,7 @@ void ref_local(short x) { short &y = x; } -// CHECK: cir.func @_Z9ref_locals(%[[ARG:.*]]: !s16i {{.*}}) +// CHECK: cir.func{{.*}} @_Z9ref_locals(%[[ARG:.*]]: !s16i {{.*}}) // CHECK: %[[X_ADDR:.*]] = cir.alloca !s16i, !cir.ptr, ["x", init] {alignment = 2 : i64} // CHECK: %[[Y_REF_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["y", init, const] {alignment = 8 : i64} // CHECK: cir.store{{.*}} %[[ARG]], %[[X_ADDR]] : !s16i, !cir.ptr diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c index 4955c988ec095..541b50a664c0e 100644 --- a/clang/test/CIR/CodeGen/binassign.c +++ b/clang/test/CIR/CodeGen/binassign.c @@ -17,7 +17,7 @@ void binary_assign(void) { i = 42; } -// CIR-LABEL: cir.func @binary_assign() { +// CIR-LABEL: cir.func{{.*}} @binary_assign() { // CIR: %[[B:.*]] = cir.alloca !cir.bool, !cir.ptr, ["b"] // CIR: %[[C:.*]] = cir.alloca !s8i, !cir.ptr, ["c"] // CIR: %[[F:.*]] = cir.alloca !cir.float, !cir.ptr, ["f"] diff --git a/clang/test/CIR/CodeGen/binop.cpp b/clang/test/CIR/CodeGen/binop.cpp index c728f0d0c1bc1..847e81755939f 100644 --- a/clang/test/CIR/CodeGen/binop.cpp +++ b/clang/test/CIR/CodeGen/binop.cpp @@ -16,7 
+16,7 @@ void b0(int a, int b) { x = x | b; } -// CIR-LABEL: cir.func @_Z2b0ii( +// CIR-LABEL: cir.func{{.*}} @_Z2b0ii( // CIR: %{{.+}} = cir.binop(mul, %{{.+}}, %{{.+}}) nsw : !s32i // CIR: %{{.+}} = cir.binop(div, %{{.+}}, %{{.+}}) : !s32i // CIR: %{{.+}} = cir.binop(rem, %{{.+}}, %{{.+}}) : !s32i @@ -27,7 +27,7 @@ void b0(int a, int b) { // CIR: %{{.+}} = cir.binop(or, %{{.+}}, %{{.+}}) : !s32i // CIR: cir.return -// LLVM-LABEL: define void @_Z2b0ii( +// LLVM-LABEL: define{{.*}} void @_Z2b0ii( // LLVM-SAME: i32 %[[A:.*]], i32 %[[B:.*]]) // LLVM: %[[A_ADDR:.*]] = alloca i32 // LLVM: %[[B_ADDR:.*]] = alloca i32 @@ -77,7 +77,7 @@ void b0(int a, int b) { // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z2b0ii(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { +// OGCG-LABEL: define{{.*}} void @_Z2b0ii(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { // OGCG: %[[A_ADDR:.*]] = alloca i32 // OGCG: %[[B_ADDR:.*]] = alloca i32 // OGCG: %[[X:.*]] = alloca i32 @@ -133,14 +133,14 @@ void testFloatingPointBinOps(float a, float b) { a - b; } -// CIR-LABEL: cir.func @_Z23testFloatingPointBinOpsff( +// CIR-LABEL: cir.func{{.*}} @_Z23testFloatingPointBinOpsff( // CIR: cir.binop(mul, %{{.+}}, %{{.+}}) : !cir.float // CIR: cir.binop(div, %{{.+}}, %{{.+}}) : !cir.float // CIR: cir.binop(add, %{{.+}}, %{{.+}}) : !cir.float // CIR: cir.binop(sub, %{{.+}}, %{{.+}}) : !cir.float // CIR: cir.return -// LLVM-LABEL: define void @_Z23testFloatingPointBinOpsff( +// LLVM-LABEL: define{{.*}} void @_Z23testFloatingPointBinOpsff( // LLVM-SAME: float %[[A:.*]], float %[[B:.*]]) // LLVM: %[[A_ADDR:.*]] = alloca float, i64 1 // LLVM: %[[B_ADDR:.*]] = alloca float, i64 1 @@ -165,7 +165,7 @@ void testFloatingPointBinOps(float a, float b) { // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z23testFloatingPointBinOpsff(float {{.*}} %a, float {{.*}} %b) +// OGCG-LABEL: define{{.*}} void @_Z23testFloatingPointBinOpsff(float {{.*}} %a, float {{.*}} %b) // OGCG: %a.addr = alloca float // OGCG: %b.addr = alloca 
float // OGCG: store float %a, ptr %a.addr @@ -194,7 +194,7 @@ void signed_shift(int a, int b) { x = a << b; } -// CIR-LABEL: cir.func @_Z12signed_shiftii( +// CIR-LABEL: cir.func{{.*}} @_Z12signed_shiftii( // CIR-SAME: %[[ARG0:.*]]: !s32i{{.*}}, %[[ARG1:.*]]: !s32i{{.*}}) // CIR: %[[A_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] @@ -215,7 +215,7 @@ void signed_shift(int a, int b) { // CIR: cir.return -// LLVM-LABEL: define void @_Z12signed_shiftii +// LLVM-LABEL: define{{.*}} void @_Z12signed_shiftii // LLVM-SAME: (i32 %[[A:.*]], i32 %[[B:.*]]) // LLVM: %[[A_ADDR:.*]] = alloca i32 // LLVM: %[[B_ADDR:.*]] = alloca i32 @@ -235,7 +235,7 @@ void signed_shift(int a, int b) { // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z12signed_shiftii +// OGCG-LABEL: define{{.*}} void @_Z12signed_shiftii // OGCG-SAME: (i32 {{.*}} %[[A:.*]], i32 {{.*}} %[[B:.*]]) // OGCG: %[[A_ADDR:.*]] = alloca i32 // OGCG: %[[B_ADDR:.*]] = alloca i32 @@ -260,7 +260,7 @@ void unsigned_shift(unsigned a, unsigned b) { x = a << b; } -// CIR-LABEL: cir.func @_Z14unsigned_shiftjj( +// CIR-LABEL: cir.func{{.*}} @_Z14unsigned_shiftjj( // CIR-SAME: %[[ARG0:.*]]: !u32i{{.*}}, %[[ARG1:.*]]: !u32i{{.*}}) // CIR: %[[A_PTR:.*]] = cir.alloca !u32i, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !u32i, !cir.ptr, ["b", init] @@ -281,7 +281,7 @@ void unsigned_shift(unsigned a, unsigned b) { // CIR: cir.return -// LLVM-LABEL: define void @_Z14unsigned_shiftjj +// LLVM-LABEL: define{{.*}} void @_Z14unsigned_shiftjj // LLVM-SAME: (i32 %[[A:.*]], i32 %[[B:.*]]) // LLVM: %[[A_ADDR:.*]] = alloca i32 // LLVM: %[[B_ADDR:.*]] = alloca i32 @@ -301,7 +301,7 @@ void unsigned_shift(unsigned a, unsigned b) { // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z14unsigned_shiftjj +// OGCG-LABEL: define{{.*}} void @_Z14unsigned_shiftjj // OGCG-SAME: (i32 {{.*}} %[[A:.*]], i32 {{.*}} %[[B:.*]]) // OGCG: %[[A_ADDR:.*]] = alloca i32 // 
OGCG: %[[B_ADDR:.*]] = alloca i32 @@ -326,7 +326,7 @@ void zext_shift_example(int a, unsigned char b) { x = a << b; } -// CIR-LABEL: cir.func @_Z18zext_shift_exampleih( +// CIR-LABEL: cir.func{{.*}} @_Z18zext_shift_exampleih( // CIR-SAME: %[[ARG0:.*]]: !s32i{{.*}}, %[[ARG1:.*]]: !u8i{{.*}}) // CIR: %[[A_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !u8i, !cir.ptr, ["b", init] @@ -349,7 +349,7 @@ void zext_shift_example(int a, unsigned char b) { // CIR: cir.return -// LLVM-LABEL: define void @_Z18zext_shift_exampleih +// LLVM-LABEL: define{{.*}} void @_Z18zext_shift_exampleih // LLVM-SAME: (i32 %[[A:.*]], i8 %[[B:.*]]) // LLVM: %[[A_ADDR:.*]] = alloca i32 // LLVM: %[[B_ADDR:.*]] = alloca i8 @@ -371,7 +371,7 @@ void zext_shift_example(int a, unsigned char b) { // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z18zext_shift_exampleih +// OGCG-LABEL: define{{.*}} void @_Z18zext_shift_exampleih // OGCG-SAME: (i32 {{.*}} %[[A:.*]], i8 {{.*}} %[[B:.*]]) // OGCG: %[[A_ADDR:.*]] = alloca i32 // OGCG: %[[B_ADDR:.*]] = alloca i8 @@ -398,7 +398,7 @@ void sext_shift_example(int a, signed char b) { x = a << b; } -// CIR-LABEL: cir.func @_Z18sext_shift_exampleia( +// CIR-LABEL: cir.func{{.*}} @_Z18sext_shift_exampleia( // CIR-SAME: %[[ARG0:.*]]: !s32i{{.*}}, %[[ARG1:.*]]: !s8i{{.*}}) // CIR: %[[A_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !s8i, !cir.ptr, ["b", init] @@ -421,7 +421,7 @@ void sext_shift_example(int a, signed char b) { // CIR: cir.return -// LLVM-LABEL: define void @_Z18sext_shift_exampleia +// LLVM-LABEL: define{{.*}} void @_Z18sext_shift_exampleia // LLVM-SAME: (i32 %[[A:.*]], i8 %[[B:.*]]) // LLVM: %[[A_ADDR:.*]] = alloca i32 // LLVM: %[[B_ADDR:.*]] = alloca i8 @@ -443,7 +443,7 @@ void sext_shift_example(int a, signed char b) { // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z18sext_shift_exampleia +// OGCG-LABEL: define{{.*}} void @_Z18sext_shift_exampleia // 
OGCG-SAME: (i32 {{.*}} %[[A:.*]], i8 {{.*}} %[[B:.*]]) // OGCG: %[[A_ADDR:.*]] = alloca i32 // OGCG: %[[B_ADDR:.*]] = alloca i8 @@ -470,7 +470,7 @@ void long_shift_example(long long a, short b) { x = a << b; } -// CIR-LABEL: cir.func @_Z18long_shift_examplexs( +// CIR-LABEL: cir.func{{.*}} @_Z18long_shift_examplexs( // CIR-SAME: %[[ARG0:.*]]: !s64i{{.*}}, %[[ARG1:.*]]: !s16i{{.*}}) // CIR: %[[A_PTR:.*]] = cir.alloca !s64i, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !s16i, !cir.ptr, ["b", init] @@ -493,7 +493,7 @@ void long_shift_example(long long a, short b) { // CIR: cir.return -// LLVM-LABEL: define void @_Z18long_shift_examplexs +// LLVM-LABEL: define{{.*}} void @_Z18long_shift_examplexs // LLVM-SAME: (i64 %[[A:.*]], i16 %[[B:.*]]) // LLVM: %[[A_ADDR:.*]] = alloca i64 // LLVM: %[[B_ADDR:.*]] = alloca i16 @@ -517,7 +517,7 @@ void long_shift_example(long long a, short b) { // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z18long_shift_examplexs +// OGCG-LABEL: define{{.*}} void @_Z18long_shift_examplexs // OGCG-SAME: (i64 {{.*}} %[[A:.*]], i16 {{.*}} %[[B:.*]]) // OGCG: %[[A_ADDR:.*]] = alloca i64 // OGCG: %[[B_ADDR:.*]] = alloca i16 @@ -546,7 +546,7 @@ void b1(bool a, bool b) { x = x || b; } -// CIR-LABEL: cir.func @_Z2b1bb( +// CIR-LABEL: cir.func{{.*}} @_Z2b1bb( // CIR-SAME: %[[ARG0:.*]]: !cir.bool {{.*}}, %[[ARG1:.*]]: !cir.bool {{.*}}) // CIR: [[A:%[0-9]+]] = cir.alloca !cir.bool, !cir.ptr, ["a", init] // CIR: [[B:%[0-9]+]] = cir.alloca !cir.bool, !cir.ptr, ["b", init] @@ -574,7 +574,7 @@ void b1(bool a, bool b) { // CIR: cir.return -// LLVM-LABEL: define void @_Z2b1bb( +// LLVM-LABEL: define{{.*}} void @_Z2b1bb( // LLVM-SAME: i1 %[[ARG0:.+]], i1 %[[ARG1:.+]]) // LLVM: %[[A_ADDR:.*]] = alloca i8 // LLVM: %[[B_ADDR:.*]] = alloca i8 @@ -611,7 +611,7 @@ void b1(bool a, bool b) { // LLVM: store i8 %[[ZEXT_OR]], ptr %[[X]] // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z2b1bb +// OGCG-LABEL: define{{.*}} void @_Z2b1bb // 
OGCG-SAME: (i1 {{.*}} %[[ARG0:.+]], i1 {{.*}} %[[ARG1:.+]]) // OGCG: [[ENTRY:.*]]: // OGCG: %[[A_ADDR:.*]] = alloca i8 @@ -650,7 +650,7 @@ void b3(int a, int b, int c, int d) { x = (a == b) || (c == d); } -// CIR-LABEL: cir.func @_Z2b3iiii( +// CIR-LABEL: cir.func{{.*}} @_Z2b3iiii( // CIR-SAME: %[[ARG0:.*]]: !s32i {{.*}}, %[[ARG1:.*]]: !s32i {{.*}}, %[[ARG2:.*]]: !s32i {{.*}}, %[[ARG3:.*]]: !s32i {{.*}}) // CIR: [[A:%[0-9]+]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR: [[B:%[0-9]+]] = cir.alloca !s32i, !cir.ptr, ["b", init] @@ -690,7 +690,7 @@ void b3(int a, int b, int c, int d) { // CIR: cir.return -// LLVM-LABEL: define void @_Z2b3iiii( +// LLVM-LABEL: define{{.*}} void @_Z2b3iiii( // LLVM-SAME: i32 %[[ARG0:.+]], i32 %[[ARG1:.+]], i32 %[[ARG2:.+]], i32 %[[ARG3:.+]]) // LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1 // LLVM: %[[B_ADDR:.*]] = alloca i32, i64 1 @@ -733,7 +733,7 @@ void b3(int a, int b, int c, int d) { // LLVM: store i8 %[[ZEXT_OR]], ptr %[[X]] // LLVM: ret void -// OGCG-LABEL: define dso_local void @_Z2b3iiii( +// OGCG-LABEL: define{{.*}} void @_Z2b3iiii( // OGCG-SAME: i32 {{.*}} %[[ARG0:.+]], i32 {{.*}} %[[ARG1:.+]], i32 {{.*}} %[[ARG2:.+]], i32 {{.*}} %[[ARG3:.+]]) // OGCG: [[ENTRY:.*]]: // OGCG: %[[A_ADDR:.*]] = alloca i32 @@ -771,4 +771,4 @@ void b3(int a, int b, int c, int d) { // OGCG: %[[OR_PHI:.*]] = phi i1 [ true, %[[AND_MERGE]] ], [ %[[CMP4]], %[[OR_FALSE]] ] // OGCG: %[[ZEXT_OR:.*]] = zext i1 %[[OR_PHI]] to i8 // OGCG: store i8 %[[ZEXT_OR]], ptr %[[X]] -// OGCG: ret void \ No newline at end of file +// OGCG: ret void diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp index bbe5e36b8bd99..b956f2580593e 100644 --- a/clang/test/CIR/CodeGen/builtin_call.cpp +++ b/clang/test/CIR/CodeGen/builtin_call.cpp @@ -27,7 +27,7 @@ int is_constant_evaluated() { return __builtin_is_constant_evaluated(); } -// CIR: cir.func @_Z21is_constant_evaluatedv() -> !s32i +// CIR: cir.func{{.*}} 
@_Z21is_constant_evaluatedv() -> !s32i // CIR: %[[ZERO:.+]] = cir.const #cir.int<0> // LLVM: define {{.*}}i32 @_Z21is_constant_evaluatedv() @@ -45,7 +45,7 @@ long double constant_fp_builtin_ld() { return __builtin_fabsl(-0.1L); } -// CIR: cir.func @_Z22constant_fp_builtin_ldv() -> !cir.long_double +// CIR: cir.func{{.*}} @_Z22constant_fp_builtin_ldv() -> !cir.long_double // CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.long_double // LLVM: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv() @@ -63,7 +63,7 @@ float constant_fp_builtin_single() { return __builtin_fabsf(-0.1f); } -// CIR: cir.func @_Z26constant_fp_builtin_singlev() -> !cir.float +// CIR: cir.func{{.*}} @_Z26constant_fp_builtin_singlev() -> !cir.float // CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.float // LLVM: define {{.*}}float @_Z26constant_fp_builtin_singlev() @@ -82,16 +82,16 @@ void library_builtins() { __builtin_abort(); } -// CIR: cir.func @_Z16library_builtinsv() { +// CIR: cir.func{{.*}} @_Z16library_builtinsv() { // CIR: %[[NULL:.+]] = cir.const #cir.ptr : !cir.ptr // CIR: cir.call @printf(%[[NULL]]) : (!cir.ptr) -> !s32i // CIR: cir.call @abort() : () -> () -// LLVM: define void @_Z16library_builtinsv() +// LLVM: define{{.*}} void @_Z16library_builtinsv() // LLVM: call i32 (ptr, ...) @printf(ptr null) // LLVM: call void @abort() -// OGCG: define dso_local void @_Z16library_builtinsv() +// OGCG: define{{.*}} void @_Z16library_builtinsv() // OGCG: call i32 (ptr, ...) 
@printf(ptr noundef null) // OGCG: call void @abort() @@ -99,11 +99,11 @@ void assume(bool arg) { __builtin_assume(arg); } -// CIR: cir.func @_Z6assumeb +// CIR: cir.func{{.*}} @_Z6assumeb // CIR: cir.assume %{{.+}} : !cir.bool // CIR: } -// LLVM: define void @_Z6assumeb +// LLVM: define {{.*}}void @_Z6assumeb // LLVM: call void @llvm.assume(i1 %{{.+}}) // LLVM: } @@ -115,7 +115,7 @@ void expect(int x, int y) { __builtin_expect(x, y); } -// CIR-LABEL: cir.func @_Z6expectii +// CIR-LABEL: cir.func{{.*}} @_Z6expectii // CIR: %[[X:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i // CIR-NEXT: %[[X_LONG:.+]] = cir.cast(integral, %[[X]] : !s32i), !s64i // CIR-NEXT: %[[Y:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i @@ -123,7 +123,7 @@ void expect(int x, int y) { // CIR-NEXT: %{{.+}} = cir.expect(%[[X_LONG]], %[[Y_LONG]]) : !s64i // CIR: } -// LLVM-LABEL: define void @_Z6expectii +// LLVM-LABEL: define{{.*}} void @_Z6expectii // LLVM: %[[X:.+]] = load i32, ptr %{{.+}}, align 4 // LLVM-NEXT: %[[X_LONG:.+]] = sext i32 %[[X]] to i64 // LLVM-NEXT: %[[Y:.+]] = load i32, ptr %{{.+}}, align 4 @@ -135,7 +135,7 @@ void expect_prob(int x, int y) { __builtin_expect_with_probability(x, y, 0.25); } -// CIR-LABEL: cir.func @_Z11expect_probii +// CIR-LABEL: cir.func{{.*}} @_Z11expect_probii // CIR: %[[X:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i // CIR-NEXT: %[[X_LONG:.+]] = cir.cast(integral, %[[X]] : !s32i), !s64i // CIR-NEXT: %[[Y:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i @@ -143,7 +143,7 @@ void expect_prob(int x, int y) { // CIR-NEXT: %{{.+}} = cir.expect(%[[X_LONG]], %[[Y_LONG]], 2.500000e-01) : !s64i // CIR: } -// LLVM: define void @_Z11expect_probii +// LLVM: define{{.*}} void @_Z11expect_probii // LLVM: %[[X:.+]] = load i32, ptr %{{.+}}, align 4 // LLVM-NEXT: %[[X_LONG:.+]] = sext i32 %[[X]] to i64 // LLVM-NEXT: %[[Y:.+]] = load i32, ptr %{{.+}}, align 4 diff --git a/clang/test/CIR/CodeGen/builtin_printf.cpp b/clang/test/CIR/CodeGen/builtin_printf.cpp 
index 35c71eba86874..d12f822d43ebf 100644 --- a/clang/test/CIR/CodeGen/builtin_printf.cpp +++ b/clang/test/CIR/CodeGen/builtin_printf.cpp @@ -18,9 +18,9 @@ void func(char const * const str, int i) { __builtin_printf("%s %d\n", str, i); } -// CIR: cir.func @printf(!cir.ptr, ...) -> !s32i +// CIR: cir.func{{.*}} @printf(!cir.ptr, ...) -> !s32i -// CIR: cir.func @_Z4funcPKci(%[[arg0:.+]]: !cir.ptr{{.*}}, %[[arg1:.+]]: !s32i{{.*}}) { +// CIR: cir.func{{.*}} @_Z4funcPKci(%[[arg0:.+]]: !cir.ptr{{.*}}, %[[arg1:.+]]: !s32i{{.*}}) { // CIR: %[[str_ptr:.+]] = cir.alloca !cir.ptr, !cir.ptr>, ["str", init, const] // CIR: %[[i_ptr:.+]] = cir.alloca !s32i, !cir.ptr, ["i", init] // CIR: cir.store %[[arg0]], %[[str_ptr]] : !cir.ptr, !cir.ptr> @@ -38,7 +38,7 @@ void func(char const * const str, int i) { // CIR: %[[printf_result3:.+]] = cir.call @printf(%[[full_fmt_ptr]], %[[str_val2]], %[[i_val]]) : (!cir.ptr, !cir.ptr, !s32i) -> !s32i // CIR: cir.return -// LLVM: define void @_Z4funcPKci(ptr %[[arg0:.+]], i32 %[[arg1:.+]]) +// LLVM: define{{.*}} void @_Z4funcPKci(ptr %[[arg0:.+]], i32 %[[arg1:.+]]) // LLVM: %[[str_ptr:.+]] = alloca ptr // LLVM: %[[i_ptr:.+]] = alloca i32 // LLVM: store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}} @@ -51,7 +51,7 @@ void func(char const * const str, int i) { // LLVM: %[[printf_result3:.+]] = call i32 (ptr, ...) 
@printf(ptr @.str.1, ptr %[[str_val2]], i32 %[[i_val]]) // LLVM: ret void -// OGCG: define dso_local void @_Z4funcPKci(ptr noundef %[[arg0:.+]], i32 noundef %[[arg1:.+]]) +// OGCG: define{{.*}} void @_Z4funcPKci(ptr noundef %[[arg0:.+]], i32 noundef %[[arg1:.+]]) // OGCG: %[[str_ptr:.+]] = alloca ptr // OGCG: %[[i_ptr:.+]] = alloca i32 // OGCG: store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}} diff --git a/clang/test/CIR/CodeGen/call.c b/clang/test/CIR/CodeGen/call.c index f6aa41df7439e..83a66fca638c2 100644 --- a/clang/test/CIR/CodeGen/call.c +++ b/clang/test/CIR/CodeGen/call.c @@ -16,15 +16,15 @@ void f2() { f1(s); } -// CIR-LABEL: cir.func @f2() +// CIR-LABEL: cir.func{{.*}} @f2() // CIR: %[[S:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !rec_S // CIR-NEXT: cir.call @f1(%[[S]]) : (!rec_S) -> () -// LLVM-LABEL: define void @f2() +// LLVM-LABEL: define{{.*}} void @f2() // LLVM: %[[S:.+]] = load %struct.S, ptr %{{.+}}, align 4 // LLVM-NEXT: call void @f1(%struct.S %[[S]]) -// OGCG-LABEL: define dso_local void @f2() +// OGCG-LABEL: define{{.*}} void @f2() // OGCG: %[[S:.+]] = load i64, ptr %{{.+}}, align 4 // OGCG-NEXT: call void @f1(i64 %[[S]]) @@ -33,15 +33,15 @@ void f4() { struct S s = f3(); } -// CIR-LABEL: cir.func @f4() { +// CIR-LABEL: cir.func{{.*}} @f4() { // CIR: %[[S:.+]] = cir.call @f3() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[S]], %{{.+}} : !rec_S, !cir.ptr -// LLVM-LABEL: define void @f4() { +// LLVM-LABEL: define{{.*}} void @f4() { // LLVM: %[[S:.+]] = call %struct.S (...) @f3() // LLVM-NEXT: store %struct.S %[[S]], ptr %{{.+}}, align 4 -// OGCG-LABEL: define dso_local void @f4() #0 { +// OGCG-LABEL: define{{.*}} void @f4() #0 { // OGCG: %[[S:.+]] = call i64 (...) 
@f3() // OGCG-NEXT: store i64 %[[S]], ptr %{{.+}}, align 4 @@ -57,15 +57,15 @@ void f7() { f5(b); } -// CIR-LABEL: cir.func @f7() +// CIR-LABEL: cir.func{{.*}} @f7() // CIR: %[[B:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !rec_Big // CIR-NEXT: cir.call @f5(%[[B]]) : (!rec_Big) -> () -// LLVM-LABEL: define void @f7() { +// LLVM-LABEL: define{{.*}} void @f7() { // LLVM: %[[B:.+]] = load %struct.Big, ptr %{{.+}}, align 4 // LLVM-NEXT: call void @f5(%struct.Big %[[B]]) -// OGCG-LABEL: define dso_local void @f7() #0 { +// OGCG-LABEL: define{{.*}} void @f7() #0 { // OGCG: %[[B:.+]] = alloca %struct.Big, align 8 // OGCG-NEXT: call void @f5(ptr noundef byval(%struct.Big) align 8 %[[B]]) @@ -73,15 +73,15 @@ void f8() { struct Big b = f6(); } -// CIR-LABEL: cir.func @f8() +// CIR-LABEL: cir.func{{.*}} @f8() // CIR: %[[B:.+]] = cir.call @f6() : () -> !rec_Big // CIR: cir.store align(4) %[[B]], %{{.+}} : !rec_Big, !cir.ptr -// LLVM-LABEL: define void @f8() { +// LLVM-LABEL: define{{.*}} void @f8() { // LLVM: %[[B:.+]] = call %struct.Big (...) @f6() // LLVM-NEXT: store %struct.Big %[[B]], ptr %{{.+}}, align 4 -// OGCG-LABEL: define dso_local void @f8() #0 { +// OGCG-LABEL: define{{.*}} void @f8() #0 { // OGCG: %[[B:.+]] = alloca %struct.Big, align 4 // OGCG-NEXT: call void (ptr, ...) @f6(ptr dead_on_unwind writable sret(%struct.Big) align 4 %[[B]]) @@ -89,21 +89,21 @@ void f9() { f1(f3()); } -// CIR-LABEL: cir.func @f9() +// CIR-LABEL: cir.func{{.*}} @f9() // CIR: %[[SLOT:.+]] = cir.alloca !rec_S, !cir.ptr, ["agg.tmp0"] {alignment = 4 : i64} // CIR-NEXT: %[[RET:.+]] = cir.call @f3() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[RET]], %[[SLOT]] : !rec_S, !cir.ptr // CIR-NEXT: %[[ARG:.+]] = cir.load align(4) %[[SLOT]] : !cir.ptr, !rec_S // CIR-NEXT: cir.call @f1(%[[ARG]]) : (!rec_S) -> () -// LLVM-LABEL: define void @f9() { +// LLVM-LABEL: define{{.*}} void @f9() { // LLVM: %[[SLOT:.+]] = alloca %struct.S, i64 1, align 4 // LLVM-NEXT: %[[RET:.+]] = call %struct.S (...) 
@f3() // LLVM-NEXT: store %struct.S %[[RET]], ptr %[[SLOT]], align 4 // LLVM-NEXT: %[[ARG:.+]] = load %struct.S, ptr %[[SLOT]], align 4 // LLVM-NEXT: call void @f1(%struct.S %[[ARG]]) -// OGCG-LABEL: define dso_local void @f9() #0 { +// OGCG-LABEL: define{{.*}} void @f9() #0 { // OGCG: %[[SLOT:.+]] = alloca %struct.S, align 4 // OGCG-NEXT: %[[RET:.+]] = call i64 (...) @f3() // OGCG-NEXT: store i64 %[[RET]], ptr %[[SLOT]], align 4 @@ -116,17 +116,17 @@ int f12(void) { return f10(1) + f11(2); } -// CIR-LABEL: cir.func @f12() -> !s32i +// CIR-LABEL: cir.func{{.*}} @f12() -> !s32i // CIR: %[[A:.+]] = cir.const #cir.int<1> : !s32i // CIR-NEXT: %{{.+}} = cir.call @f10(%[[A]]) side_effect(pure) : (!s32i) -> !s32i // CIR-NEXT: %[[B:.+]] = cir.const #cir.int<2> : !s32i // CIR-NEXT: %{{.+}} = cir.call @f11(%[[B]]) side_effect(const) : (!s32i) -> !s32i -// LLVM-LABEL: define i32 @f12() +// LLVM-LABEL: define{{.*}} i32 @f12() // LLVM: %{{.+}} = call i32 @f10(i32 1) #[[ATTR0:.+]] // LLVM-NEXT: %{{.+}} = call i32 @f11(i32 2) #[[ATTR1:.+]] -// OGCG-LABEL: define dso_local i32 @f12() +// OGCG-LABEL: define{{.*}} i32 @f12() // OGCG: %{{.+}} = call i32 @f10(i32 noundef 1) #[[ATTR0:.+]] // OGCG-NEXT: %{{.+}} = call i32 @f11(i32 noundef 2) #[[ATTR1:.+]] diff --git a/clang/test/CIR/CodeGen/call.cpp b/clang/test/CIR/CodeGen/call.cpp index cc25afce1e5a4..f7653ed7a572b 100644 --- a/clang/test/CIR/CodeGen/call.cpp +++ b/clang/test/CIR/CodeGen/call.cpp @@ -8,11 +8,11 @@ void f2() { f1(); } -// CIR-LABEL: cir.func @_Z2f1v -// CIR-LABEL: cir.func @_Z2f2v +// CIR-LABEL: cir.func{{.*}} @_Z2f1v +// CIR-LABEL: cir.func{{.*}} @_Z2f2v // CIR: cir.call @_Z2f1v() : () -> () -// LLVM-LABEL: define void @_Z2f2v() { +// LLVM-LABEL: define{{.*}} void @_Z2f2v() { // LLVM: call void @_Z2f1v() int f3() { return 2; } @@ -21,11 +21,11 @@ int f4() { return x; } -// CIR-LABEL: cir.func @_Z2f3v() -> !s32i -// CIR-LABEL: cir.func @_Z2f4v() -> !s32i +// CIR-LABEL: cir.func{{.*}} @_Z2f3v() -> !s32i +// CIR-LABEL: 
cir.func{{.*}} @_Z2f4v() -> !s32i // CIR: cir.call @_Z2f3v() : () -> !s32i -// LLVM-LABEL: define i32 @_Z2f4v() { +// LLVM-LABEL: define{{.*}} i32 @_Z2f4v() { // LLVM: %{{.+}} = call i32 @_Z2f3v() int f5(int a, int *b, bool c); @@ -34,26 +34,26 @@ int f6() { return f5(2, &b, false); } -// CIR-LABEL: cir.func @_Z2f6v() -> !s32i +// CIR-LABEL: cir.func{{.*}} @_Z2f6v() -> !s32i // CIR: %[[#b:]] = cir.alloca !s32i, !cir.ptr, ["b", init] // CIR: %[[#a:]] = cir.const #cir.int<2> : !s32i // CIR-NEXT: %[[#c:]] = cir.const #false // CIR-NEXT: %{{.+}} = cir.call @_Z2f5iPib(%[[#a]], %[[#b:]], %[[#c]]) : (!s32i, !cir.ptr, !cir.bool) -> !s32i -// LLVM-LABEL: define i32 @_Z2f6v() { +// LLVM-LABEL: define{{.*}} i32 @_Z2f6v() { // LLVM: %{{.+}} = call i32 @_Z2f5iPib(i32 2, ptr %{{.+}}, i1 false) int f7(int (*ptr)(int, int)) { return ptr(1, 2); } -// CIR-LABEL: cir.func @_Z2f7PFiiiE +// CIR-LABEL: cir.func{{.*}} @_Z2f7PFiiiE // CIR: %[[#ptr:]] = cir.load{{.*}} %{{.+}} : !cir.ptr !s32i>>>, !cir.ptr !s32i>> // CIR-NEXT: %[[#a:]] = cir.const #cir.int<1> : !s32i // CIR-NEXT: %[[#b:]] = cir.const #cir.int<2> : !s32i // CIR-NEXT: %{{.+}} = cir.call %[[#ptr]](%[[#a]], %[[#b]]) : (!cir.ptr !s32i>>, !s32i, !s32i) -> !s32i -// LLVM-LABEL: define i32 @_Z2f7PFiiiE +// LLVM-LABEL: define{{.*}} i32 @_Z2f7PFiiiE // LLVM: %[[#ptr:]] = load ptr, ptr %{{.+}} // LLVM-NEXT: %{{.+}} = call i32 %[[#ptr]](i32 1, i32 2) @@ -63,11 +63,11 @@ void f9() { f8(1, 2, 3, 4); } -// CIR-LABEL: cir.func @_Z2f9v() +// CIR-LABEL: cir.func{{.*}} @_Z2f9v() // CIR: cir.call @_Z2f8iz(%{{.+}}) : (!s32i) -> () // CIR: cir.call @_Z2f8iz(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) : (!s32i, !s32i, !s32i, !s32i) -> () -// LLVM-LABEL: define void @_Z2f9v() +// LLVM-LABEL: define{{.*}} void @_Z2f9v() // LLVM: call void (i32, ...) @_Z2f8iz(i32 1) // LLVM: call void (i32, ...) 
@_Z2f8iz(i32 1, i32 2, i32 3, i32 4) @@ -81,11 +81,11 @@ void f11() { S s = f10(); } -// CIR-LABEL: cir.func @_Z3f11v() +// CIR-LABEL: cir.func{{.*}} @_Z3f11v() // CIR: %[[#s:]] = cir.call @_Z3f10v() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[#s]], %{{.+}} : !rec_S, !cir.ptr -// LLVM-LABEL: define void @_Z3f11v() +// LLVM-LABEL: define{{.*}} void @_Z3f11v() // LLVM: %[[#s:]] = call %struct.S @_Z3f10v() // LLVM-NEXT: store %struct.S %[[#s]], ptr %{{.+}}, align 4 @@ -93,12 +93,12 @@ void f12() { f10(); } -// CIR-LABEL: cir.func @_Z3f12v() +// CIR-LABEL: cir.func{{.*}} @_Z3f12v() // CIR: %[[#slot:]] = cir.alloca !rec_S, !cir.ptr, ["agg.tmp0"] // CIR-NEXT: %[[#ret:]] = cir.call @_Z3f10v() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[#ret]], %[[#slot]] : !rec_S, !cir.ptr -// LLVM-LABEL: define void @_Z3f12v() { +// LLVM-LABEL: define{{.*}} void @_Z3f12v() { // LLVM: %[[#slot:]] = alloca %struct.S, i64 1, align 4 // LLVM-NEXT: %[[#ret:]] = call %struct.S @_Z3f10v() // LLVM-NEXT: store %struct.S %[[#ret]], ptr %[[#slot]], align 4 diff --git a/clang/test/CIR/CodeGen/cast.cpp b/clang/test/CIR/CodeGen/cast.cpp index 84f55242a6118..caf6de7c7d485 100644 --- a/clang/test/CIR/CodeGen/cast.cpp +++ b/clang/test/CIR/CodeGen/cast.cpp @@ -7,7 +7,7 @@ unsigned char cxxstaticcast_0(unsigned int x) { return static_cast(x); } -// CIR: cir.func @_Z15cxxstaticcast_0j +// CIR: cir.func{{.*}} @_Z15cxxstaticcast_0j // CIR: %[[XPTR:[0-9]+]] = cir.alloca !u32i, !cir.ptr, ["x", init] {alignment = 4 : i64} // CIR: %[[RV:[0-9]+]] = cir.alloca !u8i, !cir.ptr, ["__retval"] {alignment = 1 : i64} // CIR: cir.store %arg0, %[[XPTR]] : !u32i, !cir.ptr @@ -18,7 +18,7 @@ unsigned char cxxstaticcast_0(unsigned int x) { // CIR: cir.return %[[R]] : !u8i // CIR: } -// LLVM: define i8 @_Z15cxxstaticcast_0j(i32 %{{[0-9]+}}) +// LLVM: define{{.*}} i8 @_Z15cxxstaticcast_0j(i32 %{{[0-9]+}}) // LLVM: %[[LOAD:[0-9]+]] = load i32, ptr %{{[0-9]+}}, align 4 // LLVM: %[[TRUNC:[0-9]+]] = trunc i32 %[[LOAD]] to 
i8 // LLVM: store i8 %[[TRUNC]], ptr %[[RV:[0-9]+]], align 1 @@ -26,8 +26,8 @@ unsigned char cxxstaticcast_0(unsigned int x) { // LLVM: ret i8 %[[R]] int cStyleCasts_0(unsigned x1, int x2, float x3, short x4, double x5) { -// CIR: cir.func @_Z13cStyleCasts_0jifsd -// LLVM: define i32 @_Z13cStyleCasts_0jifsd +// CIR: cir.func{{.*}} @_Z13cStyleCasts_0jifsd +// LLVM: define{{.*}} i32 @_Z13cStyleCasts_0jifsd char a = (char)x1; // truncate // CIR: %{{[0-9]+}} = cir.cast(integral, %{{[0-9]+}} : !u32i), !s8i @@ -89,13 +89,13 @@ bool cptr(void *d) { return x; } -// CIR: cir.func @_Z4cptrPv(%arg0: !cir.ptr +// CIR: cir.func{{.*}} @_Z4cptrPv(%arg0: !cir.ptr // CIR: %[[DPTR:[0-9]+]] = cir.alloca !cir.ptr, !cir.ptr>, ["d", init] {alignment = 8 : i64} // CIR: %[[DVAL:[0-9]+]] = cir.load{{.*}} %[[DPTR]] : !cir.ptr>, !cir.ptr // CIR: %{{[0-9]+}} = cir.cast(ptr_to_bool, %[[DVAL]] : !cir.ptr), !cir.bool -// LLVM-LABEL: define i1 @_Z4cptrPv(ptr %0) +// LLVM-LABEL: define{{.*}} i1 @_Z4cptrPv(ptr %0) // LLVM: %[[ARG_STORAGE:.*]] = alloca ptr, i64 1 // LLVM: %[[RETVAL:.*]] = alloca i8, i64 1 // LLVM: %[[X_STORAGE:.*]] = alloca i8, i64 1 @@ -114,7 +114,7 @@ void should_not_cast() { (void) ib; // void cast } -// CIR: cir.func @_Z15should_not_castv +// CIR: cir.func{{.*}} @_Z15should_not_castv // CIR-NOT: cir.cast // CIR: cir.return diff --git a/clang/test/CIR/CodeGen/class.cpp b/clang/test/CIR/CodeGen/class.cpp index d7f3772c95826..43dde12df40f0 100644 --- a/clang/test/CIR/CodeGen/class.cpp +++ b/clang/test/CIR/CodeGen/class.cpp @@ -51,7 +51,7 @@ class Derived : public Base { int use(Derived *d) { return d->b; } -// CIR: cir.func @_Z3useP7Derived(%[[ARG0:.*]]: !cir.ptr +// CIR: cir.func{{.*}} @_Z3useP7Derived(%[[ARG0:.*]]: !cir.ptr // CIR: %[[D_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["d", init] // CIR: cir.store %[[ARG0]], %[[D_ADDR]] // CIR: %[[D_PTR:.*]] = cir.load align(8) %0 @@ -69,7 +69,7 @@ int use_base() { return d.a; } -// CIR: cir.func @_Z8use_basev +// CIR: cir.func{{.*}} 
@_Z8use_basev // CIR: %[[D_ADDR:.*]] = cir.alloca !rec_Derived, !cir.ptr, ["d"] // CIR: %[[BASE_ADDR:.*]] cir.base_class_addr %[[D_ADDR]] : !cir.ptr nonnull [0] -> !cir.ptr // CIR: %[[D_A_ADDR:.*]] = cir.get_member %2[0] {name = "a"} : !cir.ptr -> !cir.ptr @@ -87,7 +87,7 @@ int use_base_via_pointer(Derived *d) { return d->a; } -// CIR: cir.func @_Z20use_base_via_pointerP7Derived(%[[ARG0:.*]]: !cir.ptr +// CIR: cir.func{{.*}} @_Z20use_base_via_pointerP7Derived(%[[ARG0:.*]]: !cir.ptr // CIR: %[[D_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["d", init] // CIR: cir.store %[[ARG0]], %[[D_ADDR]] // CIR: %[[D:.*]] = cir.load align(8) %[[D_ADDR]] diff --git a/clang/test/CIR/CodeGen/cmp.cpp b/clang/test/CIR/CodeGen/cmp.cpp index 40529d92b2a05..75c8cda0c3603 100644 --- a/clang/test/CIR/CodeGen/cmp.cpp +++ b/clang/test/CIR/CodeGen/cmp.cpp @@ -14,7 +14,7 @@ void c0(int a, int b) { x = a == b; } -// CIR-LABEL: cir.func @_Z2c0ii( +// CIR-LABEL: cir.func{{.*}} @_Z2c0ii( // CIR: %[[A_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] @@ -45,7 +45,7 @@ void c0(int a, int b) { // CIR: %[[B6:.*]] = cir.load{{.*}} %[[B_PTR]] // CIR: %{{.*}} = cir.cmp(eq, %[[A6]], %[[B6]]) : !s32i, !cir.bool -// LLVM-LABEL: define void @_Z2c0ii(i32 %0, i32 %1) { +// LLVM-LABEL: define{{.*}} void @_Z2c0ii(i32 %0, i32 %1) { // LLVM: %[[PTR1:.*]] = alloca i32, i64 1 // LLVM: %[[PTR2:.*]] = alloca i32, i64 1 // LLVM: %[[BOOL_PTR:.*]] = alloca i8, i64 1 @@ -88,7 +88,7 @@ void c0(int a, int b) { // LLVM: %[[ZEXT6:.*]] = zext i1 %[[CMP6]] to i8 // LLVM: store i8 %[[ZEXT6]], ptr %[[BOOL_PTR]] -// OGCG-LABEL: define dso_local void @_Z2c0ii(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { +// OGCG-LABEL: define{{.*}} void @_Z2c0ii(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { // OGCG: %[[PTR1:.*]] = alloca i32 // OGCG: %[[PTR2:.*]] = alloca i32 // OGCG: %[[BOOL_PTR:.*]] = alloca i8 @@ -140,7 +140,7 @@ void c0_unsigned(unsigned int a, unsigned int b) { x = a == b; } 
-// CIR-LABEL: cir.func @_Z11c0_unsignedjj( +// CIR-LABEL: cir.func{{.*}} @_Z11c0_unsignedjj( // CIR: %[[U_A_PTR:.*]] = cir.alloca !u32i, !cir.ptr, ["a", init] // CIR: %[[U_B_PTR:.*]] = cir.alloca !u32i, !cir.ptr, ["b", init] @@ -170,7 +170,7 @@ void c0_unsigned(unsigned int a, unsigned int b) { // CIR: %[[UB6:.*]] = cir.load{{.*}} %[[U_B_PTR]] // CIR: %{{.*}} = cir.cmp(eq, %[[UA6]], %[[UB6]]) : !u32i, !cir.bool -// LLVM-LABEL: define void @_Z11c0_unsignedjj(i32 %0, i32 %1) { +// LLVM-LABEL: define{{.*}} void @_Z11c0_unsignedjj(i32 %0, i32 %1) { // LLVM: %[[U_PTR1:.*]] = alloca i32, i64 1 // LLVM: %[[U_PTR2:.*]] = alloca i32, i64 1 // LLVM: %[[U_BOOL_PTR:.*]] = alloca i8, i64 1 @@ -213,7 +213,7 @@ void c0_unsigned(unsigned int a, unsigned int b) { // LLVM: %[[UZEXT6:.*]] = zext i1 %[[UCMP6]] to i8 // LLVM: store i8 %[[UZEXT6]], ptr %[[U_BOOL_PTR]] -// OGCG-LABEL: define dso_local void @_Z11c0_unsignedjj(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { +// OGCG-LABEL: define{{.*}} void @_Z11c0_unsignedjj(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { // OGCG: %[[U_PTR1:.*]] = alloca i32 // OGCG: %[[U_PTR2:.*]] = alloca i32 // OGCG: %[[U_BOOL_PTR:.*]] = alloca i8 @@ -265,7 +265,7 @@ void c0_float(float a, float b) { x = a == b; } -// CIR-LABEL: cir.func @_Z8c0_floatff(%arg0: !cir.float{{.*}}, %arg1: !cir.float{{.*}}) { +// CIR-LABEL: cir.func{{.*}} @_Z8c0_floatff(%arg0: !cir.float{{.*}}, %arg1: !cir.float{{.*}}) { // CIR: %[[A_PTR:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !cir.float, !cir.ptr, ["b", init] // CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init] @@ -303,7 +303,7 @@ void c0_float(float a, float b) { // CIR: %[[CMP6:.*]] = cir.cmp(eq, %[[A6]], %[[B6]]) : !cir.float, !cir.bool // CIR: cir.store{{.*}} %[[CMP6]], %[[X_PTR]] : !cir.bool, !cir.ptr -// LLVM-LABEL: define void @_Z8c0_floatff(float %0, float %1) { +// LLVM-LABEL: define{{.*}} void @_Z8c0_floatff(float %0, float %1) { // LLVM: %[[A_PTR:.*]] = alloca float // 
LLVM: %[[B_PTR:.*]] = alloca float // LLVM: store float %0, ptr %[[A_PTR]] @@ -320,7 +320,7 @@ void c0_float(float a, float b) { // LLVM: fcmp une float %{{.*}}, %{{.*}} // LLVM: fcmp oeq float %{{.*}}, %{{.*}} -// OGCG-LABEL: define dso_local void @_Z8c0_floatff(float {{.*}} %a, float {{.*}} %b) {{.*}} { +// OGCG-LABEL: define{{.*}} void @_Z8c0_floatff(float {{.*}} %a, float {{.*}} %b) {{.*}} { // OGCG: %[[A_PTR:.*]] = alloca float // OGCG: %[[B_PTR:.*]] = alloca float // OGCG: store float %a, ptr %[[A_PTR]] @@ -346,7 +346,7 @@ void pointer_cmp(int *a, int *b) { x = a != b; } -// CIR-LABEL: cir.func @_Z11pointer_cmpPiS_(%arg0: !cir.ptr{{.*}}, %arg1: !cir.ptr{{.*}}) { +// CIR-LABEL: cir.func{{.*}} @_Z11pointer_cmpPiS_(%arg0: !cir.ptr{{.*}}, %arg1: !cir.ptr{{.*}}) { // CIR: %[[A_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] @@ -360,7 +360,7 @@ void pointer_cmp(int *a, int *b) { // CIR: cir.cmp(eq, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool // CIR: cir.cmp(ne, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool -// LLVM-LABEL: define void @_Z11pointer_cmpPiS_(ptr %0, ptr %1) { +// LLVM-LABEL: define{{.*}} void @_Z11pointer_cmpPiS_(ptr %0, ptr %1) { // LLVM: %[[A_PTR:.*]] = alloca ptr // LLVM: %[[B_PTR:.*]] = alloca ptr // LLVM: store ptr %0, ptr %[[A_PTR]] @@ -376,7 +376,7 @@ void pointer_cmp(int *a, int *b) { // LLVM: icmp eq ptr %{{.*}}, %{{.*}} // LLVM: icmp ne ptr %{{.*}}, %{{.*}} -// OGCG-LABEL: define dso_local void @_Z11pointer_cmpPiS_(ptr {{.*}} %a, ptr {{.*}} %b) {{.*}} { +// OGCG-LABEL: define{{.*}} void @_Z11pointer_cmpPiS_(ptr {{.*}} %a, ptr {{.*}} %b) {{.*}} { // OGCG: %[[A_PTR:.*]] = alloca ptr // OGCG: %[[B_PTR:.*]] = alloca ptr // OGCG: store ptr %a, ptr %[[A_PTR]] @@ -401,7 +401,7 @@ void bool_cmp(bool a, bool b) { x = a != b; } -// CIR-LABEL: cir.func @_Z8bool_cmpbb(%arg0: !cir.bool{{.*}}, %arg1: !cir.bool{{.*}}) { +// CIR-LABEL: cir.func{{.*}} @_Z8bool_cmpbb(%arg0: !cir.bool{{.*}}, %arg1: 
!cir.bool{{.*}}) { // CIR: %[[A_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["b", init] // CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init] @@ -419,7 +419,7 @@ void bool_cmp(bool a, bool b) { // CIR: cir.cmp(eq // CIR: cir.cmp(ne -// LLVM-LABEL: define void @_Z8bool_cmpbb(i1 %0, i1 %1) { +// LLVM-LABEL: define{{.*}} void @_Z8bool_cmpbb(i1 %0, i1 %1) { // LLVM: %[[A_PTR:.*]] = alloca i8 // LLVM: %[[B_PTR:.*]] = alloca i8 // LLVM: %[[X_PTR:.*]] = alloca i8 @@ -444,7 +444,7 @@ void bool_cmp(bool a, bool b) { // LLVM: icmp eq // LLVM: icmp ne -// OGCG-LABEL: define dso_local void @_Z8bool_cmpbb(i1 {{.*}} %a, i1 {{.*}} %b) {{.*}} { +// OGCG-LABEL: define{{.*}} void @_Z8bool_cmpbb(i1 {{.*}} %a, i1 {{.*}} %b) {{.*}} { // OGCG: %[[A_PTR:.*]] = alloca i8 // OGCG: %[[B_PTR:.*]] = alloca i8 // OGCG: %[[X_PTR:.*]] = alloca i8 diff --git a/clang/test/CIR/CodeGen/comma.c b/clang/test/CIR/CodeGen/comma.c index d811f5a72bddf..a1479b85d3f04 100644 --- a/clang/test/CIR/CodeGen/comma.c +++ b/clang/test/CIR/CodeGen/comma.c @@ -16,7 +16,7 @@ void comma(void) { i = 100, 200; } -// CIR-LABEL: cir.func @comma() { +// CIR-LABEL: cir.func{{.*}} @comma() { // CIR: %[[B:.*]] = cir.alloca !cir.bool, !cir.ptr, ["b"] // CIR: %[[C:.*]] = cir.alloca !s8i, !cir.ptr, ["c"] // CIR: %[[F:.*]] = cir.alloca !cir.float, !cir.ptr, ["f"] diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index cfeed345b4f11..ad3720097a795 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -216,6 +216,53 @@ void foo9(double a, double b) { // OGCG: store double %[[TMP_A]], ptr %[[C_REAL_PTR]], align 8 // OGCG: store double %[[TMP_B]], ptr %[[C_IMAG_PTR]], align 8 +void foo12() { + double _Complex c; + double imag = __imag__ c; +} + +// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"] +// CIR: %[[INIT:.*]] = cir.alloca !cir.double, !cir.ptr, ["imag", 
init] +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.double +// CIR: cir.store{{.*}} %[[IMAG]], %[[INIT]] : !cir.double, !cir.ptr + +// LLVM: %[[COMPLEX:.*]] = alloca { double, double }, i64 1, align 8 +// LLVM: %[[INIT:.*]] = alloca double, i64 1, align 8 +// LLVM: %[[TMP:.*]] = load { double, double }, ptr %[[COMPLEX]], align 8 +// LLVM: %[[IMAG:.*]] = extractvalue { double, double } %[[TMP]], 1 +// LLVM: store double %[[IMAG]], ptr %[[INIT]], align 8 + +// OGCG: %[[COMPLEX:.*]] = alloca { double, double }, align 8 +// OGCG: %[[INIT:.*]] = alloca double, align 8 +// OGCG: %[[IMAG:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[TMP:.*]] = load double, ptr %[[IMAG]], align 8 +// OGCG: store double %[[TMP]], ptr %[[INIT]], align 8 + +void foo13() { + double _Complex c; + double real = __real__ c; +} + +// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"] +// CIR: %[[INIT:.*]] = cir.alloca !cir.double, !cir.ptr, ["real", init] +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.double +// CIR: cir.store{{.*}} %[[REAL]], %[[INIT]] : !cir.double, !cir.ptr + +// LLVM: %[[COMPLEX:.*]] = alloca { double, double }, i64 1, align 8 +// LLVM: %[[INIT:.*]] = alloca double, i64 1, align 8 +// LLVM: %[[TMP:.*]] = load { double, double }, ptr %[[COMPLEX]], align 8 +// LLVM: %[[REAL:.*]] = extractvalue { double, double } %[[TMP]], 0 +// LLVM: store double %[[REAL]], ptr %[[INIT]], align 8 + +// OGCG: %[[COMPLEX:.*]] = alloca { double, double }, align 8 +// OGCG: %[[INIT:.*]] = alloca double, align 8 +// OGCG: %[[REAL:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[TMP:.*]] = load double, ptr %[[REAL]], align 8 +// OGCG: store double %[[TMP]], ptr %[[INIT]], align 8 + + 
void foo14() { int _Complex c = 2i; } @@ -256,3 +303,69 @@ void foo15() { // OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_B]], i32 0, i32 1 // OGCG: store i32 %[[A_REAL]], ptr %[[B_REAL_PTR]], align 4 // OGCG: store i32 %[[A_IMAG]], ptr %[[B_IMAG_PTR]], align 4 + +int foo16(int _Complex a, int _Complex b) { + return __imag__ a + __imag__ b; +} + +// CIR: %[[RET:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[COMPLEX_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr>, !cir.complex +// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[COMPLEX_A]] : !cir.complex -> !s32i +// CIR: %[[COMPLEX_B:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr>, !cir.complex +// CIR: %[[B_IMAG:.*]] = cir.complex.imag %[[COMPLEX_B]] : !cir.complex -> !s32i +// CIR: %[[ADD:.*]] = cir.binop(add, %[[A_IMAG]], %[[B_IMAG]]) nsw : !s32i +// CIR: cir.store %[[ADD]], %[[RET]] : !s32i, !cir.ptr +// CIR: %[[TMP:.*]] = cir.load %[[RET]] : !cir.ptr, !s32i +// CIR: cir.return %[[TMP]] : !s32i + +// LLVM: %[[RET:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[COMPLEX_A:.*]] = load { i32, i32 }, ptr {{.*}}, align 4 +// LLVM: %[[A_IMAG:.*]] = extractvalue { i32, i32 } %[[COMPLEX_A]], 1 +// LLVM: %[[COMPLEX_B:.*]] = load { i32, i32 }, ptr {{.*}}, align 4 +// LLVM: %[[B_IMAG:.*]] = extractvalue { i32, i32 } %[[COMPLEX_B]], 1 +// LLVM: %[[ADD:.*]] = add nsw i32 %[[A_IMAG]], %[[B_IMAG]] +// LLVM: store i32 %[[ADD]], ptr %[[RET]], align 4 +// LLVM: %[[TMP:.*]] = load i32, ptr %[[RET]], align 4 +// LLVM: ret i32 %[[TMP]] + +// OGCG: %[[COMPLEX_A:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[COMPLEX_B:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_IMAG:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_A]], i32 0, i32 1 +// OGCG: %[[TMP_A:.*]] = load i32, ptr %[[A_IMAG]], align 4 +// OGCG: %[[B_IMAG:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_B]], i32 0, i32 1 +// OGCG: %[[TMP_B:.*]] = load i32, ptr %[[B_IMAG]], align 4 +// OGCG: %[[ADD:.*]] = add 
nsw i32 %[[TMP_A]], %[[TMP_B]] +// OGCG: ret i32 %[[ADD]] + +int foo17(int _Complex a, int _Complex b) { + return __real__ a + __real__ b; +} + +// CIR: %[[RET:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[COMPLEX_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr>, !cir.complex +// CIR: %[[A_REAL:.*]] = cir.complex.real %[[COMPLEX_A]] : !cir.complex -> !s32i +// CIR: %[[COMPLEX_B:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr>, !cir.complex +// CIR: %[[B_REAL:.*]] = cir.complex.real %[[COMPLEX_B]] : !cir.complex -> !s32i +// CIR: %[[ADD:.*]] = cir.binop(add, %[[A_REAL]], %[[B_REAL]]) nsw : !s32i +// CIR: cir.store %[[ADD]], %[[RET]] : !s32i, !cir.ptr +// CIR: %[[TMP:.*]] = cir.load %[[RET]] : !cir.ptr, !s32i +// CIR: cir.return %[[TMP]] : !s32i + +// LLVM: %[[RET:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[COMPLEX_A:.*]] = load { i32, i32 }, ptr {{.*}}, align 4 +// LLVM: %[[A_REAL:.*]] = extractvalue { i32, i32 } %[[COMPLEX_A]], 0 +// LLVM: %[[COMPLEX_B:.*]] = load { i32, i32 }, ptr {{.*}}, align 4 +// LLVM: %[[B_REAL:.*]] = extractvalue { i32, i32 } %[[COMPLEX_B]], 0 +// LLVM: %[[ADD:.*]] = add nsw i32 %[[A_REAL]], %[[B_REAL]] +// LLVM: store i32 %[[ADD]], ptr %[[RET]], align 4 +// LLVM: %[[TMP:.*]] = load i32, ptr %[[RET]], align 4 +// LLVM: ret i32 %[[TMP]] + +// OGCG: %[[COMPLEX_A:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[COMPLEX_B:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_A]], i32 0, i32 0 +// OGCG: %[[TMP_A:.*]] = load i32, ptr %[[A_REAL]], align 4 +// OGCG: %[[B_REAL:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_B]], i32 0, i32 0 +// OGCG: %[[TMP_B:.*]] = load i32, ptr %[[B_REAL]], align 4 +// OGCG: %[[ADD:.*]] = add nsw i32 %[[TMP_A]], %[[TMP_B]] +// OGCG: ret i32 %[[ADD]] \ No newline at end of file diff --git a/clang/test/CIR/CodeGen/compound_assign.cpp b/clang/test/CIR/CodeGen/compound_assign.cpp index 60442bcdf912e..04bf406d6dd2a 100644 --- 
a/clang/test/CIR/CodeGen/compound_assign.cpp +++ b/clang/test/CIR/CodeGen/compound_assign.cpp @@ -20,7 +20,7 @@ int compound_assign(int b) { return x; } -// CIR: cir.func @_Z15compound_assigni +// CIR: cir.func{{.*}} @_Z15compound_assigni // CIR: %[[MUL:.*]] = cir.binop(mul, %{{.*}}, %{{.*}}) nsw : !s32i // CIR: cir.store{{.*}} %[[MUL]], %{{.*}} : !s32i, !cir.ptr // CIR: %[[DIV:.*]] = cir.binop(div, %{{.*}}, %{{.*}}) : !s32i diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp index 0b009442b2f87..4c2877f8460d0 100644 --- a/clang/test/CIR/CodeGen/ctor.cpp +++ b/clang/test/CIR/CodeGen/ctor.cpp @@ -16,20 +16,20 @@ void baz() { // constructors here. The handling of constructor aliases is currently // NYI, but when it is added this test should be updated to add a RUN // line that passes '-mconstructor-aliases' to clang_cc1. -// CHECK: cir.func @_ZN5StrukC2Ev(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN5StrukC2Ev(%arg0: !cir.ptr // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] {alignment = 8 : i64} // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr // CHECK-NEXT: cir.return -// CHECK: cir.func @_ZN5StrukC1Ev(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN5StrukC1Ev(%arg0: !cir.ptr // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] {alignment = 8 : i64} // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr // CHECK-NEXT: cir.call @_ZN5StrukC2Ev(%[[THIS]]) : (!cir.ptr) -> () // CHECK-NEXT: cir.return -// CHECK: cir.func @_Z3bazv() +// CHECK: cir.func{{.*}} @_Z3bazv() // CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca !rec_Struk, !cir.ptr, ["s", init] {alignment = 4 : i64} // CHECK-NEXT: cir.call @_ZN5StrukC1Ev(%[[S_ADDR]]) : (!cir.ptr) -> () // CHECK-NEXT: cir.return @@ -45,9 +45,9 @@ void bar() { // When a 
variadic constructor is present, we call the C2 constructor directly. -// CHECK-NOT: cir.func @_ZN13VariadicStrukC2Eiz +// CHECK-NOT: cir.func{{.*}} @_ZN13VariadicStrukC2Eiz -// CHECK: cir.func @_ZN13VariadicStrukC1Eiz(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN13VariadicStrukC1Eiz(%arg0: !cir.ptr // CHECK-SAME: %arg1: !s32i // CHECK-SAME: ...) { // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] @@ -60,7 +60,7 @@ void bar() { // CHECK-NEXT: cir.store{{.*}} %[[N]], %[[A_ADDR]] // CHECK-NEXT: cir.return -// CHECK: cir.func @_Z3barv +// CHECK: cir.func{{.*}} @_Z3barv // CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca !rec_VariadicStruk, !cir.ptr, ["s", init] // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i @@ -78,7 +78,7 @@ void bam() { DelegatingStruk s; } -// CHECK: cir.func @_ZN15DelegatingStrukC2Ei(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN15DelegatingStrukC2Ei(%arg0: !cir.ptr // CHECK-SAME: %arg1: !s32i // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init] @@ -90,7 +90,7 @@ void bam() { // CHECK-NEXT: cir.store{{.*}} %[[N]], %[[A_ADDR]] // CHECK-NEXT: cir.return -// CHECK: cir.func @_ZN15DelegatingStrukC1Ei(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN15DelegatingStrukC1Ei(%arg0: !cir.ptr // CHECK-SAME: %arg1: !s32i // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init] @@ -101,7 +101,7 @@ void bam() { // CHECK-NEXT: cir.call @_ZN15DelegatingStrukC2Ei(%[[THIS]], %[[N]]) // CHECK-NEXT: cir.return -// CHECK: cir.func @_ZN15DelegatingStrukC1Ev(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN15DelegatingStrukC1Ev(%arg0: !cir.ptr // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] // CHECK-NEXT: %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]] @@ -109,7 +109,7 @@ void bam() { // 
CHECK-NEXT: cir.call @_ZN15DelegatingStrukC1Ei(%[[THIS]], %[[ZERO]]) // CHECK-NEXT: cir.return -// CHECK: cir.func @_Z3bamv +// CHECK: cir.func{{.*}} @_Z3bamv // CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca {{.*}} ["s", init] // CHECK-NEXT: cir.call @_ZN15DelegatingStrukC1Ev(%[[S_ADDR]]) // CHECK-NEXT: cir.return @@ -123,7 +123,7 @@ void init_member() { MemberInitStruk s; } -// CHECK: cir.func @_ZN15MemberInitStrukC2Ev(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN15MemberInitStrukC2Ev(%arg0: !cir.ptr // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] // CHECK-NEXT: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] @@ -132,14 +132,14 @@ void init_member() { // CHECK-NEXT: cir.store align(4) %[[ZERO]], %[[A_ADDR]] // CHECK-NEXT: cir.return -// CHECK: cir.func @_ZN15MemberInitStrukC1Ev(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN15MemberInitStrukC1Ev(%arg0: !cir.ptr // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] // CHECK-NEXT: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] // CHECK-NEXT: cir.call @_ZN15MemberInitStrukC2Ev(%[[THIS]]) // CHECK-NEXT: cir.return -// CHECK: cir.func @_Z11init_memberv +// CHECK: cir.func{{.*}} @_Z11init_memberv // CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca {{.*}} ["s", init] // CHECK-NEXT: cir.call @_ZN15MemberInitStrukC1Ev(%[[S_ADDR]]) // CHECK-NEXT: cir.return @@ -153,7 +153,7 @@ void init_param_member() { ParamMemberInitStruk s(0); } -// CHECK: cir.func @_ZN20ParamMemberInitStrukC2Ei(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN20ParamMemberInitStrukC2Ei(%arg0: !cir.ptr // CHECK-SAME: %arg1: !s32i // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init] @@ -165,7 +165,7 @@ void init_param_member() { // CHECK-NEXT: cir.store{{.*}} %[[N]], %[[A_ADDR]] // CHECK-NEXT: cir.return -// CHECK: cir.func @_ZN20ParamMemberInitStrukC1Ei(%arg0: !cir.ptr +// CHECK: 
cir.func{{.*}} @_ZN20ParamMemberInitStrukC1Ei(%arg0: !cir.ptr // CHECK-SAME: %arg1: !s32i // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init] @@ -176,7 +176,7 @@ void init_param_member() { // CHECK-NEXT: cir.call @_ZN20ParamMemberInitStrukC2Ei(%[[THIS]], %[[N]]) // CHECK-NEXT: cir.return -// CHECK: cir.func @_Z17init_param_memberv +// CHECK: cir.func{{.*}} @_Z17init_param_memberv // CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca {{.*}} ["s", init] // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> // CHECK-NEXT: cir.call @_ZN20ParamMemberInitStrukC1Ei(%[[S_ADDR]], %[[ZERO]]) @@ -197,7 +197,7 @@ void init_union() { UnionInitStruk s; } -// CHECK: cir.func @_ZN14UnionInitStrukC2Ev(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN14UnionInitStrukC2Ev(%arg0: !cir.ptr // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] // CHECK-NEXT: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] @@ -208,14 +208,14 @@ void init_union() { // CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[C_ADDR]] // CHECK-NEXT: cir.return -// CHECK: cir.func @_ZN14UnionInitStrukC1Ev(%arg0: !cir.ptr +// CHECK: cir.func{{.*}} @_ZN14UnionInitStrukC1Ev(%arg0: !cir.ptr // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] // CHECK-NEXT: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] // CHECK-NEXT: cir.call @_ZN14UnionInitStrukC2Ev // CHECK-NEXT: cir.return -// CHECK: cir.func @_Z10init_unionv +// CHECK: cir.func{{.*}} @_Z10init_unionv // CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca {{.*}} ["s", init] // CHECK-NEXT: cir.call @_ZN14UnionInitStrukC1Ev(%[[S_ADDR]]) // CHECK-NEXT: cir.return diff --git a/clang/test/CIR/CodeGen/dso-local.c b/clang/test/CIR/CodeGen/dso-local.c index 07c833d2fbc94..01c93cbd81ee1 100644 --- a/clang/test/CIR/CodeGen/dso-local.c +++ b/clang/test/CIR/CodeGen/dso-local.c @@ -3,7 +3,9 @@ // These are here so we find this test 
when grepping for missing features. // cir::MissingFeatures::opGlobalThreadLocal() -// cir::MissingFeatures::opFuncDsoLocal() + +// Note: Unlike classic codegen, CIR doesn't set dso_local on function declarations. This is +// a difference from classic codegen in the STATIC checks. /// Static relocation model defaults to -fdirect-access-external-data and sets /// dso_local on most global objects. @@ -13,6 +15,9 @@ // STATIC-NEXT: @import_var = external dso_local global i32 // STATIC-NEXT: @weak_bar = extern_weak dso_local global i32 // STATIC-NEXT: @bar = external dso_local global i32 +// STATIC-DAG: declare void @foo() +// STATIC-DAG: define dso_local ptr @zed() +// STATIC-DAG: declare void @import_func() /// If -fno-direct-access-external-data is set, drop dso_local from global variable /// declarations. @@ -21,30 +26,45 @@ // STATIC-INDIRECT-NEXT: @import_var = external global i32 // STATIC-INDIRECT-NEXT: @weak_bar = extern_weak global i32 // STATIC-INDIRECT-NEXT: @bar = external global i32 +// STATIC-INDIRECT-DAG: declare void @import_func() +// STATIC-INDIRECT-DAG: define dso_local ptr @zed() +// STATIC-INDIRECT-DAG: declare void @foo() // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -pic-level 1 -pic-is-pie %s -o - | FileCheck --check-prefix=PIE %s // PIE: @baz = dso_local global i32 42 // PIE-NEXT: @import_var = external global i32 // PIE-NEXT: @weak_bar = extern_weak global i32 // PIE-NEXT: @bar = external global i32 +// PIE-DAG: declare void @foo() +// PIE-DAG: define dso_local ptr @zed() +// PIE-DAG: declare void @import_func() // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -pic-level 1 -pic-is-pie -fdirect-access-external-data %s -o - | FileCheck --check-prefix=PIE-DIRECT %s // PIE-DIRECT: @baz = dso_local global i32 42 // PIE-DIRECT-NEXT: @import_var = external dso_local global i32 // PIE-DIRECT-NEXT: @weak_bar = extern_weak global i32 // PIE-DIRECT-NEXT: @bar = external dso_local global i32 +// PIE-DIRECT-DAG: declare
void @foo() +// PIE-DIRECT-DAG: define dso_local ptr @zed() +// PIE-DIRECT-DAG: declare void @import_func() // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -mrelocation-model static -fno-plt %s -o - | FileCheck --check-prefix=NOPLT %s // NOPLT: @baz = dso_local global i32 42 // NOPLT-NEXT: @import_var = external dso_local global i32 // NOPLT-NEXT: @weak_bar = extern_weak dso_local global i32 // NOPLT-NEXT: @bar = external dso_local global i32 +// NOPLT-DAG: declare void @foo() +// NOPLT-DAG: define dso_local ptr @zed() +// NOPLT-DAG: declare void @import_func() // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -fno-plt -pic-level 1 -pic-is-pie -fdirect-access-external-data %s -o - | FileCheck --check-prefix=PIE-DIRECT-NOPLT %s // PIE-DIRECT-NOPLT: @baz = dso_local global i32 42 // PIE-DIRECT-NOPLT-NEXT: @import_var = external dso_local global i32 // PIE-DIRECT-NOPLT-NEXT: @weak_bar = extern_weak global i32 // PIE-DIRECT-NOPLT-NEXT: @bar = external dso_local global i32 +// PIE-DIRECT-NOPLT-DAG: declare void @foo() +// PIE-DIRECT-NOPLT-DAG: define dso_local ptr @zed() +// PIE-DIRECT-NOPLT-DAG: declare void @import_func() // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -pic-level 1 -pic-is-pie -fno-plt %s -o - | FileCheck --check-prefix=PIE-NO-PLT %s // RUN: %clang_cc1 -triple powerpc64le -fclangir -emit-llvm -mrelocation-model static %s -o - | FileCheck --check-prefix=PIE-NO-PLT %s @@ -52,24 +72,34 @@ // PIE-NO-PLT-NEXT: @import_var = external global i32 // PIE-NO-PLT-NEXT: @weak_bar = extern_weak global i32 // PIE-NO-PLT-NEXT: @bar = external global i32 +// PIE-NO-PLT-DAG: declare void @import_func() +// PIE-NO-PLT-DAG: define dso_local ptr @zed() +// PIE-NO-PLT-DAG: declare void @foo() /// -fdirect-access-external-data is currently ignored for -fPIC. 
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -pic-level 2 %s -o - | FileCheck --check-prefix=SHARED %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -pic-level 2 -fdirect-access-external-data %s -o - | FileCheck --check-prefix=SHARED %s // SHARED-DAG: @bar = external global i32 // SHARED-DAG: @weak_bar = extern_weak global i32 +// SHARED-DAG: declare void @foo() // SHARED-DAG: @baz ={{.*}} global i32 42 +// SHARED-DAG: define{{.*}} ptr @zed() int baz = 42; __attribute__((dllimport)) extern int import_var; __attribute__((weak)) extern int weak_bar; extern int bar; +__attribute__((dllimport)) void import_func(void); int *use_import(void) { + import_func(); return &import_var; } +void foo(void); + int *zed(void) { + foo(); if (baz) return &weak_bar; return &bar; diff --git a/clang/test/CIR/CodeGen/forrange.cpp b/clang/test/CIR/CodeGen/forrange.cpp index 45e146e9091d0..485e9c331417b 100644 --- a/clang/test/CIR/CodeGen/forrange.cpp +++ b/clang/test/CIR/CodeGen/forrange.cpp @@ -13,10 +13,10 @@ void for_range() { ; } -// CIR: cir.func @_Z5beginR9Container(!cir.ptr) -> !cir.ptr -// CIR: cir.func @_Z3endR9Container(!cir.ptr) -> !cir.ptr) -> !cir.ptr +// CIR: cir.func{{.*}} @_Z3endR9Container(!cir.ptr) -> !cir.ptr{{.*}} ["__range1", init, const] @@ -59,7 +59,7 @@ void for_range2() { ; } -// CIR: cir.func @_Z10for_range2v() +// CIR: cir.func{{.*}} @_Z10for_range2v() // CIR: %[[C_ADDR:.*]] = cir.alloca !rec_C2{{.*}} ["c"] // CIR: cir.scope { // CIR: %[[RANGE_ADDR:.*]] = cir.alloca !cir.ptr{{.*}} ["__range1", init, const] @@ -111,7 +111,7 @@ void for_range3() { ; } -// CIR: cir.func @_Z10for_range3v() +// CIR: cir.func{{.*}} @_Z10for_range3v() // CIR: %[[C_ADDR:.*]] = cir.alloca !rec_C3{{.*}} ["c"] // CIR: cir.scope { // CIR: %[[RANGE_ADDR:.*]] = cir.alloca !cir.ptr{{.*}} ["__range1", init, const] diff --git a/clang/test/CIR/CodeGen/if.cpp b/clang/test/CIR/CodeGen/if.cpp index c78ca103de63b..daaec8a61484d 100644 --- 
a/clang/test/CIR/CodeGen/if.cpp +++ b/clang/test/CIR/CodeGen/if.cpp @@ -14,7 +14,7 @@ int if0(bool a) { } -// CIR: cir.func @_Z3if0b(%arg0: !cir.bool loc({{.*}})) -> !s32i +// CIR: cir.func{{.*}} @_Z3if0b(%arg0: !cir.bool loc({{.*}})) -> !s32i // CIR: cir.scope { // CIR: %4 = cir.load{{.*}} %0 : !cir.ptr, !cir.bool // CIR-NEXT: cir.if %4 { @@ -26,7 +26,7 @@ int if0(bool a) { // CIR-NEXT: } -// LLVM: define i32 @_Z3if0b(i1 %0) +// LLVM: define{{.*}} i32 @_Z3if0b(i1 %0) // LLVM: br label %[[ENTRY:.*]] // LLVM: [[ENTRY]]: // LLVM: %6 = load i8, ptr %2, align 1 @@ -43,7 +43,7 @@ int if0(bool a) { // LLVM: %12 = load i32, ptr %3, align 4 // LLVM: ret i32 %12 -// OGCG: define dso_local noundef i32 @_Z3if0b(i1 noundef zeroext %a) +// OGCG: define{{.*}} i32 @_Z3if0b(i1 noundef zeroext %a) // OGCG: entry: // OGCG: %[[RETVAL:.*]] = alloca i32, align 4 // OGCG: %[[A_ADDR:.*]] = alloca i8, align 1 @@ -71,7 +71,7 @@ void if1(int a) { } } -// CIR: cir.func @_Z3if1i(%arg0: !s32i loc({{.*}})) +// CIR: cir.func{{.*}} @_Z3if1i(%arg0: !s32i loc({{.*}})) // CIR: cir.scope { // CIR: %3 = cir.load{{.*}} %0 : !cir.ptr, !s32i // CIR: %4 = cir.cast(int_to_bool, %3 : !s32i), !cir.bool @@ -84,7 +84,7 @@ void if1(int a) { // CIR-NEXT: } // CIR: } -// LLVM: define void @_Z3if1i(i32 %0) +// LLVM: define{{.*}} void @_Z3if1i(i32 %0) // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 %0, ptr %[[A]], align 4 @@ -105,7 +105,7 @@ void if1(int a) { // LLVM: [[EXIT]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3if1i(i32 noundef %[[A:.*]]) +// OGCG: define{{.*}} void @_Z3if1i(i32 noundef %[[A:.*]]) // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[X:.*]] = alloca i32, align 4 @@ -138,7 +138,7 @@ void if2(int a, bool b, bool c) { } } -// CIR: cir.func @_Z3if2ibb(%arg0: !s32i loc({{.*}}), %arg1: !cir.bool loc({{.*}}), %arg2: !cir.bool loc({{.*}})) +// CIR: cir.func{{.*}} @_Z3if2ibb(%arg0: !s32i loc({{.*}}), 
%arg1: !cir.bool loc({{.*}}), %arg2: !cir.bool loc({{.*}})) // CIR: cir.scope { // CIR: %5 = cir.load{{.*}} %0 : !cir.ptr, !s32i // CIR: %6 = cir.cast(int_to_bool, %5 : !s32i), !cir.bool @@ -165,7 +165,7 @@ void if2(int a, bool b, bool c) { // CIR: } // CIR: } -// LLVM: define void @_Z3if2ibb(i32 %[[A:.*]], i1 %[[B:.*]], i1 %[[C:.*]]) +// LLVM: define{{.*}} void @_Z3if2ibb(i32 %[[A:.*]], i1 %[[B:.*]], i1 %[[C:.*]]) // LLVM: %[[VARA:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[VARB:.*]] = alloca i8, i64 1, align 1 // LLVM: %[[VARC:.*]] = alloca i8, i64 1, align 1 @@ -214,7 +214,7 @@ void if2(int a, bool b, bool c) { // LLVM: [[LABEL28]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3if2ibb(i32 noundef %[[A:.*]], i1 noundef zeroext %[[B:.*]], i1 noundef zeroext %[[C:.*]]) +// OGCG: define{{.*}} void @_Z3if2ibb(i32 noundef %[[A:.*]], i1 noundef zeroext %[[B:.*]], i1 noundef zeroext %[[C:.*]]) // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[B_ADDR:.*]] = alloca i8, align 1 @@ -260,7 +260,7 @@ int if_init() { } } -// CIR: cir.func @_Z7if_initv() -> !s32i +// CIR: cir.func{{.*}} @_Z7if_initv() -> !s32i // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr // CIR: cir.scope { // CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr, @@ -285,7 +285,7 @@ int if_init() { // CIR: } // CIR: } -// LLVM: define i32 @_Z7if_initv() +// LLVM: define{{.*}} i32 @_Z7if_initv() // LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[RETVAL:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 42, ptr %[[X]], align 4 @@ -305,7 +305,7 @@ int if_init() { // LLVM: %[[RETVAL_LOAD2:.*]] = load i32, ptr %[[RETVAL]], align 4 // LLVM: ret i32 %[[RETVAL_LOAD2]] -// OGCG: define dso_local noundef i32 @_Z7if_initv() +// OGCG: define{{.*}} i32 @_Z7if_initv() // OGCG: entry: // OGCG: %[[RETVAL:.*]] = alloca i32, align 4 // OGCG: %[[X:.*]] = alloca i32, align 4 diff --git a/clang/test/CIR/CodeGen/inline-cxx-func.cpp b/clang/test/CIR/CodeGen/inline-cxx-func.cpp index 
31d0255f18df9..d121daf816173 100644 --- a/clang/test/CIR/CodeGen/inline-cxx-func.cpp +++ b/clang/test/CIR/CodeGen/inline-cxx-func.cpp @@ -17,7 +17,7 @@ struct S { // LLVM: %struct.S = type { i32 } // OGCG: %struct.S = type { i32 } -// CIR: cir.func @_ZN1S10InlineFuncEv(%arg0: !cir.ptr {{.*}}) -> !s32i +// CIR: cir.func{{.*}} @_ZN1S10InlineFuncEv(%arg0: !cir.ptr {{.*}}) -> !s32i // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: %[[RET_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr, !cir.ptr> @@ -46,7 +46,7 @@ void use() { s.InlineFunc(); } -// CIR: cir.func @_Z3usev() +// CIR: cir.func{{.*}} @_Z3usev() // CIR: %[[S_ADDR:.*]] = cir.alloca !rec_S, !cir.ptr, ["s"] // CIR: %[[RET_VAL:.*]] = cir.call @_ZN1S10InlineFuncEv(%[[S_ADDR]]) : (!cir.ptr) -> !s32i // CIR: cir.return diff --git a/clang/test/CIR/CodeGen/int-to-bool.cpp b/clang/test/CIR/CodeGen/int-to-bool.cpp index 1dd15edbbf55a..ad36af4552c2f 100644 --- a/clang/test/CIR/CodeGen/int-to-bool.cpp +++ b/clang/test/CIR/CodeGen/int-to-bool.cpp @@ -9,14 +9,14 @@ bool f1(unsigned char c) { return c; } -// CIR: cir.func @_Z2f1h +// CIR: cir.func{{.*}} @_Z2f1h // CIR: cir.cast(int_to_bool, %{{.*}} : !u8i), !cir.bool // Note: The full zext/store/load/trunc sequence is checked here to show what // CIR is being lowered to. There's no need to check it for every function since // the lowering is the same for all of them. 
-// LLVM: define i1 @_Z2f1h +// LLVM: define{{.*}} i1 @_Z2f1h // LLVM: %[[CMP:.*]] = icmp ne i8 %4, 0 // LLVM: %[[ZEXT:.*]] = zext i1 %[[CMP]] to i8 // LLVM: store i8 %[[ZEXT]], ptr %{{.*}} @@ -32,10 +32,10 @@ bool f2(short s) { return s; } -// CIR: cir.func @_Z2f2s +// CIR: cir.func{{.*}} @_Z2f2s // CIR: cir.cast(int_to_bool, %{{.*}} : !s16i), !cir.bool -// LLVM: define i1 @_Z2f2s +// LLVM: define{{.*}} i1 @_Z2f2s // LLVM: %[[CMP:.*]] = icmp ne i16 %4, 0 // LLVM: %[[ZEXT:.*]] = zext i1 %[[CMP]] to i8 @@ -47,10 +47,10 @@ bool f3(unsigned u) { return u; } -// CIR: cir.func @_Z2f3j +// CIR: cir.func{{.*}} @_Z2f3j // CIR: cir.cast(int_to_bool, %{{.*}} : !u32i), !cir.bool -// LLVM: define i1 @_Z2f3j +// LLVM: define{{.*}} i1 @_Z2f3j // LLVM: %[[CMP:.*]] = icmp ne i32 %4, 0 // LLVM: %[[ZEXT:.*]] = zext i1 %[[CMP]] to i8 @@ -62,10 +62,10 @@ bool f4(long l) { return l; } -// CIR: cir.func @_Z2f4l +// CIR: cir.func{{.*}} @_Z2f4l // CIR: cir.cast(int_to_bool, %{{.*}} : !s64i), !cir.bool -// LLVM: define i1 @_Z2f4l +// LLVM: define{{.*}} i1 @_Z2f4l // LLVM: %[[CMP:.*]] = icmp ne i64 %4, 0 // LLVM: %[[ZEXT:.*]] = zext i1 %[[CMP]] to i8 diff --git a/clang/test/CIR/CodeGen/linkage-spec.cpp b/clang/test/CIR/CodeGen/linkage-spec.cpp index 01c4e3fbe181d..eb6c7b0a546a9 100644 --- a/clang/test/CIR/CodeGen/linkage-spec.cpp +++ b/clang/test/CIR/CodeGen/linkage-spec.cpp @@ -1,42 +1,42 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - 2>&1 | FileCheck %s extern "C" void TopLevelC(){} -// CHECK: cir.func @TopLevelC() { +// CHECK: cir.func{{.*}} @TopLevelC() { extern "C++" void TopLevelCpp(){} -// CHECK: cir.func @_Z11TopLevelCppv() { +// CHECK: cir.func{{.*}} @_Z11TopLevelCppv() { extern "C++" { void ExternCppEmpty(){} - // CHECK: cir.func @_Z14ExternCppEmptyv() { + // CHECK: cir.func{{.*}} @_Z14ExternCppEmptyv() { extern "C" void ExternCpp_C(){} - // CHECK: cir.func @ExternCpp_C() { + // CHECK: cir.func{{.*}} @ExternCpp_C() { extern "C++" void 
ExternCpp_Cpp(){} - // CHECK: cir.func @_Z13ExternCpp_Cppv() { + // CHECK: cir.func{{.*}} @_Z13ExternCpp_Cppv() { extern "C" { void ExternCpp_CEmpty(){} - // CHECK: cir.func @ExternCpp_CEmpty() { + // CHECK: cir.func{{.*}} @ExternCpp_CEmpty() { extern "C" void ExternCpp_C_C(){} - // CHECK: cir.func @ExternCpp_C_C() { + // CHECK: cir.func{{.*}} @ExternCpp_C_C() { extern "C++" void ExternCpp_C_Cpp(){} - // CHECK: cir.func @_Z15ExternCpp_C_Cppv() { + // CHECK: cir.func{{.*}} @_Z15ExternCpp_C_Cppv() { } } extern "C" { void ExternCEmpty(){} - // CHECK: cir.func @ExternCEmpty() { + // CHECK: cir.func{{.*}} @ExternCEmpty() { extern "C" void ExternC_C(){} - // CHECK: cir.func @ExternC_C() { + // CHECK: cir.func{{.*}} @ExternC_C() { extern "C++" void ExternC_Cpp(){} - // CHECK: cir.func @_Z11ExternC_Cppv() { + // CHECK: cir.func{{.*}} @_Z11ExternC_Cppv() { extern "C++" { void ExternC_CppEmpty(){} - // CHECK: cir.func @_Z16ExternC_CppEmptyv() { + // CHECK: cir.func{{.*}} @_Z16ExternC_CppEmptyv() { extern "C" void ExternC_Cpp_C(){} - // CHECK: cir.func @ExternC_Cpp_C() { + // CHECK: cir.func{{.*}} @ExternC_Cpp_C() { extern "C++" void ExternC_Cpp_Cpp(){} - // CHECK: cir.func @_Z15ExternC_Cpp_Cppv() { + // CHECK: cir.func{{.*}} @_Z15ExternC_Cpp_Cppv() { } } diff --git a/clang/test/CIR/CodeGen/local-vars.cpp b/clang/test/CIR/CodeGen/local-vars.cpp index 42d6433645354..9385fdfa65601 100644 --- a/clang/test/CIR/CodeGen/local-vars.cpp +++ b/clang/test/CIR/CodeGen/local-vars.cpp @@ -22,7 +22,7 @@ void test() { } // CHECK: module -// CHECK: cir.func @_Z4testv() +// CHECK: cir.func{{.*}} @_Z4testv() // CHECK: %[[I_PTR:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CHECK: %[[L_PTR:.*]] = cir.alloca !s64i, !cir.ptr, ["l", init] {alignment = 8 : i64} // CHECK: %[[F_PTR:.*]] = cir.alloca !cir.float, !cir.ptr, ["f", init] {alignment = 4 : i64} diff --git a/clang/test/CIR/CodeGen/loop.cpp b/clang/test/CIR/CodeGen/loop.cpp index ba117c54b743d..0eba0bbc97c15 100644 
--- a/clang/test/CIR/CodeGen/loop.cpp +++ b/clang/test/CIR/CodeGen/loop.cpp @@ -10,7 +10,7 @@ void l0() { } } -// CIR: cir.func @_Z2l0v +// CIR: cir.func{{.*}} @_Z2l0v // CIR: cir.scope { // CIR: cir.for : cond { // CIR: %[[TRUE:.*]] = cir.const #true @@ -24,7 +24,7 @@ void l0() { // CIR: cir.return // CIR: } -// LLVM: define void @_Z2l0v() +// LLVM: define{{.*}} void @_Z2l0v() // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: // LLVM: br label %[[LABEL2:.*]] @@ -50,7 +50,7 @@ void l1() { } } -// CIR: cir.func @_Z2l1v +// CIR: cir.func{{.*}} @_Z2l1v // CIR-NEXT: cir.scope { // CIR-NEXT: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i @@ -67,7 +67,7 @@ void l1() { // CIR-NEXT: cir.return // CIR-NEXT: } -// LLVM: define void @_Z2l1v() +// LLVM: define{{.*}} void @_Z2l1v() // LLVM: %[[I:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -98,7 +98,7 @@ void l2() { } } -// CIR: cir.func @_Z2l2v +// CIR: cir.func{{.*}} @_Z2l2v // CIR-NEXT: cir.scope { // CIR-NEXT: cir.for : cond { // CIR-NEXT: %[[TRUE:.*]] = cir.const #true @@ -117,7 +117,7 @@ void l2() { // CIR-NEXT: cir.return // CIR-NEXT: } -// LLVM: define void @_Z2l2v() +// LLVM: define{{.*}} void @_Z2l2v() // LLVM: %[[I:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -148,7 +148,7 @@ void l3() { int i = 0; } -// CIR: cir.func @_Z2l3v +// CIR: cir.func{{.*}} @_Z2l3v // CIR-NEXT: cir.scope { // CIR-NEXT: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CIR-NEXT: cir.for : cond { @@ -165,7 +165,7 @@ void l3() { // CIR-NEXT: cir.return // CIR-NEXT: } -// LLVM: define void @_Z2l3v() +// LLVM: define{{.*}} void @_Z2l3v() // LLVM: %[[I:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -196,7 +196,7 @@ void l4() { ; } -// CIR: cir.func @_Z2l4v +// CIR: cir.func{{.*}} @_Z2l4v // CIR: 
%[[A_ADDR:.*]] = cir.alloca {{.*}} ["a"] // CIR: cir.scope { // CIR: %[[RANGE_ADDR:.*]] = cir.alloca {{.*}} ["__range1", init, const] @@ -231,7 +231,7 @@ void l4() { // CIR: } // CIR: } -// LLVM: define void @_Z2l4v() { +// LLVM: define{{.*}} void @_Z2l4v() { // LLVM: %[[RANGE_ADDR:.*]] = alloca ptr // LLVM: %[[BEGIN_ADDR:.*]] = alloca ptr // LLVM: %[[END_ADDR:.*]] = alloca ptr @@ -305,7 +305,7 @@ void l5() { for (int arr[]{1,2,3,4}; auto x : arr) {} } -// CIR: cir.func @_Z2l5v +// CIR: cir.func{{.*}} @_Z2l5v // CIR: cir.scope { // CIR: %[[ARR_ADDR:.*]] = cir.alloca {{.*}} ["arr", init] // CIR: %[[RANGE_ADDR:.*]] = cir.alloca {{.*}} ["__range1", init, const] @@ -355,7 +355,7 @@ void l5() { // CIR: } // CIR: } -// LLVM: define void @_Z2l5v() { +// LLVM: define{{.*}} void @_Z2l5v() { // LLVM: %[[ARR_ADDR:.*]] = alloca [4 x i32] // LLVM: %[[RANGE_ADDR:.*]] = alloca ptr // LLVM: %[[BEGIN_ADDR:.*]] = alloca ptr @@ -439,7 +439,7 @@ void test_do_while_false() { } while (0); } -// CIR: cir.func @_Z19test_do_while_falsev() +// CIR: cir.func{{.*}} @_Z19test_do_while_falsev() // CIR-NEXT: cir.scope { // CIR-NEXT: cir.do { // CIR-NEXT: cir.yield @@ -448,7 +448,7 @@ void test_do_while_false() { // CIR-NEXT: %[[FALSE:.*]] = cir.cast(int_to_bool, %[[ZERO]] : !s32i), !cir.bool // CIR-NEXT: cir.condition(%[[FALSE]]) -// LLVM: define void @_Z19test_do_while_falsev() +// LLVM: define{{.*}} void @_Z19test_do_while_falsev() // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: // LLVM: br label %[[LABEL3:.*]] @@ -475,7 +475,7 @@ void test_empty_while_true() { } } -// CIR: cir.func @_Z21test_empty_while_truev() +// CIR: cir.func{{.*}} @_Z21test_empty_while_truev() // CIR-NEXT: cir.scope { // CIR-NEXT: cir.while { // CIR-NEXT: %[[TRUE:.*]] = cir.const #true @@ -486,7 +486,7 @@ void test_empty_while_true() { // CIR-NEXT: } // CIR-NEXT: cir.yield -// LLVM: define void @_Z21test_empty_while_truev() +// LLVM: define{{.*}} void @_Z21test_empty_while_truev() // LLVM: br label %[[LABEL1:.*]] // 
LLVM: [[LABEL1]]: // LLVM: br label %[[LABEL2:.*]] @@ -517,7 +517,7 @@ void unreachable_after_continue() { } } -// CIR: cir.func @_Z26unreachable_after_continuev() +// CIR: cir.func{{.*}} @_Z26unreachable_after_continuev() // CIR: cir.scope { // CIR: cir.for : cond { // CIR: %[[TRUE:.*]] = cir.const #true @@ -539,7 +539,7 @@ void unreachable_after_continue() { // CIR: cir.return // CIR: } -// LLVM: define void @_Z26unreachable_after_continuev() +// LLVM: define{{.*}} void @_Z26unreachable_after_continuev() // LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -577,7 +577,7 @@ void unreachable_after_break() { } } -// CIR: cir.func @_Z23unreachable_after_breakv() +// CIR: cir.func{{.*}} @_Z23unreachable_after_breakv() // CIR: cir.scope { // CIR: cir.for : cond { // CIR: %[[TRUE:.*]] = cir.const #true @@ -599,7 +599,7 @@ void unreachable_after_break() { // CIR: cir.return // CIR: } -// LLVM: define void @_Z23unreachable_after_breakv() +// LLVM: define{{.*}} void @_Z23unreachable_after_breakv() // LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: diff --git a/clang/test/CIR/CodeGen/member-functions.cpp b/clang/test/CIR/CodeGen/member-functions.cpp index c1d49ac4d8f3a..8be2c7fc2edbe 100644 --- a/clang/test/CIR/CodeGen/member-functions.cpp +++ b/clang/test/CIR/CodeGen/member-functions.cpp @@ -10,7 +10,7 @@ struct C { void C::f() {} -// CIR: cir.func @_ZN1C1fEv(%[[THIS_ARG:.*]]: !cir.ptr +// CIR: cir.func{{.*}} @_ZN1C1fEv(%[[THIS_ARG:.*]]: !cir.ptr // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] : !cir.ptr, !cir.ptr> // CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr @@ -19,7 +19,7 @@ void C::f() {} void C::f2(int a, int b) {} -// CIR: cir.func @_ZN1C2f2Eii(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}, %[[A_ARG:.*]]: !s32i {{.*}}, %[[B_ARG:.*]]: !s32i {{.*}}) { +// CIR: cir.func{{.*}} 
@_ZN1C2f2Eii(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}, %[[A_ARG:.*]]: !s32i {{.*}}, %[[B_ARG:.*]]: !s32i {{.*}}) { // CIR-NEXT: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR-NEXT: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR-NEXT: %[[B_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] @@ -36,7 +36,7 @@ void test1() { c.f2(1, 2); } -// CIR: cir.func @_Z5test1v() { +// CIR: cir.func{{.*}} @_Z5test1v() { // CIR-NEXT: %[[C_ADDR:.*]] = cir.alloca !rec_C, !cir.ptr, ["c"] // CIR-NEXT: cir.call @_ZN1C1fEv(%[[C_ADDR]]) : (!cir.ptr) -> () // CIR-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i diff --git a/clang/test/CIR/CodeGen/namespace.cpp b/clang/test/CIR/CodeGen/namespace.cpp index cf02673c07787..efae1f2f2f236 100644 --- a/clang/test/CIR/CodeGen/namespace.cpp +++ b/clang/test/CIR/CodeGen/namespace.cpp @@ -23,9 +23,9 @@ namespace test { // CHECK-DAG: cir.global "private" internal dso_local @_ZN12_GLOBAL__N_12g1E = #cir.int<1> : !s32i // CHECK-DAG: cir.global external @_ZN4test2g2E = #cir.int<2> : !s32i // CHECK-DAG: cir.global external @_ZN4test5test22g3E = #cir.int<3> : !s32i -// CHECK-DAG: cir.func @_ZN12_GLOBAL__N_12f1Ev() -// CHECK-DAG: cir.func @_ZN4test2f2Ev() -// CHECK-DAG: cir.func @_ZN4test5test22f3Ev() +// CHECK-DAG: cir.func{{.*}} @_ZN12_GLOBAL__N_12f1Ev() +// CHECK-DAG: cir.func{{.*}} @_ZN4test2f2Ev() +// CHECK-DAG: cir.func{{.*}} @_ZN4test5test22f3Ev() using namespace test; @@ -38,7 +38,7 @@ int f4(void) { } // The namespace gets added during name mangling, so this is wrong but expected. 
-// CHECK: cir.func @_Z2f4v() +// CHECK: cir.func{{.*}} @_Z2f4v() // CHECK: cir.call @_ZN12_GLOBAL__N_12f1Ev() // CHECK: cir.call @_ZN4test2f2Ev() // CHECK: cir.call @_ZN4test5test22f3Ev() @@ -59,7 +59,7 @@ int f5() { return g3; } -// CHECK: cir.func @_Z2f5v() +// CHECK: cir.func{{.*}} @_Z2f5v() // CHECK: cir.call @_ZN4test5test22f3Ev() // CHECK: %[[G3_ADDR:.*]] = cir.get_global @_ZN4test5test22g3E : !cir.ptr // CHECK: %[[G3_VAL:.*]] = cir.load{{.*}} %[[G3_ADDR]] : !cir.ptr, !s32i @@ -76,7 +76,7 @@ int f6() { return s.a; } -// CHECK: cir.func @_Z2f6v() +// CHECK: cir.func{{.*}} @_Z2f6v() // CHECK: cir.get_global @_ZN5test31sE : !cir.ptr // CHECK: cir.get_member %{{.*}}[0] {name = "a"} @@ -92,4 +92,4 @@ void f7() { shadow::shadowedFunc(); } -// CHECK: cir.func @_Z2f7v() +// CHECK: cir.func{{.*}} @_Z2f7v() diff --git a/clang/test/CIR/CodeGen/nullptr-init.cpp b/clang/test/CIR/CodeGen/nullptr-init.cpp index 76965ce78469e..091269d09c985 100644 --- a/clang/test/CIR/CodeGen/nullptr-init.cpp +++ b/clang/test/CIR/CodeGen/nullptr-init.cpp @@ -11,7 +11,7 @@ void t1() { int *p3 = (int*)0; } -// CIR: cir.func @_Z2t1v() +// CIR: cir.func{{.*}} @_Z2t1v() // CIR-NEXT: %[[P1:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p1", init] {alignment = 8 : i64} // CIR-NEXT: %[[P2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p2", init] {alignment = 8 : i64} // CIR-NEXT: %[[P3:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p3", init] {alignment = 8 : i64} diff --git a/clang/test/CIR/CodeGen/string-literals.c b/clang/test/CIR/CodeGen/string-literals.c index be9622f9abe27..44fd191173c33 100644 --- a/clang/test/CIR/CodeGen/string-literals.c +++ b/clang/test/CIR/CodeGen/string-literals.c @@ -33,10 +33,10 @@ char *f1() { return "1"; } -// CIR: cir.func @f1() +// CIR: cir.func{{.*}} @f1() // CIR: %[[STR:.*]] = cir.get_global @[[STR1_GLOBAL]] : !cir.ptr> -// LLVM: define ptr @f1() +// LLVM: define{{.*}} ptr @f1() // LLVM: store ptr @[[STR1_GLOBAL]], ptr {{.*}} // OGCG: define {{.*}}ptr @f1() @@ -46,24 +46,24 @@ 
char *f2() { return ""; } -// CIR: cir.func @f2() +// CIR: cir.func{{.*}} @f2() // CIR: %[[STR2:.*]] = cir.get_global @[[STR2_GLOBAL]] : !cir.ptr> -// LLVM: define ptr @f2() +// LLVM: define{{.*}} ptr @f2() // LLVM: store ptr @[[STR2_GLOBAL]], ptr {{.*}} -// OGCG: define {{.*}}ptr @f2() +// OGCG: define{{.*}} ptr @f2() // OGCG: ret ptr @[[STR2_GLOBAL]] char *f3() { return "\00"; } -// CIR: cir.func @f3() +// CIR: cir.func{{.*}} @f3() // CIR: %[[STR3:.*]] = cir.get_global @[[STR3_GLOBAL]] : !cir.ptr> -// LLVM: define ptr @f3() +// LLVM: define{{.*}} ptr @f3() // LLVM: store ptr @[[STR3_GLOBAL]], ptr {{.*}} -// OGCG: define {{.*}}ptr @f3() +// OGCG: define{{.*}} ptr @f3() // OGCG: ret ptr @[[STR3_GLOBAL]] diff --git a/clang/test/CIR/CodeGen/struct.c b/clang/test/CIR/CodeGen/struct.c index b722b64eeb583..aa7c4cf1c295a 100644 --- a/clang/test/CIR/CodeGen/struct.c +++ b/clang/test/CIR/CodeGen/struct.c @@ -166,11 +166,11 @@ void f(void) { struct IncompleteS *p; } -// CIR: cir.func @f() +// CIR: cir.func{{.*}} @f() // CIR-NEXT: cir.alloca !cir.ptr, !cir.ptr>, ["p"] {alignment = 8 : i64} // CIR-NEXT: cir.return -// LLVM: define void @f() +// LLVM: define{{.*}} void @f() // LLVM-NEXT: %[[P:.*]] = alloca ptr, i64 1, align 8 // LLVM-NEXT: ret void @@ -183,11 +183,11 @@ void f2(void) { struct CompleteS s; } -// CIR: cir.func @f2() +// CIR: cir.func{{.*}} @f2() // CIR-NEXT: cir.alloca !rec_CompleteS, !cir.ptr, ["s"] {alignment = 4 : i64} // CIR-NEXT: cir.return -// LLVM: define void @f2() +// LLVM: define{{.*}} void @f2() // LLVM-NEXT: %[[S:.*]] = alloca %struct.CompleteS, i64 1, align 4 // LLVM-NEXT: ret void @@ -201,7 +201,7 @@ char f3(int a) { return cs.b; } -// CIR: cir.func @f3(%[[ARG_A:.*]]: !s32i +// CIR: cir.func{{.*}} @f3(%[[ARG_A:.*]]: !s32i // CIR-NEXT: %[[A_ADDR:.*]] = cir.alloca {{.*}} ["a", init] {alignment = 4 : i64} // CIR-NEXT: %[[RETVAL_ADDR:.*]] = cir.alloca {{.*}} ["__retval"] {alignment = 1 : i64} // CIR-NEXT: cir.store{{.*}} %[[ARG_A]], %[[A_ADDR]] @@ 
-216,7 +216,7 @@ char f3(int a) { // CIR-NEXT: %[[RETVAL:.*]] = cir.load{{.*}} %[[RETVAL_ADDR]] // CIR-NEXT: cir.return %[[RETVAL]] -// LLVM: define i8 @f3(i32 %[[ARG_A:.*]]) +// LLVM: define{{.*}} i8 @f3(i32 %[[ARG_A:.*]]) // LLVM-NEXT: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[RETVAL_ADDR:.*]] = alloca i8, i64 1, align 1 // LLVM-NEXT: store i32 %[[ARG_A]], ptr %[[A_ADDR]], align 4 @@ -241,7 +241,7 @@ char f4(int a, struct CompleteS *p) { return p->b; } -// CIR: cir.func @f4(%[[ARG_A:.*]]: !s32i {{.*}}, %[[ARG_P:.*]]: !cir.ptr +// CIR: cir.func{{.*}} @f4(%[[ARG_A:.*]]: !s32i {{.*}}, %[[ARG_P:.*]]: !cir.ptr // CIR-NEXT: %[[A_ADDR:.*]] = cir.alloca {{.*}} ["a", init] {alignment = 4 : i64} // CIR-NEXT: %[[P_ADDR:.*]] = cir.alloca {{.*}} ["p", init] {alignment = 8 : i64} // CIR-NEXT: %[[RETVAL_ADDR:.*]] = cir.alloca {{.*}} ["__retval"] {alignment = 1 : i64} @@ -258,7 +258,7 @@ char f4(int a, struct CompleteS *p) { // CIR-NEXT: %[[RETVAL:.*]] = cir.load{{.*}} %[[RETVAL_ADDR]] // CIR-NEXT: cir.return %[[RETVAL]] -// LLVM: define i8 @f4(i32 %[[ARG_A:.*]], ptr %[[ARG_P:.*]]) +// LLVM: define{{.*}} i8 @f4(i32 %[[ARG_A:.*]], ptr %[[ARG_P:.*]]) // LLVM-NEXT: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[P_ADDR:.*]] = alloca ptr, i64 1, align 8 // LLVM-NEXT: %[[RETVAL_ADDR:.*]] = alloca i8, i64 1, align 1 @@ -294,7 +294,7 @@ void f5(struct NodeS* a) { a->next = 0; } -// CIR: cir.func @f5 +// CIR: cir.func{{.*}} @f5 // CIR: %[[NEXT:.*]] = cir.get_member {{%.}}[0] {name = "next"} : !cir.ptr -> !cir.ptr> // CIR: cir.store {{.*}}, %[[NEXT]] @@ -312,7 +312,7 @@ void f6(struct CycleStart *start) { struct CycleStart *start2 = end->start; } -// CIR: cir.func @f6 +// CIR: cir.func{{.*}} @f6 // CIR: %[[MIDDLE:.*]] = cir.get_member {{.*}}[0] {name = "middle"} : !cir.ptr -> !cir.ptr> // CIR: %[[END:.*]] = cir.get_member %{{.*}}[0] {name = "end"} : !cir.ptr -> !cir.ptr> // CIR: %[[START2:.*]] = cir.get_member %{{.*}}[0] {name = "start"} : !cir.ptr -> 
!cir.ptr> diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index c8406f811a462..ee6c4cab7341f 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -27,11 +27,11 @@ void f(void) { IncompleteS *p; } -// CIR: cir.func @_Z1fv() +// CIR: cir.func{{.*}} @_Z1fv() // CIR-NEXT: cir.alloca !cir.ptr, !cir.ptr>, ["p"] // CIR-NEXT: cir.return -// LLVM: define void @_Z1fv() +// LLVM: define{{.*}} void @_Z1fv() // LLVM-NEXT: %[[P:.*]] = alloca ptr, i64 1, align 8 // LLVM-NEXT: ret void @@ -44,14 +44,14 @@ char f2(CompleteS &s) { return s.b; } -// CIR: cir.func @_Z2f2R9CompleteS(%[[ARG_S:.*]]: !cir.ptr{{.*}}) +// CIR: cir.func{{.*}} @_Z2f2R9CompleteS(%[[ARG_S:.*]]: !cir.ptr{{.*}}) // CIR: %[[S_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init, const] // CIR: cir.store %[[ARG_S]], %[[S_ADDR]] // CIR: %[[S_REF:.*]] = cir.load{{.*}} %[[S_ADDR]] // CIR: %[[S_ADDR2:.*]] = cir.get_member %[[S_REF]][1] {name = "b"} // CIR: %[[S_B:.*]] = cir.load{{.*}} %[[S_ADDR2]] -// LLVM: define i8 @_Z2f2R9CompleteS(ptr %[[ARG_S:.*]]) +// LLVM: define{{.*}} i8 @_Z2f2R9CompleteS(ptr %[[ARG_S:.*]]) // LLVM: %[[S_ADDR:.*]] = alloca ptr // LLVM: store ptr %[[ARG_S]], ptr %[[S_ADDR]] // LLVM: %[[S_REF:.*]] = load ptr, ptr %[[S_ADDR]], align 8 @@ -79,7 +79,7 @@ void f3() { o.i.n; } -// CIR: cir.func @_Z2f3v() +// CIR: cir.func{{.*}} @_Z2f3v() // CIR: %[[O:.*]] = cir.alloca !rec_Outer, !cir.ptr, ["o"] // CIR: %[[O_I:.*]] = cir.get_member %[[O]][0] {name = "i"} // CIR: %[[O_I_N:.*]] = cir.get_member %[[O_I]][0] {name = "n"} diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp index 8786c2350c192..e13aa8f4f4953 100644 --- a/clang/test/CIR/CodeGen/switch.cpp +++ b/clang/test/CIR/CodeGen/switch.cpp @@ -19,7 +19,7 @@ void sw1(int a) { } } -// CIR: cir.func @_Z3sw1i +// CIR: cir.func{{.*}} @_Z3sw1i // CIR: cir.switch (%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR: 
cir.break @@ -30,7 +30,7 @@ void sw1(int a) { // CIR: cir.alloca !s32i, !cir.ptr, ["yolo", init] // CIR: cir.break -// LLVM: define void @_Z3sw1i +// LLVM: define{{.*}} void @_Z3sw1i // LLVM: store i32 1, ptr %[[B_ADDR:.*]], align 4 // LLVM: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 // LLVM: br label %[[BB7:.*]] @@ -60,7 +60,7 @@ void sw1(int a) { // LLVM: [[DEFAULT]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw1i +// OGCG: define{{.*}} void @_Z3sw1i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[B:.*]] = alloca i32, align 4 @@ -97,7 +97,7 @@ void sw2(int a) { } } -// CIR: cir.func @_Z3sw2i +// CIR: cir.func{{.*}} @_Z3sw2i // CIR: cir.scope { // CIR-NEXT: %[[YOLO:.*]] = cir.alloca !s32i, !cir.ptr, ["yolo", init] // CIR-NEXT: %[[FOMO:.*]] = cir.alloca !s32i, !cir.ptr, ["fomo", init] @@ -106,7 +106,7 @@ void sw2(int a) { // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i // CIR-NEXT: cir.store{{.*}} %[[ZERO]], %[[FOMO]] : !s32i, !cir.ptr -// LLVM: define void @_Z3sw2i +// LLVM: define{{.*}} void @_Z3sw2i // LLVM: store i32 2, ptr %[[YOLO_ADDR:.*]], align 4 // LLVM: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 // LLVM: br label %[[SWITCH:.*]] @@ -126,7 +126,7 @@ void sw2(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw2i +// OGCG: define{{.*}} void @_Z3sw2i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[YOLO:.*]] = alloca i32, align 4 @@ -151,7 +151,7 @@ void sw3(int a) { } } -// CIR: cir.func @_Z3sw3i +// CIR: cir.func{{.*}} @_Z3sw3i // CIR: cir.scope { // CIR-NEXT: %[[COND:.*]] = cir.load{{.*}} %[[A:.*]] : !cir.ptr, !s32i // CIR-NEXT: cir.switch (%[[COND]] : !s32i) { @@ -161,7 +161,7 @@ void sw3(int a) { // CIR-NEXT: cir.yield // CIR-NEXT: } -// LLVM-LABEL: define void @_Z3sw3i +// LLVM-LABEL: define{{.*}} void @_Z3sw3i // LLVM: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 // LLVM: br label %[[SWITCH:.*]] // LLVM: [[SWITCH]]: @@ 
-174,7 +174,7 @@ void sw3(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw3i +// OGCG: define{{.*}} void @_Z3sw3i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -195,7 +195,7 @@ int sw4(int a) { return 0; } -// CIR: cir.func @_Z3sw4i +// CIR: cir.func{{.*}} @_Z3sw4i // CIR: cir.switch (%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<42> : !s32i]) { // CIR-NEXT: cir.scope { @@ -215,7 +215,7 @@ int sw4(int a) { // CIR-NEXT: cir.yield // CIR-NEXT: } -// LLVM: define i32 @_Z3sw4i +// LLVM: define{{.*}} i32 @_Z3sw4i // LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[RET_ADDR:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[ENTRY:.*]] @@ -241,7 +241,7 @@ int sw4(int a) { // LLVM: %[[RET0:.*]] = load i32, ptr %[[RET_ADDR]], align 4 // LLVM: ret i32 %[[RET0]] -// OGCG: define dso_local noundef i32 @_Z3sw4i +// OGCG: define{{.*}} i32 @_Z3sw4i // OGCG: entry: // OGCG: %[[RETVAL:.*]] = alloca i32, align 4 // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 @@ -263,7 +263,7 @@ void sw5(int a) { } } -// CIR: cir.func @_Z3sw5i +// CIR: cir.func{{.*}} @_Z3sw5i // CIR: cir.switch (%[[A:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { // CIR-NEXT: cir.yield @@ -271,7 +271,7 @@ void sw5(int a) { // CIR-NEXT: cir.yield // CIR-NEXT: } -// LLVM-LABEL: define void @_Z3sw5i +// LLVM-LABEL: define{{.*}} void @_Z3sw5i // LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[ENTRY:.*]] // LLVM: [[ENTRY]]: @@ -288,7 +288,7 @@ void sw5(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw5i +// OGCG: define{{.*}} void @_Z3sw5i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -313,7 +313,7 @@ void sw6(int a) { } } -// CIR: cir.func @_Z3sw6i +// CIR: cir.func{{.*}} @_Z3sw6i // CIR: cir.switch (%[[A:.*]] : !s32i) { 
// CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR-NEXT: cir.yield @@ -334,7 +334,7 @@ void sw6(int a) { // CIR-NEXT: cir.break // CIR-NEXT: } -// LLVM: define void @_Z3sw6i +// LLVM: define{{.*}} void @_Z3sw6i // LLVM: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 // LLVM: br label %[[SWITCH:.*]] // LLVM: [[SWITCH]]: @@ -371,7 +371,7 @@ void sw6(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw6i +// OGCG: define{{.*}} void @_Z3sw6i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: store i32 %a, ptr %[[A_ADDR]], align 4 @@ -404,7 +404,7 @@ void sw7(int a) { } } -// CIR: cir.func @_Z3sw7i +// CIR: cir.func{{.*}} @_Z3sw7i // CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr, ["x"] // CIR: cir.switch (%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { @@ -428,7 +428,7 @@ void sw7(int a) { // CIR-NEXT: cir.yield // CIR: } -// LLVM: define void @_Z3sw7i +// LLVM: define{{.*}} void @_Z3sw7i // LLVM: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 // LLVM: br label %[[SWITCH:.*]] // LLVM: [[SWITCH]]: @@ -467,7 +467,7 @@ void sw7(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw7i +// OGCG: define{{.*}} void @_Z3sw7i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -498,7 +498,7 @@ void sw8(int a) { } } -// CIR: cir.func @_Z3sw8i +// CIR: cir.func{{.*}} @_Z3sw8i // CIR: cir.switch (%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break @@ -510,7 +510,7 @@ void sw8(int a) { // CIR-NEXT: cir.break // CIR-NEXT: } -// LLVM: define void @_Z3sw8i +// LLVM: define{{.*}} void @_Z3sw8i // LLVM: switch i32 %[[COND:.*]], label %[[DEFAULT:.*]] [ // LLVM-DAG: i32 3, label %[[CASE3:.*]] // LLVM-DAG: i32 4, label %[[CASE4:.*]] @@ -528,7 +528,7 @@ void sw8(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw8i +// 
OGCG: define{{.*}} void @_Z3sw8i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -556,7 +556,7 @@ void sw9(int a) { } } -// CIR: cir.func @_Z3sw9i +// CIR: cir.func{{.*}} @_Z3sw9i // CIR: cir.switch (%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break @@ -568,7 +568,7 @@ void sw9(int a) { // CIR-NEXT: cir.break // CIR-NEXT: } -// LLVM: define void @_Z3sw9i +// LLVM: define{{.*}} void @_Z3sw9i // LLVM: switch i32 %[[COND:.*]], label %[[DEFAULT:.*]] [ // LLVM-DAG: i32 3, label %[[CASE3:.*]] // LLVM-DAG: i32 4, label %[[CASE4:.*]] @@ -586,7 +586,7 @@ void sw9(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z3sw9i +// OGCG: define{{.*}} void @_Z3sw9i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -615,7 +615,7 @@ void sw10(int a) { } } -// CIR: cir.func @_Z4sw10i +// CIR: cir.func{{.*}} @_Z4sw10i // CIR: cir.switch (%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break @@ -630,7 +630,7 @@ void sw10(int a) { // CIR-NEXT: cir.break // CIR-NEXT: } -// LLVM: define void @_Z4sw10i +// LLVM: define{{.*}} void @_Z4sw10i // LLVM: switch i32 %[[COND:.*]], label %[[DEFAULT:.*]] [ // LLVM-DAG: i32 3, label %[[CASE_3:.*]] // LLVM-DAG: i32 4, label %[[CASE_4:.*]] @@ -653,7 +653,7 @@ void sw10(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z4sw10i +// OGCG: define{{.*}} void @_Z4sw10i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -687,7 +687,7 @@ void sw11(int a) { } } -// CIR: cir.func @_Z4sw11i +// CIR: cir.func{{.*}} @_Z4sw11i // CIR: cir.switch (%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break @@ -708,7 +708,7 @@ void sw11(int a) { // CIR-NEXT: cir.break // 
CIR-NEXT: } -// LLVM: define void @_Z4sw11i +// LLVM: define{{.*}} void @_Z4sw11i // LLVM: switch i32 %[[COND:.*]], label %[[DEFAULT:.*]] [ // LLVM-DAG: i32 3, label %[[CASE_3:.*]] // LLVM-DAG: i32 4, label %[[CASE_4:.*]] @@ -741,7 +741,7 @@ void sw11(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z4sw11i +// OGCG: define{{.*}} void @_Z4sw11i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -772,7 +772,7 @@ void sw12(int a) { } } -// CIR: cir.func @_Z4sw12i +// CIR: cir.func{{.*}} @_Z4sw12i // CIR: cir.scope { // CIR: cir.switch // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { @@ -781,7 +781,7 @@ void sw12(int a) { // CIR-NEXT: cir.break // CIR-NEXT: } -// LLVM: define void @_Z4sw12i +// LLVM: define{{.*}} void @_Z4sw12i // LLVM: switch i32 %[[COND:.*]], label %[[EXIT:.*]] [ // LLVM-DAG: i32 3, label %[[CASE_3:.*]] // LLVM: ] @@ -794,7 +794,7 @@ void sw12(int a) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z4sw12i +// OGCG: define{{.*}} void @_Z4sw12i // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4 @@ -816,7 +816,7 @@ void sw13(int a, int b) { } } -// CIR: cir.func @_Z4sw13ii +// CIR: cir.func{{.*}} @_Z4sw13ii // CIR: cir.scope { // CIR: cir.switch // CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { @@ -832,7 +832,7 @@ void sw13(int a, int b) { // CIR: } // CIR: cir.return -// LLVM: define void @_Z4sw13ii +// LLVM: define{{.*}} void @_Z4sw13ii // LLVM: switch i32 %[[COND:.*]], label %[[OUTER_EXIT:.*]] [ // LLVM-DAG: i32 1, label %[[CASE_A_1:.*]] // LLVM: ] @@ -858,7 +858,7 @@ void sw13(int a, int b) { // LLVM: [[EXIT]]: // LLVM: ret void -// OGCG: define dso_local void @_Z4sw13ii +// OGCG: define{{.*}} void @_Z4sw13ii // OGCG: entry: // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[B_ADDR:.*]] = alloca i32, align 4 @@ -890,7 +890,7 @@ void 
sw14(int x) { } } -// CIR: cir.func @_Z4sw14i +// CIR: cir.func{{.*}} @_Z4sw14i // CIR: cir.switch // CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { // CIR-NEXT: cir.yield @@ -908,7 +908,7 @@ void sw14(int x) { // CIR-NEXT: cir.break // CIR-NEXT: } -// LLVM: define void @_Z4sw14i +// LLVM: define{{.*}} void @_Z4sw14i // LLVM: switch i32 %[[COND:.*]], label %[[DEFAULT:.*]] [ // LLVM-DAG: i32 1, label %[[CASE1:.*]] // LLVM-DAG: i32 2, label %[[CASE2:.*]] @@ -939,7 +939,7 @@ void sw14(int x) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z4sw14i +// OGCG: define{{.*}} void @_Z4sw14i // OGCG: entry: // OGCG: %[[X_ADDR:.*]] = alloca i32, align 4 // OGCG: store i32 %x, ptr %[[X_ADDR]], align 4 @@ -977,7 +977,7 @@ void sw15(int x) { } } -// CIR: cir.func @_Z4sw15i +// CIR: cir.func{{.*}} @_Z4sw15i // CIR: %[[Y:.*]] = cir.alloca !s32i, !cir.ptr, ["y"] // CIR: cir.switch // CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { @@ -995,7 +995,7 @@ void sw15(int x) { // CIR-NEXT: cir.break // CIR-NEXT: } -// LLVM: define void @_Z4sw15i +// LLVM: define{{.*}} void @_Z4sw15i // LLVM: switch i32 %[[COND:.*]], label %[[DEFAULT:.*]] [ // LLVM-DAG: i32 1, label %[[CASE1:.*]] // LLVM-DAG: i32 2, label %[[CASE2:.*]] @@ -1019,7 +1019,7 @@ void sw15(int x) { // LLVM: [[RET]]: // LLVM: ret void -// OGCG: define dso_local void @_Z4sw15i +// OGCG: define{{.*}} void @_Z4sw15i // OGCG: entry: // OGCG: %[[X_ADDR:.*]] = alloca i32, align 4 // OGCG: %[[Y:.*]] = alloca i32, align 4 @@ -1081,7 +1081,7 @@ int nested_switch(int a) { // CIR: cir.case(equal, [#cir.int<7> : !s32i]) { // CIR: cir.return -// LLVM: define i32 @_Z13nested_switchi +// LLVM: define{{.*}} i32 @_Z13nested_switchi // LLVM: %[[B_ADDR:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[RES_ADDR:.*]] = alloca i32, i64 1, align 4 @@ -1136,7 +1136,7 @@ int nested_switch(int a) { // LLVM: %[[RET0:.*]] = load i32, ptr %[[RES_ADDR]], align 4 // LLVM: ret i32 
%[[RET0]] -// OGCG: define dso_local noundef i32 @_Z13nested_switchi +// OGCG: define{{.*}} i32 @_Z13nested_switchi // OGCG: entry: // OGCG: %[[RETVAL:.*]] = alloca i32, align 4 // OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 diff --git a/clang/test/CIR/CodeGen/switch_flat_op.cpp b/clang/test/CIR/CodeGen/switch_flat_op.cpp index f917bd59f7ce3..a3ea7e7a15547 100644 --- a/clang/test/CIR/CodeGen/switch_flat_op.cpp +++ b/clang/test/CIR/CodeGen/switch_flat_op.cpp @@ -18,7 +18,7 @@ void swf(int a) { } -// BEFORE: cir.func @_Z3swfi +// BEFORE: cir.func{{.*}} @_Z3swfi // BEFORE: %[[VAR_B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} // BEFORE: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i // BEFORE: cir.switch (%[[COND:.*]] : !s32i) { @@ -44,7 +44,7 @@ void swf(int a) { // BEFORE: } // BEFORE: cir.return -// AFTER: cir.func @_Z3swfi +// AFTER: cir.func{{.*}} @_Z3swfi // AFTER: %[[VAR_A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} // AFTER: cir.store{{.*}} %arg0, %[[VAR_A]] : !s32i, !cir.ptr // AFTER: %[[VAR_B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} diff --git a/clang/test/CIR/CodeGen/ternary.cpp b/clang/test/CIR/CodeGen/ternary.cpp index 3b66f7ccdf54f..781286a94cc2e 100644 --- a/clang/test/CIR/CodeGen/ternary.cpp +++ b/clang/test/CIR/CodeGen/ternary.cpp @@ -9,7 +9,7 @@ int x(int y) { return y > 0 ? 
3 : 5; } -// CIR-LABEL: cir.func @_Z1xi( +// CIR-LABEL: cir.func{{.*}} @_Z1xi( // CIR-SAME: %[[ARG0:.*]]: !s32i {{.*}}) -> !s32i { // CIR: [[Y:%.+]] = cir.alloca !s32i, !cir.ptr, ["y", init] {alignment = 4 : i64} // CIR: [[RETVAL:%.+]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} @@ -24,7 +24,7 @@ int x(int y) { // CIR: [[RETVAL_VAL:%.+]] = cir.load [[RETVAL]] : !cir.ptr, !s32i // CIR: cir.return [[RETVAL_VAL]] : !s32i -// LLVM-LABEL: define i32 @_Z1xi( +// LLVM-LABEL: define{{.*}} i32 @_Z1xi( // LLVM-SAME: i32 %[[ARG0:.+]]) // LLVM: %[[Y:.*]] = alloca i32 // LLVM: %[[RETVAL:.*]] = alloca i32 @@ -36,7 +36,7 @@ int x(int y) { // LLVM: %[[RESULT:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RESULT]] -// OGCG-LABEL: define dso_local noundef i32 @_Z1xi( +// OGCG-LABEL: define{{.*}} i32 @_Z1xi( // OGCG-SAME: i32 {{.*}} %[[ARG0:.+]]) // OGCG: %[[Y:.*]] = alloca i32 // OGCG: store i32 %[[ARG0]], ptr %[[Y]] @@ -51,7 +51,7 @@ int foo(int a, int b) { return 0; } -// CIR-LABEL: cir.func @_Z3fooii( +// CIR-LABEL: cir.func{{.*}} @_Z3fooii( // CIR-SAME: %[[ARG0:.*]]: !s32i {{.*}}, %[[ARG1:.*]]: !s32i {{.*}}) -> !s32i { // CIR: [[A:%.+]] = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} // CIR: [[B:%.+]] = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} @@ -83,7 +83,7 @@ int foo(int a, int b) { // CIR: [[RETVAL_VAL2:%.+]] = cir.load [[RETVAL]] : !cir.ptr, !s32i // CIR: cir.return [[RETVAL_VAL2]] : !s32i -// LLVM-LABEL: define i32 @_Z3fooii( +// LLVM-LABEL: define{{.*}} i32 @_Z3fooii( // LLVM-SAME: i32 %[[ARG0:.*]], i32 %[[ARG1:.*]]) // LLVM: %[[A:.*]] = alloca i32 // LLVM: %[[B:.*]] = alloca i32 @@ -116,7 +116,7 @@ int foo(int a, int b) { // LLVM: %[[RET2:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RET2]] -// OGCG-LABEL: define dso_local noundef i32 @_Z3fooii( +// OGCG-LABEL: define{{.*}} i32 @_Z3fooii( // OGCG-SAME: i32 {{.*}} %[[ARG0:.*]], i32 {{.*}} %[[ARG1:.*]]) // OGCG: %[[RETVAL:.*]] = alloca i32 // OGCG: 
%[[A:.*]] = alloca i32 diff --git a/clang/test/CIR/CodeGen/typedef.c b/clang/test/CIR/CodeGen/typedef.c index a87e6ffb1843a..201df2e08ee2e 100644 --- a/clang/test/CIR/CodeGen/typedef.c +++ b/clang/test/CIR/CodeGen/typedef.c @@ -10,12 +10,12 @@ void local_typedef(void) { Struct s; } -// CIR: cir.func @local_typedef() +// CIR: cir.func{{.*}} @local_typedef() // CIR: cir.alloca !rec_Struct, !cir.ptr, ["s"] {alignment = 4 : i64} // CIR: cir.return // LLVM: %struct.Struct = type { i32 } -// LLVM: define void @local_typedef() +// LLVM: define{{.*}} void @local_typedef() // LLVM: alloca %struct.Struct, i64 1, align 4 // LLVM: ret void diff --git a/clang/test/CIR/CodeGen/unary.cpp b/clang/test/CIR/CodeGen/unary.cpp index 0633cc3fd8e15..a7c946eaffd03 100644 --- a/clang/test/CIR/CodeGen/unary.cpp +++ b/clang/test/CIR/CodeGen/unary.cpp @@ -10,12 +10,12 @@ unsigned up0() { return +a; } -// CHECK: cir.func @_Z3up0v() -> !u32i +// CHECK: cir.func{{.*}} @_Z3up0v() -> !u32i // CHECK: %[[A:.*]] = cir.alloca !u32i, !cir.ptr, ["a", init] // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[OUTPUT:.*]] = cir.unary(plus, %[[INPUT]]) -// LLVM: define i32 @_Z3up0v() +// LLVM: define{{.*}} i32 @_Z3up0v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 1, ptr %[[A]], align 4 @@ -31,12 +31,12 @@ unsigned um0() { return -a; } -// CHECK: cir.func @_Z3um0v() -> !u32i +// CHECK: cir.func{{.*}} @_Z3um0v() -> !u32i // CHECK: %[[A:.*]] = cir.alloca !u32i, !cir.ptr, ["a", init] // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[OUTPUT:.*]] = cir.unary(minus, %[[INPUT]]) -// LLVM: define i32 @_Z3um0v() +// LLVM: define{{.*}} i32 @_Z3um0v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 1, ptr %[[A]], align 4 @@ -54,12 +54,12 @@ unsigned un0() { return ~a; // a ^ -1 , not } -// CHECK: cir.func @_Z3un0v() -> !u32i +// CHECK: cir.func{{.*}} 
@_Z3un0v() -> !u32i // CHECK: %[[A:.*]] = cir.alloca !u32i, !cir.ptr, ["a", init] // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[OUTPUT:.*]] = cir.unary(not, %[[INPUT]]) -// LLVM: define i32 @_Z3un0v() +// LLVM: define{{.*}} i32 @_Z3un0v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 1, ptr %[[A]], align 4 @@ -78,7 +78,7 @@ int inc0() { return a; } -// CHECK: cir.func @_Z4inc0v() -> !s32i +// CHECK: cir.func{{.*}} @_Z4inc0v() -> !s32i // CHECK: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i @@ -87,7 +87,7 @@ int inc0() { // CHECK: cir.store{{.*}} %[[INCREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]] -// LLVM: define i32 @_Z4inc0v() +// LLVM: define{{.*}} i32 @_Z4inc0v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 1, ptr %[[A]], align 4 @@ -106,7 +106,7 @@ int dec0() { return a; } -// CHECK: cir.func @_Z4dec0v() -> !s32i +// CHECK: cir.func{{.*}} @_Z4dec0v() -> !s32i // CHECK: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i @@ -115,7 +115,7 @@ int dec0() { // CHECK: cir.store{{.*}} %[[DECREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]] -// LLVM: define i32 @_Z4dec0v() +// LLVM: define{{.*}} i32 @_Z4dec0v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 1, ptr %[[A]], align 4 @@ -134,7 +134,7 @@ int inc1() { return a; } -// CHECK: cir.func @_Z4inc1v() -> !s32i +// CHECK: cir.func{{.*}} @_Z4inc1v() -> !s32i // CHECK: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i @@ 
-143,7 +143,7 @@ int inc1() { // CHECK: cir.store{{.*}} %[[INCREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]] -// LLVM: define i32 @_Z4inc1v() +// LLVM: define{{.*}} i32 @_Z4inc1v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 1, ptr %[[A]], align 4 @@ -162,7 +162,7 @@ int dec1() { return a; } -// CHECK: cir.func @_Z4dec1v() -> !s32i +// CHECK: cir.func{{.*}} @_Z4dec1v() -> !s32i // CHECK: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i @@ -171,7 +171,7 @@ int dec1() { // CHECK: cir.store{{.*}} %[[DECREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]] -// LLVM: define i32 @_Z4dec1v() +// LLVM: define{{.*}} i32 @_Z4dec1v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: store i32 1, ptr %[[A]], align 4 @@ -191,7 +191,7 @@ int inc2() { return b; } -// CHECK: cir.func @_Z4inc2v() -> !s32i +// CHECK: cir.func{{.*}} @_Z4inc2v() -> !s32i // CHECK: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CHECK: %[[B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i @@ -202,7 +202,7 @@ int inc2() { // CHECK: cir.store{{.*}} %[[ATOB]], %[[B]] // CHECK: %[[B_TO_OUTPUT:.*]] = cir.load{{.*}} %[[B]] -// LLVM: define i32 @_Z4inc2v() +// LLVM: define{{.*}} i32 @_Z4inc2v() // LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[A:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[B:.*]] = alloca i32, i64 1, align 4 @@ -228,12 +228,12 @@ float fpPlus() { return +a; } -// CHECK: cir.func @_Z6fpPlusv() -> !cir.float +// CHECK: cir.func{{.*}} @_Z6fpPlusv() -> !cir.float // CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[OUTPUT:.*]] = cir.unary(plus, %[[INPUT]]) -// LLVM: define 
float @_Z6fpPlusv() +// LLVM: define{{.*}} float @_Z6fpPlusv() // LLVM: %[[RV:.*]] = alloca float, i64 1, align 4 // LLVM: %[[A:.*]] = alloca float, i64 1, align 4 // LLVM: store float 1.000000e+00, ptr %[[A]], align 4 @@ -249,12 +249,12 @@ float fpMinus() { return -a; } -// CHECK: cir.func @_Z7fpMinusv() -> !cir.float +// CHECK: cir.func{{.*}} @_Z7fpMinusv() -> !cir.float // CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[OUTPUT:.*]] = cir.unary(minus, %[[INPUT]]) -// LLVM: define float @_Z7fpMinusv() +// LLVM: define{{.*}} float @_Z7fpMinusv() // LLVM: %[[RV:.*]] = alloca float, i64 1, align 4 // LLVM: %[[A:.*]] = alloca float, i64 1, align 4 // LLVM: store float 1.000000e+00, ptr %[[A]], align 4 @@ -272,14 +272,14 @@ float fpPreInc() { return ++a; } -// CHECK: cir.func @_Z8fpPreIncv() -> !cir.float +// CHECK: cir.func{{.*}} @_Z8fpPreIncv() -> !cir.float // CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) -// LLVM: define float @_Z8fpPreIncv() +// LLVM: define{{.*}} float @_Z8fpPreIncv() // LLVM: %[[RV:.*]] = alloca float, i64 1, align 4 // LLVM: %[[A:.*]] = alloca float, i64 1, align 4 // LLVM: store float 1.000000e+00, ptr %[[A]], align 4 @@ -297,14 +297,14 @@ float fpPreDec() { return --a; } -// CHECK: cir.func @_Z8fpPreDecv() -> !cir.float +// CHECK: cir.func{{.*}} @_Z8fpPreDecv() -> !cir.float // CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) -// LLVM: define float @_Z8fpPreDecv() +// LLVM: 
define{{.*}} float @_Z8fpPreDecv() // LLVM: %[[RV:.*]] = alloca float, i64 1, align 4 // LLVM: %[[A:.*]] = alloca float, i64 1, align 4 // LLVM: store float 1.000000e+00, ptr %[[A]], align 4 @@ -322,14 +322,14 @@ float fpPostInc() { return a++; } -// CHECK: cir.func @_Z9fpPostIncv() -> !cir.float +// CHECK: cir.func{{.*}} @_Z9fpPostIncv() -> !cir.float // CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) -// LLVM: define float @_Z9fpPostIncv() +// LLVM: define{{.*}} float @_Z9fpPostIncv() // LLVM: %[[RV:.*]] = alloca float, i64 1, align 4 // LLVM: %[[A:.*]] = alloca float, i64 1, align 4 // LLVM: store float 1.000000e+00, ptr %[[A]], align 4 @@ -347,14 +347,14 @@ float fpPostDec() { return a--; } -// CHECK: cir.func @_Z9fpPostDecv() -> !cir.float +// CHECK: cir.func{{.*}} @_Z9fpPostDecv() -> !cir.float // CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float // CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]] // CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) -// LLVM: define float @_Z9fpPostDecv() +// LLVM: define{{.*}} float @_Z9fpPostDecv() // LLVM: %[[RV:.*]] = alloca float, i64 1, align 4 // LLVM: %[[A:.*]] = alloca float, i64 1, align 4 // LLVM: store float 1.000000e+00, ptr %[[A]], align 4 @@ -374,7 +374,7 @@ float fpPostInc2() { return b; } -// CHECK: cir.func @_Z10fpPostInc2v() -> !cir.float +// CHECK: cir.func{{.*}} @_Z10fpPostInc2v() -> !cir.float // CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CHECK: %[[B:.*]] = cir.alloca !cir.float, !cir.ptr, ["b", init] // CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float @@ -385,7 +385,7 @@ float 
fpPostInc2() { // CHECK: cir.store{{.*}} %[[ATOB]], %[[B]] // CHECK: %[[B_TO_OUTPUT:.*]] = cir.load{{.*}} %[[B]] -// LLVM: define float @_Z10fpPostInc2v() +// LLVM: define{{.*}} float @_Z10fpPostInc2v() // LLVM: %[[RV:.*]] = alloca float, i64 1, align 4 // LLVM: %[[A:.*]] = alloca float, i64 1, align 4 // LLVM: %[[B:.*]] = alloca float, i64 1, align 4 @@ -407,7 +407,7 @@ float fpPostInc2() { // OGCG: %[[B_TO_OUTPUT:.*]] = load float, ptr %[[B]], align 4 void chars(char c) { -// CHECK: cir.func @_Z5charsc +// CHECK: cir.func{{.*}} @_Z5charsc int c1 = +c; // CHECK: %[[PROMO:.*]] = cir.cast(integral, %{{.+}} : !s8i), !s32i @@ -429,13 +429,13 @@ _Float16 fp16UPlus(_Float16 f) { return +f; } -// CHECK: cir.func @_Z9fp16UPlusDF16_({{.*}}) -> !cir.f16 +// CHECK: cir.func{{.*}} @_Z9fp16UPlusDF16_({{.*}}) -> !cir.f16 // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[F:.*]] // CHECK: %[[PROMOTED:.*]] = cir.cast(floating, %[[INPUT]] : !cir.f16), !cir.float // CHECK: %[[RESULT:.*]] = cir.unary(plus, %[[PROMOTED]]) // CHECK: %[[UNPROMOTED:.*]] = cir.cast(floating, %[[RESULT]] : !cir.float), !cir.f16 -// LLVM: define half @_Z9fp16UPlusDF16_({{.*}}) +// LLVM: define{{.*}} half @_Z9fp16UPlusDF16_({{.*}}) // LLVM: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2 // LLVM: %[[PROMOTED:.*]] = fpext half %[[F_LOAD]] to float // LLVM: %[[UNPROMOTED:.*]] = fptrunc float %[[PROMOTED]] to half @@ -449,13 +449,13 @@ _Float16 fp16UMinus(_Float16 f) { return -f; } -// CHECK: cir.func @_Z10fp16UMinusDF16_({{.*}}) -> !cir.f16 +// CHECK: cir.func{{.*}} @_Z10fp16UMinusDF16_({{.*}}) -> !cir.f16 // CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[F:.*]] // CHECK: %[[PROMOTED:.*]] = cir.cast(floating, %[[INPUT]] : !cir.f16), !cir.float // CHECK: %[[RESULT:.*]] = cir.unary(minus, %[[PROMOTED]]) // CHECK: %[[UNPROMOTED:.*]] = cir.cast(floating, %[[RESULT]] : !cir.float), !cir.f16 -// LLVM: define half @_Z10fp16UMinusDF16_({{.*}}) +// LLVM: define{{.*}} half @_Z10fp16UMinusDF16_({{.*}}) // LLVM: %[[F_LOAD:.*]] = 
load half, ptr %{{.*}}, align 2 // LLVM: %[[PROMOTED:.*]] = fpext half %[[F_LOAD]] to float // LLVM: %[[RESULT:.*]] = fneg float %[[PROMOTED]] @@ -480,7 +480,7 @@ void test_logical_not() { b = !d; } -// CHECK: cir.func @_Z16test_logical_notv() +// CHECK: cir.func{{.*}} @_Z16test_logical_notv() // CHECK: %[[A:.*]] = cir.load{{.*}} %[[A_ADDR:.*]] : !cir.ptr, !s32i // CHECK: %[[A_BOOL:.*]] = cir.cast(int_to_bool, %[[A]] : !s32i), !cir.bool // CHECK: %[[A_NOT:.*]] = cir.unary(not, %[[A_BOOL]]) : !cir.bool, !cir.bool @@ -503,7 +503,7 @@ void test_logical_not() { // CHECK: %[[D_NOT:.*]] = cir.unary(not, %[[D_BOOL]]) : !cir.bool, !cir.bool // CHECK: cir.store{{.*}} %[[D_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr -// LLVM: define void @_Z16test_logical_notv() +// LLVM: define{{.*}} void @_Z16test_logical_notv() // LLVM: %[[A:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 // LLVM: %[[A_BOOL:.*]] = icmp ne i32 %[[A]], 0 // LLVM: %[[A_NOT:.*]] = xor i1 %[[A_BOOL]], true diff --git a/clang/test/CIR/CodeGen/union.c b/clang/test/CIR/CodeGen/union.c index d998b5add41a3..23e862b24517d 100644 --- a/clang/test/CIR/CodeGen/union.c +++ b/clang/test/CIR/CodeGen/union.c @@ -54,11 +54,11 @@ void f1(void) { union IncompleteU *p; } -// CIR: cir.func @f1() +// CIR: cir.func{{.*}} @f1() // CIR-NEXT: cir.alloca !cir.ptr, !cir.ptr>, ["p"] // CIR-NEXT: cir.return -// LLVM: define void @f1() +// LLVM: define{{.*}} void @f1() // LLVM-NEXT: %[[P:.*]] = alloca ptr, i64 1, align 8 // LLVM-NEXT: ret void @@ -73,7 +73,7 @@ int f2(void) { return u.n; } -// CIR: cir.func @f2() -> !s32i +// CIR: cir.func{{.*}} @f2() -> !s32i // CIR-NEXT: %[[RETVAL_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CIR-NEXT: %[[U:.*]] = cir.alloca !rec_U1, !cir.ptr, ["u"] {alignment = 4 : i64} // CIR-NEXT: %[[I:.*]] = cir.const #cir.int<42> : !s32i @@ -85,7 +85,7 @@ int f2(void) { // CIR-NEXT: %[[RET:.*]] = cir.load{{.*}} %[[RETVAL_ADDR]] : !cir.ptr, !s32i // CIR-NEXT: cir.return %[[RET]] : !s32i 
-// LLVM: define i32 @f2() +// LLVM: define{{.*}} i32 @f2() // LLVM-NEXT: %[[RETVAL:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[U:.*]] = alloca %union.U1, i64 1, align 4 // LLVM-NEXT: store i32 42, ptr %[[U]], align 4 @@ -94,7 +94,7 @@ int f2(void) { // LLVM-NEXT: %[[RET:.*]] = load i32, ptr %[[RETVAL]], align 4 // LLVM-NEXT: ret i32 %[[RET]] -// OGCG: define dso_local i32 @f2() +// OGCG: define{{.*}} i32 @f2() // OGCG-NEXT: entry: // OGCG-NEXT: %[[U:.*]] = alloca %union.U1, align 4 // OGCG-NEXT: store i32 42, ptr %[[U]], align 4 @@ -112,7 +112,7 @@ void shouldGenerateUnionAccess(union U2 u) { u.d; } -// CIR: cir.func @shouldGenerateUnionAccess(%[[ARG:.*]]: !rec_U2 +// CIR: cir.func{{.*}} @shouldGenerateUnionAccess(%[[ARG:.*]]: !rec_U2 // CIR-NEXT: %[[U:.*]] = cir.alloca !rec_U2, !cir.ptr, ["u", init] {alignment = 8 : i64} // CIR-NEXT: cir.store{{.*}} %[[ARG]], %[[U]] : !rec_U2, !cir.ptr // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i @@ -138,7 +138,7 @@ void shouldGenerateUnionAccess(union U2 u) { // CIR-NEXT: %[[D_VAL:.*]] = cir.load{{.*}} %[[D_PTR2]] : !cir.ptr, !cir.double // CIR-NEXT: cir.return -// LLVM: define void @shouldGenerateUnionAccess(%union.U2 %[[ARG:.*]]) +// LLVM: define{{.*}} void @shouldGenerateUnionAccess(%union.U2 %[[ARG:.*]]) // LLVM-NEXT: %[[U:.*]] = alloca %union.U2, i64 1, align 8 // LLVM-NEXT: store %union.U2 %[[ARG]], ptr %[[U]], align 8 // LLVM-NEXT: store i8 0, ptr %[[U]], align 8 @@ -151,7 +151,7 @@ void shouldGenerateUnionAccess(union U2 u) { // LLVM-NEXT: %[[D_VAL:.*]] = load double, ptr %[[U]], align 8 // LLVM-NEXT: ret void -// OGCG: define dso_local void @shouldGenerateUnionAccess(i64 %[[ARG:.*]]) +// OGCG: define{{.*}} void @shouldGenerateUnionAccess(i64 %[[ARG:.*]]) // OGCG-NEXT: entry: // OGCG-NEXT: %[[U:.*]] = alloca %union.U2, align 8 // OGCG-NEXT: %[[COERCE_DIVE:.*]] = getelementptr inbounds nuw %union.U2, ptr %[[U]], i32 0, i32 0 @@ -170,7 +170,7 @@ void f3(union U3 u) { u.c[2] = 0; } -// CIR: cir.func 
@f3(%[[ARG:.*]]: !rec_U3 +// CIR: cir.func{{.*}} @f3(%[[ARG:.*]]: !rec_U3 // CIR-NEXT: %[[U:.*]] = cir.alloca !rec_U3, !cir.ptr, ["u", init] {alignment = 1 : i64} // CIR-NEXT: cir.store{{.*}} %[[ARG]], %[[U]] : !rec_U3, !cir.ptr // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i @@ -182,7 +182,7 @@ void f3(union U3 u) { // CIR-NEXT: cir.store{{.*}} %[[ZERO_CHAR]], %[[ELEM_PTR]] : !s8i, !cir.ptr // CIR-NEXT: cir.return -// LLVM: define void @f3(%union.U3 %[[ARG:.*]]) +// LLVM: define{{.*}} void @f3(%union.U3 %[[ARG:.*]]) // LLVM-NEXT: %[[U:.*]] = alloca %union.U3, i64 1, align 1 // LLVM-NEXT: store %union.U3 %[[ARG]], ptr %[[U]], align 1 // LLVM-NEXT: %[[C_PTR:.*]] = getelementptr i8, ptr %[[U]], i32 0 @@ -190,7 +190,7 @@ void f3(union U3 u) { // LLVM-NEXT: store i8 0, ptr %[[ELEM_PTR]], align 1 // LLVM-NEXT: ret void -// OGCG: define dso_local void @f3(i40 %[[ARG:.*]]) +// OGCG: define{{.*}} void @f3(i40 %[[ARG:.*]]) // OGCG-NEXT: entry: // OGCG-NEXT: %[[U:.*]] = alloca %union.U3, align 1 // OGCG-NEXT: store i40 %[[ARG]], ptr %[[U]], align 1 @@ -202,7 +202,7 @@ void f5(union U4 u) { u.c[4] = 65; } -// CIR: cir.func @f5(%[[ARG:.*]]: !rec_U4 +// CIR: cir.func{{.*}} @f5(%[[ARG:.*]]: !rec_U4 // CIR-NEXT: %[[U:.*]] = cir.alloca !rec_U4, !cir.ptr, ["u", init] {alignment = 4 : i64} // CIR-NEXT: cir.store{{.*}} %[[ARG]], %[[U]] : !rec_U4, !cir.ptr // CIR-NEXT: %[[CHAR_VAL:.*]] = cir.const #cir.int<65> : !s32i @@ -214,7 +214,7 @@ void f5(union U4 u) { // CIR-NEXT: cir.store{{.*}} %[[CHAR_CAST]], %[[ELEM_PTR]] : !s8i, !cir.ptr // CIR-NEXT: cir.return -// LLVM: define void @f5(%union.U4 %[[ARG:.*]]) +// LLVM: define{{.*}} void @f5(%union.U4 %[[ARG:.*]]) // LLVM-NEXT: %[[U:.*]] = alloca %union.U4, i64 1, align 4 // LLVM-NEXT: store %union.U4 %[[ARG]], ptr %[[U]], align 4 // LLVM-NEXT: %[[C_PTR:.*]] = getelementptr i8, ptr %[[U]], i32 0 @@ -222,7 +222,7 @@ void f5(union U4 u) { // LLVM-NEXT: store i8 65, ptr %[[ELEM_PTR]], align 4 // LLVM-NEXT: ret void -// OGCG: define 
dso_local void @f5(i64 %[[ARG:.*]]) +// OGCG: define{{.*}} void @f5(i64 %[[ARG:.*]]) // OGCG-NEXT: entry: // OGCG-NEXT: %[[U:.*]] = alloca %union.U4, align 4 // OGCG-NEXT: store i64 %[[ARG]], ptr %[[U]], align 4 diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp index fe4919ec0478d..2ee42187a6e94 100644 --- a/clang/test/CIR/CodeGen/vector-ext.cpp +++ b/clang/test/CIR/CodeGen/vector-ext.cpp @@ -1161,3 +1161,18 @@ void foo20() { // OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 // OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 // OGCG: %[[SHUF:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[TMP_B]], <4 x i32> + +void foo21() { + vi4 a; + unsigned long size = __builtin_vectorelements(a); +} + +// CIR: %[[INIT:.*]] = cir.alloca !u64i, !cir.ptr, ["size", init] +// CIR: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CIR: cir.store align(8) %[[SIZE]], %[[INIT]] : !u64i, !cir.ptr + +// LLVM: %[[SIZE:.*]] = alloca i64, i64 1, align 8 +// LLVM: store i64 4, ptr %[[SIZE]], align 8 + +// OGCG: %[[SIZE:.*]] = alloca i64, align 8 +// OGCG: store i64 4, ptr %[[SIZE]], align 8 diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp index d0c5b83cd5b04..18fa90bd2cb3f 100644 --- a/clang/test/CIR/CodeGen/vector.cpp +++ b/clang/test/CIR/CodeGen/vector.cpp @@ -1203,3 +1203,18 @@ void foo23() { // OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 // OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 // OGCG: %[[SHUF:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[TMP_B]], <4 x i32> + +void foo24() { + vi4 a; + unsigned long size = __builtin_vectorelements(a); +} + +// CIR: %[[INIT:.*]] = cir.alloca !u64i, !cir.ptr, ["size", init] +// CIR: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CIR: cir.store align(8) %[[SIZE]], %[[INIT]] : !u64i, !cir.ptr + +// LLVM: %[[SIZE:.*]] = alloca i64, i64 1, align 8 +// LLVM: store i64 4, ptr %[[SIZE]], align 8 
+ +// OGCG: %[[SIZE:.*]] = alloca i64, align 8 +// OGCG: store i64 4, ptr %[[SIZE]], align 8 diff --git a/clang/test/CIR/CodeGenOpenACC/combined-copy.c b/clang/test/CIR/CodeGenOpenACC/combined-copy.c index 72471d4ec7874..1c94fa8238ce8 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-copy.c +++ b/clang/test/CIR/CodeGenOpenACC/combined-copy.c @@ -2,7 +2,7 @@ int global; void acc_compute(int parmVar) { - // CHECK: cir.func @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[PARM:.*]] = cir.alloca !s32i, !cir.ptr, ["parmVar", init] int localVar1; short localVar2; @@ -773,7 +773,7 @@ typedef struct StructTy { } Struct ; void acc_compute_members() { - // CHECK: cir.func @acc_compute_members() + // CHECK: cir.func{{.*}} @acc_compute_members() Struct localStruct; // CHECK-NEXT: %[[LOCALSTRUCT:.*]] = cir.alloca !rec_StructTy, !cir.ptr, ["localStruct"] @@ -1082,7 +1082,7 @@ typedef struct OuterTy { } Outer; void copy_member_of_array_element_member() { - // CHECK: cir.func @copy_member_of_array_element_member() { + // CHECK: cir.func{{.*}} @copy_member_of_array_element_member() { Outer outer; // CHECK-NEXT: %[[OUTER:.*]] = cir.alloca !rec_OuterTy, !cir.ptr, ["outer"] diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp index 5b83a9cb91898..b8140335f7c29 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s extern "C" void acc_combined(int N, int cond) { - // CHECK: cir.func @acc_combined(%[[ARG_N:.*]]: !s32i loc{{.*}}, %[[ARG_COND:.*]]: !s32i loc{{.*}}) { + // CHECK: cir.func{{.*}} @acc_combined(%[[ARG_N:.*]]: !s32i loc{{.*}}, %[[ARG_COND:.*]]: !s32i loc{{.*}}) { // CHECK-NEXT: %[[ALLOCA_N:.*]] = cir.alloca !s32i, !cir.ptr, ["N", init] // CHECK-NEXT: %[[COND:.*]] = cir.alloca 
!s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG_N]], %[[ALLOCA_N]] : !s32i, !cir.ptr @@ -1012,7 +1012,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK-NEXT: } loc } extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) { - // CHECK: cir.func @acc_combined_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { + // CHECK: cir.func{{.*}} @acc_combined_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { // CHECK-NEXT: %[[ARG1:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg1", init] // CHECK-NEXT: %[[ARG2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg2", init] // CHECK-NEXT: cir.store %[[ARG1_PARAM]], %[[ARG1]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/compute-copy.c b/clang/test/CIR/CodeGenOpenACC/compute-copy.c index 888bad29caa7c..0fb150475bc72 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-copy.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-copy.c @@ -2,7 +2,7 @@ int global; void acc_compute(int parmVar) { - // CHECK: cir.func @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[PARM:.*]] = cir.alloca !s32i, !cir.ptr, ["parmVar", init] int localVar1; short localVar2; @@ -646,7 +646,7 @@ typedef struct StructTy { } Struct ; void acc_compute_members() { - // CHECK: cir.func @acc_compute_members() + // CHECK: cir.func{{.*}} @acc_compute_members() Struct localStruct; // CHECK-NEXT: %[[LOCALSTRUCT:.*]] = cir.alloca !rec_StructTy, !cir.ptr, ["localStruct"] diff --git a/clang/test/CIR/CodeGenOpenACC/data.c b/clang/test/CIR/CodeGenOpenACC/data.c index 948119f66e93d..77a373f9c049f 100644 --- a/clang/test/CIR/CodeGenOpenACC/data.c +++ b/clang/test/CIR/CodeGenOpenACC/data.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s void acc_data(int cond) { - // CHECK: cir.func @acc_data(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} 
@acc_data(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/host_data.c b/clang/test/CIR/CodeGenOpenACC/host_data.c index 4c3f7dd092a2f..aeaf3d2f047b5 100644 --- a/clang/test/CIR/CodeGenOpenACC/host_data.c +++ b/clang/test/CIR/CodeGenOpenACC/host_data.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s void acc_host_data(int cond, int var1, int var2) { - // CHECK: cir.func @acc_host_data(%[[ARG_COND:.*]]: !s32i {{.*}}, %[[ARG_V1:.*]]: !s32i {{.*}}, %[[ARG_V2:.*]]: !s32i {{.*}}) { + // CHECK: cir.func{{.*}} @acc_host_data(%[[ARG_COND:.*]]: !s32i {{.*}}, %[[ARG_V1:.*]]: !s32i {{.*}}, %[[ARG_V2:.*]]: !s32i {{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: %[[V1:.*]] = cir.alloca !s32i, !cir.ptr, ["var1", init] // CHECK-NEXT: %[[V2:.*]] = cir.alloca !s32i, !cir.ptr, ["var2", init] diff --git a/clang/test/CIR/CodeGenOpenACC/init.c b/clang/test/CIR/CodeGenOpenACC/init.c index 54f686dbe8ebc..177e5a6ea2117 100644 --- a/clang/test/CIR/CodeGenOpenACC/init.c +++ b/clang/test/CIR/CodeGenOpenACC/init.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s void acc_init(int cond) { - // CHECK: cir.func @acc_init(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_init(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc init diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c index d0c6f1134c8d2..9b10b7489e814 100644 --- a/clang/test/CIR/CodeGenOpenACC/kernels.c +++ b/clang/test/CIR/CodeGenOpenACC/kernels.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict 
-emit-cir -fclangir %s -o - | FileCheck %s void acc_kernels(int cond) { - // CHECK: cir.func @acc_kernels(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_kernels(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc kernels @@ -418,7 +418,7 @@ void acc_kernels(int cond) { } void acc_kernels_data_clauses(int *arg1, int *arg2) { - // CHECK: cir.func @acc_kernels_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { + // CHECK: cir.func{{.*}} @acc_kernels_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { // CHECK-NEXT: %[[ARG1:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg1", init] // CHECK-NEXT: %[[ARG2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg2", init] // CHECK-NEXT: cir.store %[[ARG1_PARAM]], %[[ARG1]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp index c0bf11e353951..d8707ba78fb5b 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s extern "C" void acc_loop(int *A, int *B, int *C, int N) { - // CHECK: cir.func @acc_loop(%[[ARG_A:.*]]: !cir.ptr loc{{.*}}, %[[ARG_B:.*]]: !cir.ptr loc{{.*}}, %[[ARG_C:.*]]: !cir.ptr loc{{.*}}, %[[ARG_N:.*]]: !s32i loc{{.*}}) { + // CHECK: cir.func{{.*}} @acc_loop(%[[ARG_A:.*]]: !cir.ptr loc{{.*}}, %[[ARG_B:.*]]: !cir.ptr loc{{.*}}, %[[ARG_C:.*]]: !cir.ptr loc{{.*}}, %[[ARG_N:.*]]: !s32i loc{{.*}}) { // CHECK-NEXT: %[[ALLOCA_A:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["A", init] // CHECK-NEXT: %[[ALLOCA_B:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["B", init] // CHECK-NEXT: %[[ALLOCA_C:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["C", init] diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c 
b/clang/test/CIR/CodeGenOpenACC/parallel.c index 0127613233eca..5db174fb6549b 100644 --- a/clang/test/CIR/CodeGenOpenACC/parallel.c +++ b/clang/test/CIR/CodeGenOpenACC/parallel.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s void acc_parallel(int cond) { - // CHECK: cir.func @acc_parallel(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_parallel(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc parallel @@ -445,7 +445,7 @@ void acc_parallel(int cond) { } void acc_parallel_data_clauses(int *arg1, int *arg2) { - // CHECK: cir.func @acc_parallel_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { + // CHECK: cir.func{{.*}} @acc_parallel_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { // CHECK-NEXT: %[[ARG1:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg1", init] // CHECK-NEXT: %[[ARG2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg2", init] // CHECK-NEXT: cir.store %[[ARG1_PARAM]], %[[ARG1]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c index 1c9695b34833f..9e3359141838f 100644 --- a/clang/test/CIR/CodeGenOpenACC/serial.c +++ b/clang/test/CIR/CodeGenOpenACC/serial.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s void acc_serial(int cond) { - // CHECK: cir.func @acc_serial(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_serial(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc serial @@ -268,7 +268,7 @@ void acc_serial(int cond) { } void acc_serial_data_clauses(int *arg1, int *arg2) { - // CHECK: cir.func 
@acc_serial_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { + // CHECK: cir.func{{.*}} @acc_serial_data_clauses(%[[ARG1_PARAM:.*]]: !cir.ptr{{.*}}, %[[ARG2_PARAM:.*]]: !cir.ptr{{.*}}) { // CHECK-NEXT: %[[ARG1:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg1", init] // CHECK-NEXT: %[[ARG2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arg2", init] // CHECK-NEXT: cir.store %[[ARG1_PARAM]], %[[ARG1]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/set.c b/clang/test/CIR/CodeGenOpenACC/set.c index ced581680c037..0b87f42603776 100644 --- a/clang/test/CIR/CodeGenOpenACC/set.c +++ b/clang/test/CIR/CodeGenOpenACC/set.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s void acc_set(int cond) { - // CHECK: cir.func @acc_set(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_set(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/shutdown.c b/clang/test/CIR/CodeGenOpenACC/shutdown.c index e8ab6bd75d0e9..52db382df217e 100644 --- a/clang/test/CIR/CodeGenOpenACC/shutdown.c +++ b/clang/test/CIR/CodeGenOpenACC/shutdown.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s void acc_shutdown(int cond) { - // CHECK: cir.func @acc_shutdown(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_shutdown(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc shutdown diff --git a/clang/test/CIR/CodeGenOpenACC/wait.c b/clang/test/CIR/CodeGenOpenACC/wait.c index ec2ab6e9446cc..aeda8b955a6d0 100644 --- a/clang/test/CIR/CodeGenOpenACC/wait.c +++ b/clang/test/CIR/CodeGenOpenACC/wait.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s void acc_wait(int 
cond) { - // CHECK: cir.func @acc_wait(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_wait(%[[ARG:.*]]: !s32i{{.*}}) { // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr diff --git a/clang/test/CIR/IR/array.cir b/clang/test/CIR/IR/array.cir index 4930fc493c5a7..bba536062d740 100644 --- a/clang/test/CIR/IR/array.cir +++ b/clang/test/CIR/IR/array.cir @@ -33,7 +33,7 @@ cir.func @func() { cir.return } -// CHECK: cir.func @func() { +// CHECK: cir.func{{.*}} @func() { // CHECK: %0 = cir.alloca !cir.array, !cir.ptr>, ["l"] {alignment = 4 : i64} // CHECK: cir.return // CHECK: } @@ -44,7 +44,7 @@ cir.func @func2(%arg0: !cir.ptr) { cir.return } -// CHECK: cir.func @func2(%arg0: !cir.ptr) { +// CHECK: cir.func{{.*}} @func2(%arg0: !cir.ptr) { // CHECK: %0 = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} // CHECK: cir.store %arg0, %0 : !cir.ptr, !cir.ptr> // CHECK: cir.return @@ -56,7 +56,7 @@ cir.func @func3(%arg0: !cir.ptr>) { cir.return } -// CHECK: cir.func @func3(%arg0: !cir.ptr>) { +// CHECK: cir.func{{.*}} @func3(%arg0: !cir.ptr>) { // CHECK: %0 = cir.alloca !cir.ptr>, !cir.ptr>>, ["pp", init] {alignment = 8 : i64} // CHECK: cir.store %arg0, %0 : !cir.ptr>, !cir.ptr>> // CHECK: cir.return diff --git a/clang/test/CIR/IR/binassign.cir b/clang/test/CIR/IR/binassign.cir index 24ed95d3c29c7..a25729635094e 100644 --- a/clang/test/CIR/IR/binassign.cir +++ b/clang/test/CIR/IR/binassign.cir @@ -26,7 +26,7 @@ module { // CHECK: !s8i = !cir.int // CHECK: #true = #cir.bool : !cir.bool // CHECK: module { -// CHECK: cir.func @binary_assign() { +// CHECK: cir.func{{.*}} @binary_assign() { // CHECK: %0 = cir.alloca !cir.bool, !cir.ptr, ["b"] {alignment = 1 : i64} // CHECK: %1 = cir.alloca !s8i, !cir.ptr, ["c"] {alignment = 1 : i64} // CHECK: %2 = cir.alloca !cir.float, !cir.ptr, ["f"] {alignment = 4 : i64} diff --git a/clang/test/CIR/IR/call.cir 
b/clang/test/CIR/IR/call.cir index 5f0916775479e..9607df7202e0f 100644 --- a/clang/test/CIR/IR/call.cir +++ b/clang/test/CIR/IR/call.cir @@ -4,7 +4,7 @@ module { -cir.func @f1() +cir.func private @f1() cir.func @f2() { cir.call @f1() : () -> () @@ -13,26 +13,26 @@ cir.func @f2() { cir.return } -// CHECK: cir.func @f2() { +// CHECK: cir.func{{.*}} @f2() { // CHECK-NEXT: cir.call @f1() : () -> () // CHECK-NEXT: cir.call @f1() side_effect(pure) : () -> () // CHECK-NEXT: cir.call @f1() side_effect(const) : () -> () // CHECK-NEXT: cir.return // CHECK-NEXT: } -cir.func @f3() -> !s32i +cir.func private @f3() -> !s32i cir.func @f4() -> !s32i { %0 = cir.call @f3() : () -> !s32i cir.return %0 : !s32i } -// CHECK: cir.func @f4() -> !s32i { +// CHECK: cir.func{{.*}} @f4() -> !s32i { // CHECK-NEXT: %[[#x:]] = cir.call @f3() : () -> !s32i // CHECK-NEXT: cir.return %[[#x]] : !s32i // CHECK-NEXT: } -cir.func @f5(!s32i, !s32i) -> !s32i +cir.func private @f5(!s32i, !s32i) -> !s32i cir.func @f6() -> !s32i { %0 = cir.const #cir.int<1> : !s32i %1 = cir.const #cir.int<2> : !s32i @@ -40,7 +40,7 @@ cir.func @f6() -> !s32i { cir.return %2 : !s32i } -// CHECK: cir.func @f6() -> !s32i { +// CHECK: cir.func{{.*}} @f6() -> !s32i { // CHECK-NEXT: %[[#a:]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: %[[#b:]] = cir.const #cir.int<2> : !s32i // CHECK-NEXT: %[[#c:]] = cir.call @f5(%[[#a]], %[[#b]]) : (!s32i, !s32i) -> !s32i @@ -54,7 +54,7 @@ cir.func @f7(%arg0: !cir.ptr !s32i>>) -> !s32i { cir.return %2 : !s32i } -// CHECK: cir.func @f7(%[[ptr:.+]]: !cir.ptr !s32i>>) -> !s32i { +// CHECK: cir.func{{.*}} @f7(%[[ptr:.+]]: !cir.ptr !s32i>>) -> !s32i { // CHECK-NEXT: %[[#a:]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: %[[#b:]] = cir.const #cir.int<2> : !s32i // CHECK-NEXT: %[[#ret:]] = cir.call %[[ptr]](%[[#a]], %[[#b]]) : (!cir.ptr !s32i>>, !s32i, !s32i) -> !s32i diff --git a/clang/test/CIR/IR/cast.cir b/clang/test/CIR/IR/cast.cir index 4881db7fc271f..a335887de7ec7 100644 --- 
a/clang/test/CIR/IR/cast.cir +++ b/clang/test/CIR/IR/cast.cir @@ -15,9 +15,9 @@ module { } } -// CHECK: cir.func @yolo(%arg0: !s32i) +// CHECK: cir.func{{.*}} @yolo(%arg0: !s32i) // CHECK: %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool // CHECK: %1 = cir.const #cir.int<0> : !s32i -// CHECK: cir.func @bitcast +// CHECK: cir.func{{.*}} @bitcast // CHECK: %0 = cir.cast(bitcast, %arg0 : !cir.ptr), !cir.ptr diff --git a/clang/test/CIR/IR/cmp.cir b/clang/test/CIR/IR/cmp.cir index a049dc51f1401..818527189af01 100644 --- a/clang/test/CIR/IR/cmp.cir +++ b/clang/test/CIR/IR/cmp.cir @@ -36,7 +36,7 @@ module { cir.return } - // CHECK: cir.func @c0(%arg0: !s32i, %arg1: !s32i) { + // CHECK: cir.func{{.*}} @c0(%arg0: !s32i, %arg1: !s32i) { // CHECK-NEXT: %0 = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} // CHECK-NEXT: %1 = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} @@ -102,7 +102,7 @@ module { cir.return } - // CHECK: cir.func @c0_unsigned(%arg0: !u32i, %arg1: !u32i) { + // CHECK: cir.func{{.*}} @c0_unsigned(%arg0: !u32i, %arg1: !u32i) { // CHECK-NEXT: %0 = cir.alloca !u32i, !cir.ptr, ["a", init] {alignment = 4 : i64} // CHECK-NEXT: %1 = cir.alloca !u32i, !cir.ptr, ["b", init] {alignment = 4 : i64} // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} @@ -168,7 +168,7 @@ module { cir.return } - // CHECK: cir.func @c0_float(%arg0: !cir.float, %arg1: !cir.float) { + // CHECK: cir.func{{.*}} @c0_float(%arg0: !cir.float, %arg1: !cir.float) { // CHECK-NEXT: %0 = cir.alloca !cir.float, !cir.ptr, ["a", init] {alignment = 4 : i64} // CHECK-NEXT: %1 = cir.alloca !cir.float, !cir.ptr, ["b", init] {alignment = 4 : i64} // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} @@ -234,7 +234,7 @@ module { cir.return } - // CHECK: cir.func @pointer_cmp(%arg0: !cir.ptr, %arg1: !cir.ptr) { + // CHECK: 
cir.func{{.*}} @pointer_cmp(%arg0: !cir.ptr, %arg1: !cir.ptr) { // CHECK-NEXT: %0 = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] {alignment = 8 : i64} // CHECK-NEXT: %1 = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] {alignment = 8 : i64} // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} @@ -312,7 +312,7 @@ module { cir.return } - // CHECK: cir.func @bool_cmp(%arg0: !cir.bool, %arg1: !cir.bool) { + // CHECK: cir.func{{.*}} @bool_cmp(%arg0: !cir.bool, %arg1: !cir.bool) { // CHECK-NEXT: %0 = cir.alloca !cir.bool, !cir.ptr, ["a", init] {alignment = 1 : i64} // CHECK-NEXT: %1 = cir.alloca !cir.bool, !cir.ptr, ["b", init] {alignment = 1 : i64} // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} diff --git a/clang/test/CIR/IR/func.cir b/clang/test/CIR/IR/func.cir index 9ed44611e195d..865ab7ecc39a2 100644 --- a/clang/test/CIR/IR/func.cir +++ b/clang/test/CIR/IR/func.cir @@ -10,7 +10,7 @@ module { cir.func @empty() { cir.return } -// CHECK: cir.func @empty() { +// CHECK: cir.func{{.*}} @empty() { // CHECK: cir.return // CHECK: } @@ -18,7 +18,7 @@ cir.func @empty() { cir.func @voidret() { cir.return } -// CHECK: cir.func @voidret() { +// CHECK: cir.func{{.*}} @voidret() { // CHECK: cir.return // CHECK: } @@ -27,7 +27,7 @@ cir.func @intfunc() -> !s32i { %0 = cir.const #cir.int<42> : !s32i cir.return %0 : !s32i } -// CHECK: cir.func @intfunc() -> !s32i { +// CHECK: cir.func{{.*}} @intfunc() -> !s32i { // CHECK: %[[VAL:.*]] = cir.const #cir.int<42> : !s32i // CHECK: cir.return %[[VAL]] : !s32i // CHECK: } @@ -48,7 +48,7 @@ cir.func @scopes() -> !s32i { } cir.trap } -// CHECK: cir.func @scopes() -> !s32i { +// CHECK: cir.func{{.*}} @scopes() -> !s32i { // CHECK: cir.scope { // CHECK: cir.scope { // CHECK: %[[VAL:.*]] = cir.const #cir.int<99> : !s32i @@ -63,7 +63,7 @@ cir.func @longfunc() -> !s64i { %0 = cir.const #cir.int<42> : !s64i cir.return %0 : !s64i } -// CHECK: cir.func @longfunc() -> !s64i +// CHECK: 
cir.func{{.*}} @longfunc() -> !s64i // CHECK: %0 = cir.const #cir.int<42> : !s64i // CHECK: cir.return %0 : !s64i // CHECK: } @@ -73,7 +73,7 @@ cir.func @unsignedfunc() -> !u32i { %0 = cir.const #cir.int<42> : !u32i cir.return %0 : !u32i } -// CHECK: cir.func @unsignedfunc() -> !u32i +// CHECK: cir.func{{.*}} @unsignedfunc() -> !u32i // CHECK: %[[VAL:.*]] = cir.const #cir.int<42> : !u32i // CHECK: cir.return %[[VAL]] : !u32i // CHECK: } @@ -83,7 +83,7 @@ cir.func @ullfunc() -> !u64i { %0 = cir.const #cir.int<42> : !u64i cir.return %0 : !u64i } -// CHECK: cir.func @ullfunc() -> !u64i +// CHECK: cir.func{{.*}} @ullfunc() -> !u64i // CHECK: %[[VAL:.*]] = cir.const #cir.int<42> : !u64i // CHECK: cir.return %[[VAL:.*]] : !u64i // CHECK: } diff --git a/clang/test/CIR/IR/invalid-call.cir b/clang/test/CIR/IR/invalid-call.cir index 3ebb771ed72e7..a9c7e38f73af6 100644 --- a/clang/test/CIR/IR/invalid-call.cir +++ b/clang/test/CIR/IR/invalid-call.cir @@ -12,7 +12,7 @@ cir.func @f1() { !u32i = !cir.int -cir.func @f2() +cir.func private @f2() cir.func @f3() { // expected-error @below {{callee returns void but call has results}} %0 = cir.call @f2() : () -> !u32i @@ -23,7 +23,7 @@ cir.func @f3() { !u32i = !cir.int -cir.func @f4() -> !u32i +cir.func private @f4() -> !u32i cir.func @f5() { // expected-error @below {{incorrect number of results for callee}} cir.call @f4() : () -> () @@ -35,7 +35,7 @@ cir.func @f5() { !s32i = !cir.int !u32i = !cir.int -cir.func @f6() -> !u32i +cir.func private @f6() -> !u32i cir.func @f7() { // expected-error @below {{result type mismatch}} %0 = cir.call @f6() : () -> !s32i @@ -47,7 +47,7 @@ cir.func @f7() { !s32i = !cir.int !u32i = !cir.int -cir.func @f8(!s32i, !s32i) +cir.func private @f8(!s32i, !s32i) cir.func @f9() { %0 = cir.const #cir.int<1> : !s32i // expected-error @below {{incorrect number of operands for callee}} @@ -60,7 +60,7 @@ cir.func @f9() { !s32i = !cir.int !u32i = !cir.int -cir.func @f10(!s32i, !s32i) +cir.func private @f10(!s32i, 
!s32i) cir.func @f11() { %0 = cir.const #cir.int<1> : !s32i %1 = cir.const #cir.int<2> : !u32i @@ -73,7 +73,7 @@ cir.func @f11() { !s32i = !cir.int -cir.func @f12(!s32i, !s32i, ...) +cir.func private @f12(!s32i, !s32i, ...) cir.func @f13() { %0 = cir.const #cir.int<1> : !s32i // expected-error @below {{too few operands for callee}} diff --git a/clang/test/CIR/IR/invalid-complex.cir b/clang/test/CIR/IR/invalid-complex.cir index 8c6d890579321..2414809f7dbca 100644 --- a/clang/test/CIR/IR/invalid-complex.cir +++ b/clang/test/CIR/IR/invalid-complex.cir @@ -21,3 +21,27 @@ module { cir.global external @ci2 = #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s64i> : !cir.complex } + +// ----- + +module { + cir.func @complex_real_invalid_result_type() -> !cir.double { + %0 = cir.alloca !cir.complex, !cir.ptr>, ["c"] + %2 = cir.load align(8) %0 : !cir.ptr>, !cir.complex + // expected-error @below {{result type does not match operand type}} + %3 = cir.complex.real %2 : !cir.complex -> !cir.float + cir.return + } +} + +// ----- + +module { + cir.func @complex_imag_invalid_result_type() -> !cir.double { + %0 = cir.alloca !cir.complex, !cir.ptr>, ["c"] + %2 = cir.load align(8) %0 : !cir.ptr>, !cir.complex + // expected-error @below {{result type does not match operand type}} + %3 = cir.complex.imag %2 : !cir.complex -> !cir.float + cir.return + } +} diff --git a/clang/test/CIR/IR/ternary.cir b/clang/test/CIR/IR/ternary.cir index 3827dc77726df..e419c7f5af40c 100644 --- a/clang/test/CIR/IR/ternary.cir +++ b/clang/test/CIR/IR/ternary.cir @@ -16,7 +16,7 @@ module { // CHECK: module { -// CHECK: cir.func @blue(%arg0: !cir.bool) -> !u32i { +// CHECK: cir.func{{.*}} @blue(%arg0: !cir.bool) -> !u32i { // CHECK: %0 = cir.ternary(%arg0, true { // CHECK: %1 = cir.const #cir.int<0> : !u32i // CHECK: cir.yield %1 : !u32i diff --git a/clang/test/CIR/IR/unary.cir b/clang/test/CIR/IR/unary.cir index f01121adc106e..ba3bc20d574f5 100644 --- a/clang/test/CIR/IR/unary.cir +++ 
b/clang/test/CIR/IR/unary.cir @@ -16,7 +16,7 @@ module { %6 = cir.unary(dec, %1) : !u32i, !u32i cir.return } -// CHECK: cir.func @test_unary_unsigned() { +// CHECK: cir.func{{.*}} @test_unary_unsigned() { // CHECK: %0 = cir.alloca !u32i, !cir.ptr, ["a"] {alignment = 4 : i64} // CHECK: %1 = cir.load %0 : !cir.ptr, !u32i // CHECK: %2 = cir.unary(plus, %1) : !u32i, !u32i @@ -37,7 +37,7 @@ module { %6 = cir.unary(dec, %1) nsw : !s32i, !s32i cir.return } -// CHECK: cir.func @test_unary_signed() { +// CHECK: cir.func{{.*}} @test_unary_signed() { // CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["a"] {alignment = 4 : i64} // CHECK: %1 = cir.load %0 : !cir.ptr, !s32i // CHECK: %2 = cir.unary(plus, %1) : !s32i, !s32i diff --git a/clang/test/CIR/IR/vector.cir b/clang/test/CIR/IR/vector.cir index f23f5de9692de..6d8e5beffd63f 100644 --- a/clang/test/CIR/IR/vector.cir +++ b/clang/test/CIR/IR/vector.cir @@ -26,7 +26,7 @@ cir.func @vec_int_test() { cir.return } -// CHECK: cir.func @vec_int_test() { +// CHECK: cir.func{{.*}} @vec_int_test() { // CHECK: %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a"] // CHECK: %1 = cir.alloca !cir.vector<3 x !s32i>, !cir.ptr>, ["b"] // CHECK: %2 = cir.alloca !cir.vector<2 x !s32i>, !cir.ptr>, ["c"] @@ -38,7 +38,7 @@ cir.func @vec_double_test() { cir.return } -// CHECK: cir.func @vec_double_test() { +// CHECK: cir.func{{.*}} @vec_double_test() { // CHECK: %0 = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["a"] // CHECK: cir.return // CHECK: } @@ -54,7 +54,7 @@ cir.func @local_vector_create_test() { cir.return } -// CHECK: cir.func @local_vector_create_test() { +// CHECK: cir.func{{.*}} @local_vector_create_test() { // CHECK: %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] // CHECK: %1 = cir.const #cir.int<1> : !s32i // CHECK: %2 = cir.const #cir.int<2> : !s32i @@ -81,7 +81,7 @@ cir.func @vector_extract_element_test() { cir.return } -// CHECK: cir.func @vector_extract_element_test() { +// CHECK: cir.func{{.*}} 
@vector_extract_element_test() { // CHECK: %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["d", init] // CHECK: %1 = cir.alloca !s32i, !cir.ptr, ["e", init] // CHECK: %2 = cir.const #cir.int<1> : !s32i @@ -116,7 +116,7 @@ cir.func @vector_insert_element_test() { cir.return } -// CHECK: cir.func @vector_insert_element_test() { +// CHECK: cir.func{{.*}} @vector_insert_element_test() { // CHECK: %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] // CHECK: %1 = cir.const #cir.int<1> : !s32i // CHECK: %2 = cir.const #cir.int<2> : !s32i @@ -150,7 +150,7 @@ cir.func @vector_compare_test() { cir.return } -// CHECK: cir.func @vector_compare_test() { +// CHECK: cir.func{{.*}} @vector_compare_test() { // CHECK: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a"] // CHECK: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["b"] // CHECK: %[[INIT:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["o", init] @@ -176,7 +176,7 @@ cir.func @vector_shuffle_dynamic_test() { cir.return } -// CHECK: cir.func @vector_shuffle_dynamic_test() { +// CHECK: cir.func{{.*}} @vector_shuffle_dynamic_test() { // CHECK: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a"] // CHECK: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["b"] // CHECK: %[[RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["r", init] @@ -204,7 +204,7 @@ cir.func @vector_splat_test() { cir.return } -// CHECK: cir.func @vector_splat_test() { +// CHECK: cir.func{{.*}} @vector_splat_test() { // CHECK-NEXT: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] // CHECK-NEXT: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] // CHECK-NEXT: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i diff --git a/clang/test/CIR/Lowering/array.cpp b/clang/test/CIR/Lowering/array.cpp index 9bd3464dc3145..438d41e2c2c2f 100644 --- a/clang/test/CIR/Lowering/array.cpp +++ b/clang/test/CIR/Lowering/array.cpp @@ -41,7 +41,7 
@@ void func() { int e = arr[0]; int e2 = arr[1]; } -// CHECK: define void @_Z4funcv() +// CHECK: define{{.*}} void @_Z4funcv() // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = alloca [10 x i32], i64 1, align 16 // CHECK-NEXT: %[[INIT:.*]] = alloca i32, i64 1, align 4 // CHECK-NEXT: %[[INIT_2:.*]] = alloca i32, i64 1, align 4 @@ -57,7 +57,7 @@ void func() { void func2() { int arr[2] = {5}; } -// CHECK: define void @_Z5func2v() +// CHECK: define{{.*}} void @_Z5func2v() // CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4 // CHECK: %[[TMP:.*]] = alloca ptr, i64 1, align 8 // CHECK: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0 @@ -72,7 +72,7 @@ void func2() { void func3() { int arr3[2] = {5, 6}; } -// CHECK: define void @_Z5func3v() +// CHECK: define{{.*}} void @_Z5func3v() // CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4 // CHECK: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0 // CHECK: store i32 5, ptr %[[ARR_PTR]], align 4 @@ -83,7 +83,7 @@ void func4() { int arr[2][1] = {{5}, {6}}; int e = arr[1][0]; } -// CHECK: define void @_Z5func4v() +// CHECK: define{{.*}} void @_Z5func4v() // CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4 // CHECK: %[[INIT:.*]] = alloca i32, i64 1, align 4 // CHECK: %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0 @@ -102,7 +102,7 @@ void func4() { void func5() { int arr[2][1] = {{5}}; } -// CHECK: define void @_Z5func5v() +// CHECK: define{{.*}} void @_Z5func5v() // CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4 // CHECK: %[[TMP:.*]] = alloca ptr, i64 1, align 8 // CHECK: %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0 @@ -119,7 +119,7 @@ void func6() { int x = 4; int arr[2] = { x, 5 }; } -// CHECK: define void @_Z5func6v() +// CHECK: define{{.*}} void @_Z5func6v() // CHECK: %[[VAR:.*]] = alloca i32, i64 1, align 4 // CHECK: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4 // CHECK: store i32 4, ptr %[[VAR]], 
align 4 @@ -132,7 +132,7 @@ void func6() { void func7() { int* arr[1] = {}; } -// CHECK: define void @_Z5func7v() +// CHECK: define{{.*}} void @_Z5func7v() // CHECK: %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8 // CHECK: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 // CHECK: %[[ELE_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0 @@ -143,9 +143,9 @@ void func7() { // CHECK: store ptr %[[ELE]], ptr %[[ALLOCA]], align 8 void func8(int p[10]) {} -// CHECK: define void @_Z5func8Pi(ptr {{%.*}}) +// CHECK: define{{.*}} void @_Z5func8Pi(ptr {{%.*}}) // CHECK-NEXT: alloca ptr, i64 1, align 8 void func9(int pp[10][5]) {} -// CHECK: define void @_Z5func9PA5_i(ptr {{%.*}}) +// CHECK: define{{.*}} void @_Z5func9PA5_i(ptr {{%.*}}) // CHECK-NEXT: alloca ptr, i64 1, align 8 diff --git a/clang/test/CIR/Transforms/canonicalize.cir b/clang/test/CIR/Transforms/canonicalize.cir index 164d231db7bb4..7ba163eb30bb1 100644 --- a/clang/test/CIR/Transforms/canonicalize.cir +++ b/clang/test/CIR/Transforms/canonicalize.cir @@ -15,7 +15,7 @@ module { ^bb2: // pred: ^bb1 cir.return } - // CHECK: cir.func @redundant_br() { + // CHECK: cir.func{{.*}} @redundant_br() { // CHECK-NEXT: %[[A:.*]] = cir.alloca !u32i, !cir.ptr, ["a", init] {alignment = 4 : i64} // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4> : !u32i // CHECK-NEXT: cir.store %[[FOUR]], %[[A]] : !u32i, !cir.ptr @@ -27,7 +27,7 @@ module { } cir.return } - // CHECK: cir.func @empty_scope() { + // CHECK: cir.func{{.*}} @empty_scope() { // CHECK-NEXT: cir.return // CHECK-NEXT: } @@ -36,7 +36,7 @@ module { %1 = cir.unary(not, %0) : !cir.bool, !cir.bool cir.return %1 : !cir.bool } - // CHECK: cir.func @unary_not(%arg0: !cir.bool) -> !cir.bool + // CHECK: cir.func{{.*}} @unary_not(%arg0: !cir.bool) -> !cir.bool // CHECK-NEXT: cir.return %arg0 : !cir.bool cir.func @cast1(%arg0: !cir.bool) -> !cir.bool { @@ -44,7 +44,7 @@ module { %1 = cir.cast(int_to_bool, %0 : !s32i), !cir.bool cir.return %1 : !cir.bool } - // CHECK: cir.func 
@cast1(%[[ARG0:.*]]: !cir.bool) -> !cir.bool + // CHECK: cir.func{{.*}} @cast1(%[[ARG0:.*]]: !cir.bool) -> !cir.bool // CHECK-NEXT: cir.return %[[ARG0]] : !cir.bool cir.func @cast2(%arg0: !s32i) -> !cir.bool { @@ -54,7 +54,7 @@ module { %3 = cir.cast(int_to_bool, %2 : !s64i), !cir.bool cir.return %3 : !cir.bool } - // CHECK: cir.func @cast2(%[[ARG0:.*]]: !s32i) -> !cir.bool + // CHECK: cir.func{{.*}} @cast2(%[[ARG0:.*]]: !s32i) -> !cir.bool // CHECK-NEXT: %[[CAST:.*]] = cir.cast(int_to_bool, %[[ARG0]] : !s32i), !cir.bool // CHECK-NEXT: cir.return %[[CAST]] : !cir.bool @@ -64,7 +64,7 @@ module { %2 = cir.cast(integral, %1 : !s32i), !s64i cir.return %2 : !s64i } - // CHECK: cir.func @no_fold_cast(%[[ARG0:.*]]: !s32i) -> !s64i + // CHECK: cir.func{{.*}} @no_fold_cast(%[[ARG0:.*]]: !s32i) -> !s64i // CHECK-NEXT: %[[CAST:.*]] = cir.cast(int_to_bool, %[[ARG0]] : !s32i), !cir.bool // CHECK-NEXT: %[[CAST2:.*]] = cir.cast(bool_to_int, %[[CAST]] : !cir.bool), !s32i // CHECK-NEXT: %[[CAST3:.*]] = cir.cast(integral, %[[CAST2]] : !s32i), !s64i diff --git a/clang/test/CIR/Transforms/complex-create-fold.cir b/clang/test/CIR/Transforms/complex-create-fold.cir index 5d9d22112c8b7..370acaecc2222 100644 --- a/clang/test/CIR/Transforms/complex-create-fold.cir +++ b/clang/test/CIR/Transforms/complex-create-fold.cir @@ -16,7 +16,7 @@ module { cir.return %6 : !cir.complex } -// CHECK: cir.func @fold_complex_create_test() -> !cir.complex { +// CHECK: cir.func{{.*}} @fold_complex_create_test() -> !cir.complex { // CHECK: %[[RET:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["__retval"] // CHECK: %[[INIT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init] // CHECK: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex diff --git a/clang/test/CIR/Transforms/complex-imag-fold.cir b/clang/test/CIR/Transforms/complex-imag-fold.cir new file mode 100644 index 0000000000000..0d9a4e43142a3 --- /dev/null +++ 
b/clang/test/CIR/Transforms/complex-imag-fold.cir @@ -0,0 +1,23 @@ +// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s + +!s32i = !cir.int + +module { + cir.func @fold_complex_imag_test() -> !s32i { + %0 = cir.alloca !s32i, !cir.ptr, ["__retval"] + %2 = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex + %4 = cir.complex.imag %2 : !cir.complex -> !s32i + cir.store %4, %0 : !s32i, !cir.ptr + %5 = cir.load %0 : !cir.ptr, !s32i + cir.return %5 : !s32i + } + + // CHECK: cir.func @fold_complex_imag_test() -> !s32i { + // CHECK: %[[RET:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] + // CHECK: %[[IMAG:.*]] = cir.const #cir.int<2> : !s32i + // CHECK: cir.store %[[IMAG]], %[[RET]] : !s32i, !cir.ptr + // CHECK: %[[TMP:.*]] = cir.load %[[RET]] : !cir.ptr, !s32i + // CHECK: cir.return %[[TMP]] : !s32i + // CHECK: } + +} diff --git a/clang/test/CIR/Transforms/complex-real-fold.cir b/clang/test/CIR/Transforms/complex-real-fold.cir new file mode 100644 index 0000000000000..1cab9be616af0 --- /dev/null +++ b/clang/test/CIR/Transforms/complex-real-fold.cir @@ -0,0 +1,23 @@ +// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s + +!s32i = !cir.int + +module { + cir.func @fold_complex_real_test() -> !s32i { + %0 = cir.alloca !s32i, !cir.ptr, ["__retval"] + %2 = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex + %4 = cir.complex.real %2 : !cir.complex -> !s32i + cir.store %4, %0 : !s32i, !cir.ptr + %5 = cir.load %0 : !cir.ptr, !s32i + cir.return %5 : !s32i + } + + // CHECK: cir.func @fold_complex_real_test() -> !s32i { + // CHECK: %[[RET:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] + // CHECK: %[[REAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK: cir.store %[[REAL]], %[[RET]] : !s32i, !cir.ptr + // CHECK: %[[TMP:.*]] = cir.load %[[RET]] : !cir.ptr, !s32i + // CHECK: cir.return %[[TMP]] : !s32i + // CHECK: } + +} diff --git a/clang/test/CIR/Transforms/hoist-allocas.cir 
b/clang/test/CIR/Transforms/hoist-allocas.cir index df7b9f48be9dc..04724f3073e57 100644 --- a/clang/test/CIR/Transforms/hoist-allocas.cir +++ b/clang/test/CIR/Transforms/hoist-allocas.cir @@ -20,7 +20,7 @@ module { } cir.return } - // CHECK: cir.func @l1 + // CHECK: cir.func{{.*}} @l1 // CHECK-NEXT: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CHECK-NEXT: cir.scope { // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i @@ -55,7 +55,7 @@ module { } cir.return } - // CHECK: cir.func @l2 + // CHECK: cir.func{{.*}} @l2 // CHECK-NEXT: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CHECK-NEXT: cir.scope { // CHECK-NEXT: cir.for : cond { @@ -92,7 +92,7 @@ module { } cir.return } - // CHECK: cir.func @l3 + // CHECK: cir.func{{.*}} @l3 // CHECK-NEXT: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] {alignment = 4 : i64} // CHECK-NEXT: cir.scope { // CHECK-NEXT: cir.for : cond { diff --git a/clang/test/CIR/Transforms/if.cir b/clang/test/CIR/Transforms/if.cir index 03848bf8d0633..3f817c793643f 100644 --- a/clang/test/CIR/Transforms/if.cir +++ b/clang/test/CIR/Transforms/if.cir @@ -14,7 +14,7 @@ module { } cir.return %arg0 : !s32i } -// CHECK: cir.func @foo(%arg0: !s32i) -> !s32i { +// CHECK: cir.func{{.*}} @foo(%arg0: !s32i) -> !s32i { // CHECK-NEXT: %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool // CHECK-NEXT: cir.brcond %0 ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // pred: ^bb0 @@ -35,7 +35,7 @@ module { } cir.return %arg0 : !s32i } -// CHECK: cir.func @onlyIf(%arg0: !s32i) -> !s32i { +// CHECK: cir.func{{.*}} @onlyIf(%arg0: !s32i) -> !s32i { // CHECK-NEXT: %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool // CHECK-NEXT: cir.brcond %0 ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // pred: ^bb0 diff --git a/clang/test/CIR/Transforms/loop.cir b/clang/test/CIR/Transforms/loop.cir index d02412d049158..9c76092bb8e76 100644 --- a/clang/test/CIR/Transforms/loop.cir +++ b/clang/test/CIR/Transforms/loop.cir @@ -16,7 +16,7 @@ 
module { } } -// CHECK: cir.func @testFor(%arg0: !cir.bool) { +// CHECK: cir.func{{.*}} @testFor(%arg0: !cir.bool) { // CHECK: cir.br ^bb[[#COND:]] // CHECK: ^bb[[#COND]]: // CHECK: cir.brcond %arg0 ^bb[[#BODY:]], ^bb[[#EXIT:]] @@ -38,7 +38,7 @@ module { cir.return } -// CHECK: cir.func @testWhile(%arg0: !cir.bool) { +// CHECK: cir.func{{.*}} @testWhile(%arg0: !cir.bool) { // CHECK: cir.br ^bb[[#COND:]] // CHECK: ^bb[[#COND]]: // CHECK: cir.brcond %arg0 ^bb[[#BODY:]], ^bb[[#EXIT:]] @@ -59,7 +59,7 @@ module { cir.return } -// CHECK: cir.func @testDoWhile(%arg0: !cir.bool) { +// CHECK: cir.func{{.*}} @testDoWhile(%arg0: !cir.bool) { // CHECK: cir.br ^bb[[#BODY:]] // CHECK: ^bb[[#COND]]: // CHECK: cir.brcond %arg0 ^bb[[#BODY:]], ^bb[[#EXIT:]] diff --git a/clang/test/CIR/Transforms/scope.cir b/clang/test/CIR/Transforms/scope.cir index 2d14784c33f87..757428cd019ac 100644 --- a/clang/test/CIR/Transforms/scope.cir +++ b/clang/test/CIR/Transforms/scope.cir @@ -11,7 +11,7 @@ module { } cir.return } -// CHECK: cir.func @foo() { +// CHECK: cir.func{{.*}} @foo() { // CHECK: cir.br ^bb1 // CHECK: ^bb1: // pred: ^bb0 // CHECK: %0 = cir.alloca !u32i, !cir.ptr, ["a", init] {alignment = 4 : i64} @@ -28,7 +28,7 @@ module { } cir.return } -// CHECK: cir.func @empty_scope() { +// CHECK: cir.func{{.*}} @empty_scope() { // CHECK: cir.return // CHECK: } @@ -44,7 +44,7 @@ module { cir.return %1 : !u32i } -// CHECK: cir.func @scope_with_return() -> !u32i { +// CHECK: cir.func{{.*}} @scope_with_return() -> !u32i { // CHECK: %0 = cir.alloca !u32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: cir.br ^bb1 // CHECK: ^bb1: // pred: ^bb0 diff --git a/clang/test/CIR/Transforms/select.cir b/clang/test/CIR/Transforms/select.cir index 29a5d1ed1ddeb..0ad5c43178831 100644 --- a/clang/test/CIR/Transforms/select.cir +++ b/clang/test/CIR/Transforms/select.cir @@ -10,7 +10,7 @@ module { cir.return %1 : !s32i } - // CHECK: cir.func @fold_true(%[[ARG0:.+]]: !s32i, %[[ARG1:.+]]: !s32i) -> !s32i { + 
// CHECK: cir.func{{.*}} @fold_true(%[[ARG0:.+]]: !s32i, %[[ARG1:.+]]: !s32i) -> !s32i { // CHECK-NEXT: cir.return %[[ARG0]] : !s32i // CHECK-NEXT: } @@ -20,7 +20,7 @@ module { cir.return %1 : !s32i } - // CHECK: cir.func @fold_false(%[[ARG0:.+]]: !s32i, %[[ARG1:.+]]: !s32i) -> !s32i { + // CHECK: cir.func{{.*}} @fold_false(%[[ARG0:.+]]: !s32i, %[[ARG1:.+]]: !s32i) -> !s32i { // CHECK-NEXT: cir.return %[[ARG1]] : !s32i // CHECK-NEXT: } @@ -30,7 +30,7 @@ module { cir.return %1 : !s32i } - // CHECK: cir.func @fold_to_const(%{{.+}}: !cir.bool) -> !s32i { + // CHECK: cir.func{{.*}} @fold_to_const(%{{.+}}: !cir.bool) -> !s32i { // CHECK-NEXT: %[[#A:]] = cir.const #cir.int<42> : !s32i // CHECK-NEXT: cir.return %[[#A]] : !s32i // CHECK-NEXT: } @@ -42,7 +42,7 @@ module { cir.return %2 : !cir.bool } - // CHECK: cir.func @simplify_1(%[[ARG0:.+]]: !cir.bool) -> !cir.bool { + // CHECK: cir.func{{.*}} @simplify_1(%[[ARG0:.+]]: !cir.bool) -> !cir.bool { // CHECK-NEXT: cir.return %[[ARG0]] : !cir.bool // CHECK-NEXT: } @@ -53,7 +53,7 @@ module { cir.return %2 : !cir.bool } - // CHECK: cir.func @simplify_2(%[[ARG0:.+]]: !cir.bool) -> !cir.bool { + // CHECK: cir.func{{.*}} @simplify_2(%[[ARG0:.+]]: !cir.bool) -> !cir.bool { // CHECK-NEXT: %[[#A:]] = cir.unary(not, %[[ARG0]]) : !cir.bool, !cir.bool // CHECK-NEXT: cir.return %[[#A]] : !cir.bool // CHECK-NEXT: } diff --git a/clang/test/CIR/Transforms/switch.cir b/clang/test/CIR/Transforms/switch.cir index 00b462a6075c9..a000d6b70fbcc 100644 --- a/clang/test/CIR/Transforms/switch.cir +++ b/clang/test/CIR/Transforms/switch.cir @@ -17,7 +17,7 @@ module { } cir.return } -// CHECK: cir.func @shouldFlatSwitchWithDefault(%arg0: !s8i) { +// CHECK: cir.func{{.*}} @shouldFlatSwitchWithDefault(%arg0: !s8i) { // CHECK: cir.switch.flat %arg0 : !s8i, ^bb[[#DEFAULT:]] [ // CHECK: 1: ^bb[[#CASE1:]] // CHECK: ] @@ -38,7 +38,7 @@ module { } cir.return } -// CHECK: cir.func @shouldFlatSwitchWithoutDefault(%arg0: !s32i) { +// CHECK: cir.func{{.*}} 
@shouldFlatSwitchWithoutDefault(%arg0: !s32i) { // CHECK: cir.switch.flat %arg0 : !s32i, ^bb[[#EXIT:]] [ // CHECK: 1: ^bb[[#CASE1:]] // CHECK: ] @@ -58,7 +58,7 @@ module { } cir.return } -// CHECK: cir.func @shouldFlatSwitchWithImplicitFallthrough(%arg0: !s64i) { +// CHECK: cir.func{{.*}} @shouldFlatSwitchWithImplicitFallthrough(%arg0: !s64i) { // CHECK: cir.switch.flat %arg0 : !s64i, ^bb[[#EXIT:]] [ // CHECK: 1: ^bb[[#CASE1N2:]], // CHECK: 2: ^bb[[#CASE1N2]] @@ -83,7 +83,7 @@ module { } cir.return } -// CHECK: cir.func @shouldFlatSwitchWithExplicitFallthrough(%arg0: !s64i) { +// CHECK: cir.func{{.*}} @shouldFlatSwitchWithExplicitFallthrough(%arg0: !s64i) { // CHECK: cir.switch.flat %arg0 : !s64i, ^bb[[#EXIT:]] [ // CHECK: 1: ^bb[[#CASE1:]], // CHECK: 2: ^bb[[#CASE2:]] @@ -105,7 +105,7 @@ module { } cir.return } -// CHECK: cir.func @shouldFlatSwitchWithFallthroughToExit(%arg0: !s64i) { +// CHECK: cir.func{{.*}} @shouldFlatSwitchWithFallthroughToExit(%arg0: !s64i) { // CHECK: cir.switch.flat %arg0 : !s64i, ^bb[[#EXIT:]] [ // CHECK: 1: ^bb[[#CASE1:]] // CHECK: ] @@ -122,7 +122,7 @@ module { // CHECK-NOT: llvm.switch cir.return } -// CHECK: cir.func @shouldDropEmptySwitch(%arg0: !s64i) +// CHECK: cir.func{{.*}} @shouldDropEmptySwitch(%arg0: !s64i) // CHECK-NOT: cir.switch.flat @@ -143,7 +143,7 @@ module { cir.return } -// CHECK: cir.func @shouldFlatMultiBlockCase(%arg0: !s32i) { +// CHECK: cir.func{{.*}} @shouldFlatMultiBlockCase(%arg0: !s32i) { // CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} // CHECK: cir.store %arg0, %0 : !s32i, !cir.ptr // CHECK: cir.br ^bb1 @@ -189,7 +189,7 @@ module { %4 = cir.load %2 : !cir.ptr, !s32i cir.return %4 : !s32i } -// CHECK: cir.func @shouldFlatNestedBreak(%arg0: !s32i, %arg1: !s32i) -> !s32i { +// CHECK: cir.func{{.*}} @shouldFlatNestedBreak(%arg0: !s32i, %arg1: !s32i) -> !s32i { // CHECK: cir.switch.flat %[[COND:.*]] : !s32i, ^bb[[#DEFAULT_BB:]] [ // CHECK: 0: ^bb[[#BB1:]] // CHECK: ] @@ -243,7 +243,7 @@ 
module { %5 = cir.load %1 : !cir.ptr, !s32i cir.return %5 : !s32i } -// CHECK: cir.func @flatCaseRange(%arg0: !s32i) -> !s32i { +// CHECK: cir.func{{.*}} @flatCaseRange(%arg0: !s32i) -> !s32i { // CHECK: cir.switch.flat %[[X:[0-9]+]] : !s32i, ^[[JUDGE_RANGE:bb[0-9]+]] [ // CHECK-NEXT: -100: ^[[CASE_EQUAL:bb[0-9]+]] // CHECK-NEXT: ] @@ -293,7 +293,7 @@ module { cir.return } -// CHECK: cir.func @_Z8bigRangei(%arg0: !s32i) { +// CHECK: cir.func{{.*}} @_Z8bigRangei(%arg0: !s32i) { // CHECK: cir.switch.flat %[[COND:.*]] : !s32i, ^bb[[#RANGE_BR:]] [ // CHECK: ] // CHECK: ^bb[[#NO_PRED_BB:]]: // no predecessors diff --git a/clang/test/CIR/Transforms/ternary-fold.cir b/clang/test/CIR/Transforms/ternary-fold.cir index 1192a0ce29424..718906f5c6ee5 100644 --- a/clang/test/CIR/Transforms/ternary-fold.cir +++ b/clang/test/CIR/Transforms/ternary-fold.cir @@ -14,7 +14,7 @@ module { cir.return %1 : !s32i } - // CHECK: cir.func @fold_ternary(%{{.+}}: !s32i, %[[ARG:.+]]: !s32i) -> !s32i { + // CHECK: cir.func{{.*}} @fold_ternary(%{{.+}}: !s32i, %[[ARG:.+]]: !s32i) -> !s32i { // CHECK-NEXT: cir.return %[[ARG]] : !s32i // CHECK-NEXT: } @@ -28,7 +28,7 @@ module { cir.return %0 : !s32i } - // CHECK: cir.func @simplify_ternary(%[[ARG0:.+]]: !cir.bool, %[[ARG1:.+]]: !s32i) -> !s32i { + // CHECK: cir.func{{.*}} @simplify_ternary(%[[ARG0:.+]]: !cir.bool, %[[ARG1:.+]]: !s32i) -> !s32i { // CHECK-NEXT: %[[#A:]] = cir.const #cir.int<42> : !s32i // CHECK-NEXT: %[[#B:]] = cir.select if %[[ARG0]] then %[[#A]] else %[[ARG1]] : (!cir.bool, !s32i, !s32i) -> !s32i // CHECK-NEXT: cir.return %[[#B]] : !s32i @@ -44,7 +44,7 @@ module { cir.return %0 : !s32i } - // CHECK: cir.func @simplify_ternary_false_const(%[[ARG0:.+]]: !cir.bool, %[[ARG1:.+]]: !s32i) -> !s32i { + // CHECK: cir.func{{.*}} @simplify_ternary_false_const(%[[ARG0:.+]]: !cir.bool, %[[ARG1:.+]]: !s32i) -> !s32i { // CHECK-NEXT: %[[#A:]] = cir.const #cir.int<24> : !s32i // CHECK-NEXT: %[[#B:]] = cir.select if %[[ARG0]] then %[[ARG1]] else 
%[[#A]] : (!cir.bool, !s32i, !s32i) -> !s32i // CHECK-NEXT: cir.return %[[#B]] : !s32i @@ -62,7 +62,7 @@ module { cir.return %1 : !s32i } - // CHECK: cir.func @non_simplifiable_ternary(%[[ARG0:.+]]: !cir.bool) -> !s32i { + // CHECK: cir.func{{.*}} @non_simplifiable_ternary(%[[ARG0:.+]]: !cir.bool) -> !s32i { // CHECK-NEXT: %[[#A:]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CHECK-NEXT: %[[#B:]] = cir.ternary(%[[ARG0]], true { // CHECK-NEXT: %[[#C:]] = cir.const #cir.int<42> : !s32i diff --git a/clang/test/CIR/Transforms/ternary.cir b/clang/test/CIR/Transforms/ternary.cir index 67ef7f95a6b52..fffafa9ff8e4c 100644 --- a/clang/test/CIR/Transforms/ternary.cir +++ b/clang/test/CIR/Transforms/ternary.cir @@ -22,7 +22,7 @@ module { cir.return %6 : !s32i } -// CHECK: cir.func @foo(%arg0: !s32i) -> !s32i { +// CHECK: cir.func{{.*}} @foo(%arg0: !s32i) -> !s32i { // CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["y", init] {alignment = 4 : i64} // CHECK: %1 = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: cir.store %arg0, %0 : !s32i, !cir.ptr @@ -53,7 +53,7 @@ module { cir.return } -// CHECK: cir.func @foo2(%arg0: !cir.bool) { +// CHECK: cir.func{{.*}} @foo2(%arg0: !cir.bool) { // CHECK: cir.brcond %arg0 ^bb1, ^bb2 // CHECK: ^bb1: // pred: ^bb0 // CHECK: cir.br ^bb3 diff --git a/clang/test/CIR/Transforms/vector-cmp-fold.cir b/clang/test/CIR/Transforms/vector-cmp-fold.cir index b207fc08748e2..f3486bd26fe1b 100644 --- a/clang/test/CIR/Transforms/vector-cmp-fold.cir +++ b/clang/test/CIR/Transforms/vector-cmp-fold.cir @@ -10,7 +10,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return 
%[[RES]] : !cir.vector<4 x !s32i> @@ -28,7 +28,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -46,7 +46,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -64,7 +64,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -82,7 +82,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -100,7 +100,7 @@ module { cir.return %new_vec : !cir.vector<4 
x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -120,7 +120,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -140,7 +140,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -160,7 +160,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -180,7 +180,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // 
CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -200,7 +200,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -220,7 +220,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> diff --git a/clang/test/CIR/Transforms/vector-create-fold.cir b/clang/test/CIR/Transforms/vector-create-fold.cir index fb8f66dc4debc..fb8c39e4dda40 100644 --- a/clang/test/CIR/Transforms/vector-create-fold.cir +++ b/clang/test/CIR/Transforms/vector-create-fold.cir @@ -12,7 +12,7 @@ module { cir.return %vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_create_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_create_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[VEC:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, // CHECK-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[VEC]] : 
!cir.vector<4 x !s32i> diff --git a/clang/test/CIR/Transforms/vector-shuffle-dynamic-fold.cir b/clang/test/CIR/Transforms/vector-shuffle-dynamic-fold.cir index 46ab04502afec..6584df3d7050b 100644 --- a/clang/test/CIR/Transforms/vector-shuffle-dynamic-fold.cir +++ b/clang/test/CIR/Transforms/vector-shuffle-dynamic-fold.cir @@ -11,7 +11,7 @@ module { } // Masking indices [8, 7, 6, 5] AND 3 = [0, 3, 2, 1] - // CHECK: cir.func @fold_shuffle_dynamic_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_shuffle_dynamic_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[NEW_VEC:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<4> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[NEW_VEC:.*]] : !cir.vector<4 x !s32i> @@ -23,7 +23,7 @@ module { } // Masking indices [3, 2, 1, 0] AND 3 = [3, 2, 1, 0] - // CHECK: cir.func @fold_shuffle_dynamic_vector_op_test_2() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_shuffle_dynamic_vector_op_test_2() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[NEW_VEC:.*]] = cir.const #cir.const_vector<[#cir.int<4> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[NEW_VEC:.*]] : !cir.vector<4 x !s32i> } diff --git a/clang/test/CIR/Transforms/vector-shuffle-fold.cir b/clang/test/CIR/Transforms/vector-shuffle-fold.cir index 87d409728989b..7aaddc051a75b 100644 --- a/clang/test/CIR/Transforms/vector-shuffle-fold.cir +++ b/clang/test/CIR/Transforms/vector-shuffle-fold.cir @@ -12,7 +12,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, // CHECK-SAME: #cir.int<4> : 
!s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> @@ -32,7 +32,7 @@ module { cir.return %new_vec : !cir.vector<6 x !s32i> } - // CHECK: cir.func @fold_shuffle_vector_op_test() -> !cir.vector<6 x !s32i> { + // CHECK: cir.func{{.*}} @fold_shuffle_vector_op_test() -> !cir.vector<6 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, // CHECK-SAME: #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i]> : !cir.vector<6 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<6 x !s32i> @@ -52,7 +52,7 @@ module { cir.return %new_vec : !cir.vector<4 x !s32i> } - // CHECK: cir.func @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> { // CHECK: cir.const #cir.const_vector<[#cir.undef : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, // CHECK-SAME: #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> diff --git a/clang/test/CIR/Transforms/vector-ternary-fold.cir b/clang/test/CIR/Transforms/vector-ternary-fold.cir index f2e18576da74b..3de614a39264f 100644 --- a/clang/test/CIR/Transforms/vector-ternary-fold.cir +++ b/clang/test/CIR/Transforms/vector-ternary-fold.cir @@ -12,7 +12,7 @@ module { } // [1, 0, 1, 0] ? 
[1, 2, 3, 4] : [5, 6, 7, 8] Will be fold to [1, 6, 3, 8] - // CHECK: cir.func @vector_ternary_fold_test() -> !cir.vector<4 x !s32i> { + // CHECK: cir.func{{.*}} @vector_ternary_fold_test() -> !cir.vector<4 x !s32i> { // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<3> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i> // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> } diff --git a/clang/test/CIR/func-linkage.cpp b/clang/test/CIR/func-linkage.cpp new file mode 100644 index 0000000000000..d43f7ed273063 --- /dev/null +++ b/clang/test/CIR/func-linkage.cpp @@ -0,0 +1,51 @@ +// Linkage types of global variables +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck %s -check-prefix=CIR --input-file %t.cir +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck %s -check-prefix=LLVM --input-file %t-cir.ll +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck %s -check-prefix=OGCG --input-file %t.ll + +void a() {} + +// CIR: cir.func dso_local @_Z1av() +// LLVM: define dso_local void @_Z1av() +// OGCG: define dso_local void @_Z1av() + +extern void b(); +// CIR: cir.func private @_Z1bv() +// LLVM: declare void @_Z1bv() +// OGCG: declare void @_Z1bv() + +static void c() {} +// CIR: cir.func internal private dso_local @_ZL1cv() +// LLVM: define internal void @_ZL1cv() +// OGCG: define internal void @_ZL1cv() + +inline void d() {} +// CIR: cir.func comdat linkonce_odr @_Z1dv() +// LLVM: define linkonce_odr void @_Z1dv() +// OGCG: define linkonce_odr void @_Z1dv(){{.*}} comdat + +namespace { + void e() {} +} + +// CIR: cir.func internal private dso_local @_ZN12_GLOBAL__N_11eEv() +// LLVM: define internal void @_ZN12_GLOBAL__N_11eEv() +// OGCG: define internal void @_ZN12_GLOBAL__N_11eEv() + +void f(); +// CIR: cir.func private 
@_Z1fv() +// LLVM: declare void @_Z1fv() +// OGCG: declare void @_Z1fv() + +// Force the functions to be emitted +void reference_funcs() { + a(); + b(); + c(); + d(); + e(); + f(); +} diff --git a/clang/test/CIR/func-simple.cpp b/clang/test/CIR/func-simple.cpp index 45cf1746de713..c9cb5c5595352 100644 --- a/clang/test/CIR/func-simple.cpp +++ b/clang/test/CIR/func-simple.cpp @@ -2,17 +2,17 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - | FileCheck %s void empty() { } -// CHECK: cir.func @_Z5emptyv() { +// CHECK: cir.func{{.*}} @_Z5emptyv() { // CHECK: cir.return // CHECK: } void voidret() { return; } -// CHECK: cir.func @_Z7voidretv() { +// CHECK: cir.func{{.*}} @_Z7voidretv() { // CHECK: cir.return // CHECK: } int intfunc() { return 42; } -// CHECK: cir.func @_Z7intfuncv() -> !s32i { +// CHECK: cir.func{{.*}} @_Z7intfuncv() -> !s32i { // CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: %1 = cir.const #cir.int<42> : !s32i // CHECK: cir.store %1, %0 : !s32i, !cir.ptr @@ -27,7 +27,7 @@ int scopes() { } } } -// CHECK: cir.func @_Z6scopesv() -> !s32i { +// CHECK: cir.func{{.*}} @_Z6scopesv() -> !s32i { // CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: cir.scope { // CHECK: cir.scope { @@ -41,7 +41,7 @@ int scopes() { // CHECK: } long longfunc() { return 42l; } -// CHECK: cir.func @_Z8longfuncv() -> !s64i +// CHECK: cir.func{{.*}} @_Z8longfuncv() -> !s64i // CHECK: %0 = cir.alloca !s64i, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CHECK: %1 = cir.const #cir.int<42> : !s64i // CHECK: cir.store %1, %0 : !s64i, !cir.ptr @@ -50,7 +50,7 @@ long longfunc() { return 42l; } // CHECK: } unsigned unsignedfunc() { return 42u; } -// CHECK: cir.func @_Z12unsignedfuncv() -> !u32i +// CHECK: cir.func{{.*}} @_Z12unsignedfuncv() -> !u32i // CHECK: %0 = cir.alloca !u32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: %1 = cir.const #cir.int<42> : !u32i // 
CHECK: cir.store %1, %0 : !u32i, !cir.ptr @@ -59,7 +59,7 @@ unsigned unsignedfunc() { return 42u; } // CHECK: } unsigned long long ullfunc() { return 42ull; } -// CHECK: cir.func @_Z7ullfuncv() -> !u64i +// CHECK: cir.func{{.*}} @_Z7ullfuncv() -> !u64i // CHECK: %0 = cir.alloca !u64i, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CHECK: %1 = cir.const #cir.int<42> : !u64i // CHECK: cir.store %1, %0 : !u64i, !cir.ptr @@ -68,7 +68,7 @@ unsigned long long ullfunc() { return 42ull; } // CHECK: } bool boolfunc() { return true; } -// CHECK: cir.func @_Z8boolfuncv() -> !cir.bool { +// CHECK: cir.func{{.*}} @_Z8boolfuncv() -> !cir.bool { // CHECK: %0 = cir.alloca !cir.bool, !cir.ptr, ["__retval"] {alignment = 1 : i64} // CHECK: %1 = cir.const #true // CHECK: cir.store %1, %0 : !cir.bool, !cir.ptr @@ -77,7 +77,7 @@ bool boolfunc() { return true; } // CHECK: } float floatfunc() { return 42.42f; } -// CHECK: cir.func @_Z9floatfuncv() -> !cir.float { +// CHECK: cir.func{{.*}} @_Z9floatfuncv() -> !cir.float { // CHECK: %0 = cir.alloca !cir.float, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: %1 = cir.const #cir.fp<4.242 // CHECK: cir.store %1, %0 : !cir.float, !cir.ptr @@ -86,7 +86,7 @@ float floatfunc() { return 42.42f; } // CHECK: } double doublefunc() { return 42.42; } -// CHECK: cir.func @_Z10doublefuncv() -> !cir.double { +// CHECK: cir.func{{.*}} @_Z10doublefuncv() -> !cir.double { // CHECK: %0 = cir.alloca !cir.double, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CHECK: %1 = cir.const #cir.fp<4.242 // CHECK: cir.store %1, %0 : !cir.double, !cir.ptr diff --git a/clang/test/CIR/mlprint.c b/clang/test/CIR/mlprint.c index 755a6cb47855e..1630bc1e3ce9b 100644 --- a/clang/test/CIR/mlprint.c +++ b/clang/test/CIR/mlprint.c @@ -7,7 +7,7 @@ int foo(void) { } // CIR: IR Dump After CIRCanonicalize (cir-canonicalize) -// CIR: cir.func @foo() -> !s32i +// CIR: cir.func{{.*}} @foo() -> !s32i // LLVM: IR Dump After cir::direct::ConvertCIRToLLVMPass (cir-flat-to-llvm) // 
LLVM: llvm.func @foo() -> i32 // LLVM: IR Dump After diff --git a/clang/test/CodeGen/PowerPC/builtins-bcd-transform.c b/clang/test/CodeGen/PowerPC/builtins-bcd-transform.c new file mode 100644 index 0000000000000..74a8500da6dab --- /dev/null +++ b/clang/test/CodeGen/PowerPC/builtins-bcd-transform.c @@ -0,0 +1,79 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// Testfile that verifies positive cases (0 or 1 only) for BCD builtins national2packed, packed2zoned and zoned2packed. +// REQUIRES: powerpc-registered-target +// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -O2 -target-cpu pwr9 \ +// RUN: -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -O2 -target-cpu pwr9 \ +// RUN: -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple powerpc-unknown-unknown -O2 -target-cpu pwr9 \ +// RUN: -emit-llvm %s -o - | FileCheck %s + +// CHECK-LABEL: define dso_local <16 x i8> @tBcd_National2packed_imm1( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.national2packed(<16 x i8> [[A]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +vector unsigned char tBcd_National2packed_imm1(vector unsigned char a) { + return __builtin_ppc_national2packed (a,'\1'); +} + +// CHECK-LABEL: define dso_local <16 x i8> @tBcd_National2packed_imm0( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.national2packed(<16 x i8> [[A]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +vector unsigned char tBcd_National2packed_imm0(vector unsigned char a) { + return __builtin_ppc_national2packed (a,'\0'); +} + +// CHECK-LABEL: define dso_local <16 x i8> @tBcd_Packed2national( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.packed2national(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +vector unsigned char tBcd_Packed2national(vector unsigned char a){ + return __builtin_ppc_packed2national(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @tBcd_Packed2zoned_imm0( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.packed2zoned(<16 x i8> [[A]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +vector unsigned char tBcd_Packed2zoned_imm0(vector unsigned char a){ + return __builtin_ppc_packed2zoned(a,'\0'); +} + +// CHECK-LABEL: define dso_local <16 x i8> @tBcd_Packed2zoned_imm1( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.packed2zoned(<16 x i8> [[A]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +vector unsigned char tBcd_Packed2zoned_imm1(vector unsigned char a){ + return __builtin_ppc_packed2zoned(a,'\1'); +} + +// CHECK-LABEL: define dso_local <16 x i8> @tBcd_Zoned2packed_imm0( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.zoned2packed(<16 x i8> [[A]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +vector unsigned char tBcd_Zoned2packed_imm0(vector unsigned char a){ + return __builtin_ppc_zoned2packed(a,'\0'); +} + +// CHECK-LABEL: define dso_local <16 x i8> @tBcd_Zoned2packed_imm1( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.ppc.zoned2packed(<16 x i8> [[A]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +vector unsigned char tBcd_Zoned2packed_imm1(vector unsigned char a){ + return 
__builtin_ppc_zoned2packed(a,'\1'); +} diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch-bad-file.c b/clang/test/CodeGen/X86/ms-secure-hotpatch-bad-file.c index 839dd44f7ff61..7c8c7d590060d 100644 --- a/clang/test/CodeGen/X86/ms-secure-hotpatch-bad-file.c +++ b/clang/test/CodeGen/X86/ms-secure-hotpatch-bad-file.c @@ -3,7 +3,7 @@ // This verifies that we correctly handle a -fms-secure-hotpatch-functions-file argument that points // to a missing file. // -// RUN: not %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-file=%S/this-file-is-intentionally-missing-do-not-create-it.txt /Fo%t.obj %s 2>&1 | FileCheck %s +// RUN: not %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-file=%S/this-file-is-intentionally-missing-do-not-create-it.txt /Fo%t.obj -- %s 2>&1 | FileCheck %s // CHECK: failed to open hotpatch functions file void this_might_have_side_effects(); diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch-cpp.cpp b/clang/test/CodeGen/X86/ms-secure-hotpatch-cpp.cpp index 3dc75c95d76f7..24e1c2937baac 100644 --- a/clang/test/CodeGen/X86/ms-secure-hotpatch-cpp.cpp +++ b/clang/test/CodeGen/X86/ms-secure-hotpatch-cpp.cpp @@ -3,7 +3,7 @@ // This verifies that hotpatch function attributes are correctly propagated when compiling directly to OBJ, // and that name mangling works as expected. 
// -// RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-list=?this_gets_hotpatched@@YAHXZ /Fo%t.obj %s +// RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-list=?this_gets_hotpatched@@YAHXZ /Fo%t.obj -- %s // RUN: llvm-readobj --codeview %t.obj | FileCheck %s void this_might_have_side_effects(); diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch-eh.cpp b/clang/test/CodeGen/X86/ms-secure-hotpatch-eh.cpp index 69704626c8cb6..66fbc3a950bbf 100644 --- a/clang/test/CodeGen/X86/ms-secure-hotpatch-eh.cpp +++ b/clang/test/CodeGen/X86/ms-secure-hotpatch-eh.cpp @@ -2,7 +2,7 @@ // Global constant data such as exception handler tables should not be redirected by Windows Secure Hot-Patching // -// RUN: %clang_cl -c --target=x86_64-windows-msvc /EHsc -O2 -fms-secure-hotpatch-functions-list=this_gets_hotpatched /Fo%t.obj /clang:-S /clang:-o- %s 2>& 1 | FileCheck %s +// RUN: %clang_cl -c --target=x86_64-windows-msvc /EHsc -O2 -fms-secure-hotpatch-functions-list=this_gets_hotpatched /Fo%t.obj /clang:-S /clang:-o- -- %s 2>& 1 | FileCheck %s class Foo { public: diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c b/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c index d76d2aa6d8acc..ff3a1a47288a6 100644 --- a/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c +++ b/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c @@ -4,7 +4,7 @@ // // RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 \ // RUN: -fms-secure-hotpatch-functions-list=hp1,hp2,hp3,hp4,hp5_phi_ptr_mixed,hp_phi_ptr_both,hp_const_ptr_sub \ -// RUN: /clang:-S /clang:-o- %s | FileCheck %s +// RUN: /clang:-S /clang:-o- -- %s | FileCheck %s #ifdef __clang__ #define NO_TAIL __attribute__((disable_tail_calls)) diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch-lto.c b/clang/test/CodeGen/X86/ms-secure-hotpatch-lto.c index 6adb0b1818e31..cbf19adb4739f 100644 --- a/clang/test/CodeGen/X86/ms-secure-hotpatch-lto.c +++ 
b/clang/test/CodeGen/X86/ms-secure-hotpatch-lto.c @@ -2,7 +2,7 @@ // This verifies that hotpatch function attributes are correctly propagated through LLVM IR when compiling with LTO. // -// RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-list=this_gets_hotpatched -flto /Fo%t.bc %s +// RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-list=this_gets_hotpatched -flto /Fo%t.bc -- %s // RUN: llvm-dis %t.bc -o - | FileCheck %s // // CHECK-LABEL: define dso_local noundef i32 @this_gets_hotpatched() diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch.c b/clang/test/CodeGen/X86/ms-secure-hotpatch.c index b829e5acc5c83..9bc8c2cf364bf 100644 --- a/clang/test/CodeGen/X86/ms-secure-hotpatch.c +++ b/clang/test/CodeGen/X86/ms-secure-hotpatch.c @@ -3,7 +3,7 @@ // This verifies that hotpatch function attributes are correctly propagated when compiling directly to OBJ. // // RUN: echo this_gets_hotpatched > %t.patch-functions.txt -// RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-file=%t.patch-functions.txt /Fo%t.obj %s +// RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 /Z7 -fms-secure-hotpatch-functions-file=%t.patch-functions.txt /Fo%t.obj -- %s // RUN: llvm-readobj --codeview %t.obj | FileCheck %s void this_might_have_side_effects(); diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c b/clang/test/CodeGen/arm64-microsoft-intrinsics.c index e18977a4559b1..51e0038b64cde 100644 --- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c +++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c @@ -21,6 +21,36 @@ long test_InterlockedAdd_constant(long volatile *Addend) { // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]] // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd' +long test_InterlockedAdd_acq(long volatile *Addend, long Value) { + return _InterlockedAdd_acq(Addend, Value); +} + +// CHECK-LABEL: define {{.*}} i32 @test_InterlockedAdd_acq(ptr %Addend, 
i32 %Value) {{.*}} { +// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i32 %2 acquire, align 4 +// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %2 +// CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]] +// CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_acq' + +long test_InterlockedAdd_nf(long volatile *Addend, long Value) { + return _InterlockedAdd_nf(Addend, Value); +} + +// CHECK-LABEL: define {{.*}} i32 @test_InterlockedAdd_nf(ptr %Addend, i32 %Value) {{.*}} { +// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i32 %2 monotonic, align 4 +// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %2 +// CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]] +// CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_nf' + +long test_InterlockedAdd_rel(long volatile *Addend, long Value) { + return _InterlockedAdd_rel(Addend, Value); +} + +// CHECK-LABEL: define {{.*}} i32 @test_InterlockedAdd_rel(ptr %Addend, i32 %Value) {{.*}} { +// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i32 %2 release, align 4 +// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %2 +// CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]] +// CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_rel' + __int64 test_InterlockedAdd64(__int64 volatile *Addend, __int64 Value) { return _InterlockedAdd64(Addend, Value); } @@ -35,6 +65,36 @@ __int64 test_InterlockedAdd64_constant(__int64 volatile *Addend) { // CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]] // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64' +__int64 test_InterlockedAdd64_acq(__int64 volatile *Addend, __int64 Value) { + return _InterlockedAdd64_acq(Addend, Value); +} + +// CHECK-LABEL: define {{.*}} i64 @test_InterlockedAdd64_acq(ptr %Addend, i64 %Value) {{.*}} { +// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i64 %2 acquire, align 8 +// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %2 +// CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]] +// 
CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64_acq' + +__int64 test_InterlockedAdd64_nf(__int64 volatile *Addend, __int64 Value) { + return _InterlockedAdd64_nf(Addend, Value); +} + +// CHECK-LABEL: define {{.*}} i64 @test_InterlockedAdd64_nf(ptr %Addend, i64 %Value) {{.*}} { +// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i64 %2 monotonic, align 8 +// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %2 +// CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]] +// CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64_nf' + +__int64 test_InterlockedAdd64_rel(__int64 volatile *Addend, __int64 Value) { + return _InterlockedAdd64_rel(Addend, Value); +} + +// CHECK-LABEL: define {{.*}} i64 @test_InterlockedAdd64_rel(ptr %Addend, i64 %Value) {{.*}} { +// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i64 %2 release, align 8 +// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %2 +// CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]] +// CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64_rel' + void check_ReadWriteBarrier(void) { _ReadWriteBarrier(); } diff --git a/clang/test/CodeGen/builtins-overflow.c b/clang/test/CodeGen/builtins-overflow.c index 7c524723f76e8..0e04191b9e2ac 100644 --- a/clang/test/CodeGen/builtins-overflow.c +++ b/clang/test/CodeGen/builtins-overflow.c @@ -604,3 +604,15 @@ long long test_mixed_sign_mul_overflow_extend_unsigned(int x, unsigned y) { return LongLongErrorCode; return result; } + +_BitInt(65) test_mixed_sign_mul_overflow_bitint(unsigned _BitInt(65) y, _BitInt(119) a) { +// CHECK: call { i119, i1 } @llvm.umul.with.overflow.i119 +// CHECK: select i1 %{{.*}}, i119 %{{.*}}, i119 %{{.*}} +// CHECK: trunc i119 +// CHECK: zext i65 +// CHECK: store + unsigned _BitInt(65) result; + if (__builtin_mul_overflow(a, y, &result)) + return LongLongErrorCode; + return result; +} diff --git a/clang/test/CodeGen/logb_scalbn.c b/clang/test/CodeGen/logb_scalbn.c index be5e68b5fd4b0..52c52bcb292be 
100644 --- a/clang/test/CodeGen/logb_scalbn.c +++ b/clang/test/CodeGen/logb_scalbn.c @@ -4,6 +4,11 @@ // RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - -ffp-exception-behavior=strict -emit-llvm %s | FileCheck %s -check-prefixes=STRICT // RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - -ffp-exception-behavior=maytrap -emit-llvm %s | FileCheck %s -check-prefixes=MAYTRAP // RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - -fmath-errno -emit-llvm %s | FileCheck %s -check-prefixes=ERRNO +// RUN: %clang -cc1 -triple spirv64-amd-amdhsa -o - -emit-llvm %s | FileCheck %s -check-prefixes=AMDGCNSPIRV-DEFAULT +// RUN: %clang -cc1 -triple spirv64-amd-amdhsa -o - -ffp-exception-behavior=ignore -emit-llvm %s | FileCheck %s -check-prefixes=AMDGCNSPIRV-IGNORE +// RUN: %clang -cc1 -triple spirv64-amd-amdhsa -o - -ffp-exception-behavior=strict -emit-llvm %s | FileCheck %s -check-prefixes=AMDGCNSPIRV-STRICT +// RUN: %clang -cc1 -triple spirv64-amd-amdhsa -o - -ffp-exception-behavior=maytrap -emit-llvm %s | FileCheck %s -check-prefixes=AMDGCNSPIRV-MAYTRAP +// RUN: %clang -cc1 -triple spirv64-amd-amdhsa -o - -fmath-errno -emit-llvm %s | FileCheck %s -check-prefixes=AMDGCNSPIRV-ERRNO // DEFAULT-LABEL: define dso_local void @test_logbf( // DEFAULT-SAME: ) #[[ATTR0:[0-9]+]] { @@ -78,6 +83,79 @@ // ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_logbf( +// AMDGCNSPIRV-DEFAULT-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// 
AMDGCNSPIRV-DEFAULT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP4:%.*]] = call addrspace(4) float @llvm.fabs.f32(float 0x40301999A0000000) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_logbf( +// AMDGCNSPIRV-IGNORE-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP4:%.*]] = call addrspace(4) float @llvm.fabs.f32(float 0x40301999A0000000) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// AMDGCNSPIRV-IGNORE-NEXT: store float [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_logbf( +// AMDGCNSPIRV-STRICT-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// 
AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// AMDGCNSPIRV-STRICT-NEXT: [[TMP4:%.*]] = call addrspace(4) float @llvm.fabs.f32(float 0x40301999A0000000) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +// AMDGCNSPIRV-STRICT-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// AMDGCNSPIRV-STRICT-NEXT: store float [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_logbf( +// AMDGCNSPIRV-MAYTRAP-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP4:%.*]] = call addrspace(4) float @llvm.fabs.f32(float 0x40301999A0000000) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], 
float [[TMP4]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_logbf( +// AMDGCNSPIRV-ERRNO-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) float @logbf(float noundef 0x40301999A0000000) #[[ATTR2:[0-9]+]] +// AMDGCNSPIRV-ERRNO-NEXT: store float [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_logbf() { float D1 = __builtin_logbf(16.1f); } @@ -182,6 +260,107 @@ void test_logbf() { // ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_logbf_var( +// AMDGCNSPIRV-DEFAULT-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP3:%.*]] = 
add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP6:%.*]] = call addrspace(4) float @llvm.fabs.f32(float [[TMP5]]) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP6]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[TMP10]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_logbf_var( +// AMDGCNSPIRV-IGNORE-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: 
[[TMP6:%.*]] = call addrspace(4) float @llvm.fabs.f32(float [[TMP5]]) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP6]] +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// AMDGCNSPIRV-IGNORE-NEXT: store float [[TMP10]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_logbf_var( +// AMDGCNSPIRV-STRICT-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// AMDGCNSPIRV-STRICT-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP6:%.*]] = call addrspace(4) float @llvm.fabs.f32(float [[TMP5]]) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float 
[[TMP6]] +// AMDGCNSPIRV-STRICT-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// AMDGCNSPIRV-STRICT-NEXT: store float [[TMP10]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_logbf_var( +// AMDGCNSPIRV-MAYTRAP-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP6:%.*]] = call addrspace(4) float @llvm.fabs.f32(float [[TMP5]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP6]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// 
AMDGCNSPIRV-MAYTRAP-NEXT: store float [[TMP10]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_logbf_var( +// AMDGCNSPIRV-ERRNO-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) float @logbf(float noundef [[TMP0]]) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store float [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_logbf_var(float a) { float D1 = __builtin_logbf(a); } @@ -273,6 +452,79 @@ void test_logbf_var(float a) { // ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_logb( +// AMDGCNSPIRV-DEFAULT-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// 
AMDGCNSPIRV-DEFAULT-NEXT: [[TMP4:%.*]] = call addrspace(4) double @llvm.fabs.f64(double 1.510000e+01) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP5:%.*]] = fcmp one double [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_logb( +// AMDGCNSPIRV-IGNORE-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP4:%.*]] = call addrspace(4) double @llvm.fabs.f64(double 1.510000e+01) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP5:%.*]] = fcmp one double [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// AMDGCNSPIRV-IGNORE-NEXT: store double [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_logb( +// AMDGCNSPIRV-STRICT-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: 
[[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// AMDGCNSPIRV-STRICT-NEXT: [[TMP4:%.*]] = call addrspace(4) double @llvm.fabs.f64(double 1.510000e+01) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP5:%.*]] = fcmp one double [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// AMDGCNSPIRV-STRICT-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// AMDGCNSPIRV-STRICT-NEXT: store double [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_logb( +// AMDGCNSPIRV-MAYTRAP-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP4:%.*]] = call addrspace(4) double @llvm.fabs.f64(double 1.510000e+01) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP5:%.*]] = fcmp one double [[TMP4]], 0x7FF0000000000000 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP7:%.*]] = select i1 false, double 
0xFFF0000000000000, double [[TMP6]] +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[TMP7]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_logb( +// AMDGCNSPIRV-ERRNO-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) double @logb(double noundef 1.510000e+01) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store double [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_logb() { double D1 = __builtin_logb(15.1); } @@ -398,6 +650,107 @@ void test_logb() { // ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_logb_var( +// AMDGCNSPIRV-DEFAULT-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to 
double +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP6:%.*]] = call addrspace(4) double @llvm.fabs.f64(double [[TMP5]]) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[TMP10]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_logb_var( +// AMDGCNSPIRV-IGNORE-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP6:%.*]] = call addrspace(4) double @llvm.fabs.f64(double [[TMP5]]) +// 
AMDGCNSPIRV-IGNORE-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// AMDGCNSPIRV-IGNORE-NEXT: store double [[TMP10]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_logb_var( +// AMDGCNSPIRV-STRICT-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// AMDGCNSPIRV-STRICT-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP6:%.*]] = call addrspace(4) double @llvm.fabs.f64(double [[TMP5]]) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// AMDGCNSPIRV-STRICT-NEXT: [[TMP9:%.*]] = 
fcmp oeq double [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// AMDGCNSPIRV-STRICT-NEXT: store double [[TMP10]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_logb_var( +// AMDGCNSPIRV-MAYTRAP-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP5:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP6:%.*]] = call addrspace(4) double @llvm.fabs.f64(double [[TMP5]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[TMP10]], ptr 
addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_logb_var( +// AMDGCNSPIRV-ERRNO-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) double @logb(double noundef [[TMP0]]) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store double [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_logb_var(double a) { double D1 = __builtin_logb(a); } @@ -455,6 +808,51 @@ void test_logb_var(double a) { // ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbnf( +// AMDGCNSPIRV-DEFAULT-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbnf( +// AMDGCNSPIRV-IGNORE-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] 
+// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// AMDGCNSPIRV-IGNORE-NEXT: store float [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbnf( +// AMDGCNSPIRV-STRICT-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// AMDGCNSPIRV-STRICT-NEXT: store float [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbnf( +// AMDGCNSPIRV-MAYTRAP-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbnf( +// AMDGCNSPIRV-ERRNO-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) float @scalbnf(float 
noundef 0x4030B33340000000, i32 noundef 10) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store float [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbnf() { float D1 = __builtin_scalbnf(16.7f, 10); } @@ -535,6 +933,71 @@ void test_scalbnf() { // ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbnf_var1( +// AMDGCNSPIRV-DEFAULT-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbnf_var1( +// AMDGCNSPIRV-IGNORE-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: 
[[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// AMDGCNSPIRV-IGNORE-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbnf_var1( +// AMDGCNSPIRV-STRICT-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// AMDGCNSPIRV-STRICT-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbnf_var1( +// AMDGCNSPIRV-MAYTRAP-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 
+// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbnf_var1( +// AMDGCNSPIRV-ERRNO-SAME: float noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) float @scalbnf(float noundef [[TMP0]], i32 noundef 9) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store float [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbnf_var1(float a) { float D1 = __builtin_scalbnf(a, 9); } @@ -615,6 +1078,71 @@ void test_scalbnf_var1(float a) { // ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbnf_var2( +// AMDGCNSPIRV-DEFAULT-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: 
store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbnf_var2( +// AMDGCNSPIRV-IGNORE-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// AMDGCNSPIRV-IGNORE-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbnf_var2( +// AMDGCNSPIRV-STRICT-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 
4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// AMDGCNSPIRV-STRICT-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbnf_var2( +// AMDGCNSPIRV-MAYTRAP-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbnf_var2( +// AMDGCNSPIRV-ERRNO-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load i32, ptr 
addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) float @scalbnf(float noundef 0x402E666660000000, i32 noundef [[TMP0]]) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store float [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbnf_var2(int b) { float D1 = __builtin_scalbnf(15.2f, b); } @@ -719,6 +1247,91 @@ void test_scalbnf_var2(int b) { // ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbnf_var3( +// AMDGCNSPIRV-DEFAULT-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP2:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-DEFAULT-NEXT: store float [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbnf_var3( +// 
AMDGCNSPIRV-IGNORE-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP2:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-IGNORE-NEXT: store float [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbnf_var3( +// AMDGCNSPIRV-STRICT-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// 
AMDGCNSPIRV-STRICT-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP2:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-STRICT-NEXT: store float [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbnf_var3( +// AMDGCNSPIRV-MAYTRAP-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP2:%.*]] = call addrspace(4) float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: store float [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// 
AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbnf_var3( +// AMDGCNSPIRV-ERRNO-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store float [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) float @scalbnf(float noundef [[TMP0]], i32 noundef [[TMP1]]) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store float [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbnf_var3(float a, int b) { float D1 = __builtin_scalbnf(a, b); } @@ -776,6 +1389,51 @@ void test_scalbnf_var3(float a, int b) { // ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbn( +// AMDGCNSPIRV-DEFAULT-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = call addrspace(4) double 
@llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbn( +// AMDGCNSPIRV-IGNORE-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// AMDGCNSPIRV-IGNORE-NEXT: store double [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbn( +// AMDGCNSPIRV-STRICT-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// AMDGCNSPIRV-STRICT-NEXT: store double [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbn( +// AMDGCNSPIRV-MAYTRAP-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[TMP0]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbn( +// 
AMDGCNSPIRV-ERRNO-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) double @scalbn(double noundef 1.720000e+01, i32 noundef 10) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store double [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbn() { double D1 = __builtin_scalbn(17.2, 10); } @@ -856,6 +1514,71 @@ void test_scalbn() { // ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbn_var1( +// AMDGCNSPIRV-DEFAULT-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbn_var1( +// AMDGCNSPIRV-IGNORE-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: 
[[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// AMDGCNSPIRV-IGNORE-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbn_var1( +// AMDGCNSPIRV-STRICT-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// AMDGCNSPIRV-STRICT-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbn_var1( +// AMDGCNSPIRV-MAYTRAP-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8 +// 
AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbn_var1( +// AMDGCNSPIRV-ERRNO-SAME: double noundef [[A:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) double @scalbn(double noundef [[TMP0]], i32 noundef 9) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store double [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbn_var1(double a) { double D1 = __builtin_scalbn(a, 9); } @@ -936,6 +1659,71 @@ void test_scalbn_var1(double a) { // ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbn_var2( +// AMDGCNSPIRV-DEFAULT-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// 
AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbn_var2( +// AMDGCNSPIRV-IGNORE-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// AMDGCNSPIRV-IGNORE-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbn_var2( +// AMDGCNSPIRV-STRICT-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// 
AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// AMDGCNSPIRV-STRICT-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbn_var2( +// AMDGCNSPIRV-MAYTRAP-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[TMP1]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbn_var2( +// AMDGCNSPIRV-ERRNO-SAME: i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 
+// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) double @scalbn(double noundef 1.540000e+01, i32 noundef [[TMP0]]) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store double [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbn_var2(int b) { double D1 = __builtin_scalbn(15.4, b); } @@ -1040,6 +1828,91 @@ void test_scalbn_var2(int b) { // ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 // ERRNO-NEXT: ret void // +// AMDGCNSPIRV-DEFAULT-LABEL: define spir_func void @test_scalbn_var3( +// AMDGCNSPIRV-DEFAULT-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-DEFAULT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP1:%.*]] = 
load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-DEFAULT-NEXT: [[TMP2:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-DEFAULT-NEXT: store double [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-DEFAULT-NEXT: ret void +// +// AMDGCNSPIRV-IGNORE-LABEL: define spir_func void @test_scalbn_var3( +// AMDGCNSPIRV-IGNORE-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-IGNORE-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-IGNORE-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-IGNORE-NEXT: [[TMP2:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-IGNORE-NEXT: store double [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-IGNORE-NEXT: ret void +// +// AMDGCNSPIRV-STRICT-LABEL: define spir_func void @test_scalbn_var3( +// AMDGCNSPIRV-STRICT-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-STRICT-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR:%.*]] = 
alloca i32, align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-STRICT-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-STRICT-NEXT: [[TMP2:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-STRICT-NEXT: store double [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-STRICT-NEXT: ret void +// +// AMDGCNSPIRV-MAYTRAP-LABEL: define spir_func void @test_scalbn_var3( +// AMDGCNSPIRV-MAYTRAP-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-MAYTRAP-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP0:%.*]] = load double, ptr 
addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-MAYTRAP-NEXT: [[TMP2:%.*]] = call addrspace(4) double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// AMDGCNSPIRV-MAYTRAP-NEXT: store double [[TMP2]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-MAYTRAP-NEXT: ret void +// +// AMDGCNSPIRV-ERRNO-LABEL: define spir_func void @test_scalbn_var3( +// AMDGCNSPIRV-ERRNO-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-ERRNO-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr [[D1]] to ptr addrspace(4) +// AMDGCNSPIRV-ERRNO-NEXT: store double [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: store i32 [[B]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[B_ADDR_ASCAST]], align 4 +// AMDGCNSPIRV-ERRNO-NEXT: [[CALL:%.*]] = call spir_func addrspace(4) double @scalbn(double noundef [[TMP0]], i32 noundef [[TMP1]]) #[[ATTR2]] +// AMDGCNSPIRV-ERRNO-NEXT: store double [[CALL]], ptr addrspace(4) [[D1_ASCAST]], align 8 +// AMDGCNSPIRV-ERRNO-NEXT: ret void +// void test_scalbn_var3(double a, int b) { double D1 = __builtin_scalbn(a, b); } diff --git a/clang/test/CodeGen/pragma-comment.c b/clang/test/CodeGen/pragma-comment.c index 861fba9aece3b..aa3aba18b9b2c 100644 --- 
a/clang/test/CodeGen/pragma-comment.c +++ b/clang/test/CodeGen/pragma-comment.c @@ -34,4 +34,4 @@ // ELF-NOT: foo // This following match prevents the clang version metadata from matching the forbidden 'foo' and 'bar' tokens. // This can happen if the clang version string contains a Git repo URL that includes one of those substrings. -// ELF-LABEL: !"clang version +// ELF-LABEL: {{\!\".*clang version}} diff --git a/clang/test/CodeGenCUDA/bf16.cu b/clang/test/CodeGenCUDA/bf16.cu index df56ec60c63ae..12474381e718b 100644 --- a/clang/test/CodeGenCUDA/bf16.cu +++ b/clang/test/CodeGenCUDA/bf16.cu @@ -37,11 +37,7 @@ __device__ __bf16 test_call( __bf16 in) { // CHECK: ld.param.b16 %[[R:rs[0-9]+]], [_Z9test_callDF16b_param_0]; // CHECK: st.param.b16 [param0], %[[R]]; // CHECK: .param .align 2 .b8 retval0[2]; -// CHECK: call.uni (retval0), -// CHECK-NEXT: _Z13external_funcDF16b, -// CHECK-NEXT: ( -// CHECK-NEXT: param0 -// CHECK-NEXT ); +// CHECK: call.uni (retval0), _Z13external_funcDF16b, (param0); // CHECK: ld.param.b16 %[[RET:rs[0-9]+]], [retval0]; return external_func(in); // CHECK: st.param.b16 [func_retval0], %[[RET]] diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 3709b1ff52f35..864e301859682 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -2,6 +2,10 @@ // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s // REQUIRES: amdgpu-registered-target +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef half __attribute__((ext_vector_type(2))) half2; + // CHECK-LABEL: @test_setprio_inc_wg( // CHECK-NEXT: entry: // CHECK-NEXT: call void @llvm.amdgcn.s.setprio.inc.wg(i16 10) @@ -10,3 +14,43 @@ void test_setprio_inc_wg() { __builtin_amdgcn_s_setprio_inc_wg(10); } + +// CHECK-LABEL: @test_cvt_pk_f16_fp8( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0 +// CHECK-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_pk_f16_fp8(global half2* out, short a) +{ + out[0] = __builtin_amdgcn_cvt_pk_f16_fp8(a); +} + +// CHECK-LABEL: @test_cvt_pk_f16_bf8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0 +// CHECK-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) 
[[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_pk_f16_bf8(global half2* out, short a) +{ + out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); +} diff --git a/clang/test/Index/inline-assembly.c b/clang/test/Index/inline-assembly.c new file mode 100644 index 0000000000000..64a7ce03852c9 --- /dev/null +++ b/clang/test/Index/inline-assembly.c @@ -0,0 +1,46 @@ +static void inline_assembly_template_regardless_of_target_machine() { + int tmp; + asm volatile ( + "nop\n" + "a_value %w[v]\n" + "o_value %w[o]" + : [v] "=&r" (tmp) + : [o] "r" (tmp) + : "cc", "memory" + ); +} + +// RUN: c-index-test -test-inline-assembly %s 2>&1 | FileCheck %s +// CHECK: ===ASM TEMPLATE=== +// CHECK: nop +// CHECK: a_value ${0:w} +// CHECK: o_value ${1:w} +// CHECK: ===ASM TEMPLATE END=== +// CHECK: volatile: true +// CHECK: Output #0 Constraint (=&r): DeclRefExpr=tmp:2:9 +// CHECK: Input #0 Constraint (r): UnexposedExpr=tmp:2:9 +// CHECK: Clobber #0: cc +// CHECK: Clobber #1: memory +// CHECK: ===ASM END=== + +static void inline_assembly_valid_x86_example() { + int tmp; + asm ( + "nop\n" + "mov %w[o], %w[v]" + : [v] "=&r" (tmp) + : [o] "r" (tmp) + : "cc", "memory" + ); +} + +// CHECK: ===ASM TEMPLATE=== +// CHECK: nop +// CHECK: mov ${1:w}, ${0:w} +// CHECK: ===ASM TEMPLATE END=== +// CHECK: volatile: false +// CHECK: Output #0 Constraint (=&r): DeclRefExpr=tmp:27:9 +// CHECK: Input #0 Constraint (r): UnexposedExpr=tmp:27:9 +// CHECK: Clobber #0: cc +// CHECK: Clobber #1: memory +// CHECK: ===ASM END=== diff --git a/clang/test/Modules/pr118137.cppm b/clang/test/Modules/pr118137.cppm new file mode 100644 index 0000000000000..38e35399b05c0 --- /dev/null +++ b/clang/test/Modules/pr118137.cppm @@ -0,0 +1,24 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++23 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++23 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++23 %t/a.cppm 
-emit-llvm -o - + +//--- a.h +typedef int nghttp2_session_callbacks; + +//--- a.cppm +module; +#include "a.h" +export module g; +template +concept Deleter = requires(T ptr) { ptr; }; +template > struct Handle { + void GetRaw(this auto); +}; +struct SessionCallbacksDeleter + : Handle { +} Server_callbacks; +void Server() { Server_callbacks.GetRaw(); } diff --git a/clang/test/Modules/template-declare.cppm b/clang/test/Modules/template-declare.cppm new file mode 100644 index 0000000000000..01a7cca10e4ee --- /dev/null +++ b/clang/test/Modules/template-declare.cppm @@ -0,0 +1,39 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -dwarf-version=4 -debug-info-kind=constructor \ +// RUN: -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/b.cppm -dwarf-version=4 -debug-info-kind=constructor \ +// RUN: -emit-module-interface -o %t/b.pcm -fmodule-file=a=%t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/b.cpp -dwarf-version=4 -debug-info-kind=constructor \ +// RUN: -emit-llvm -o - -fmodule-file=a=%t/a.pcm -fmodule-file=b=%t/b.pcm | FileCheck %t/b.cpp + +//--- a.cppm +export module a; +export template +class a { +private: + T *data; + +public: + virtual T* getData(); +}; + +extern template class a; + +//--- b.cppm +export module b; +import a; +export struct b { + a v; +}; + +//--- b.cpp +module b; +extern "C" void func() { + b(); +} + +// It is fine enough to check that we won't crash. 
+// CHECK: define {{.*}}void @func() diff --git a/clang/test/OpenMP/declare_mapper_messages.c b/clang/test/OpenMP/declare_mapper_messages.c index 2238689227311..4631016698c7d 100644 --- a/clang/test/OpenMP/declare_mapper_messages.c +++ b/clang/test/OpenMP/declare_mapper_messages.c @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -verify=omp50,expected -fopenmp -fopenmp-version=50 -ferror-limit 100 -DOMP50 %s -// RUN: %clang_cc1 -verify=omp51,expected -fopenmp -ferror-limit 100 %s -// RUN: %clang_cc1 -verify=expected,omp52 -fopenmp -fopenmp-version=52 -ferror-limit 100 -DOMP52 %s -// RUN: %clang_cc1 -verify=expected,omp60 -fopenmp -fopenmp-version=60 -ferror-limit 100 -DOMP60 %s +// RUN: %clang_cc1 -verify=omp50,omp5x,expected -fopenmp -fopenmp-version=50 -ferror-limit 100 -DOMP50 %s +// RUN: %clang_cc1 -verify=omp51,omp5x,expected -fopenmp -ferror-limit 100 %s +// RUN: %clang_cc1 -verify=expected,omp52,omp5x -fopenmp -fopenmp-version=52 -ferror-limit 100 -DOMP52 %s +// RUN: %clang_cc1 -verify=expected,omp60,omp60-maybe-simd -fopenmp -fopenmp-version=60 -ferror-limit 100 -DOMP60 %s -// RUN: %clang_cc1 -verify=omp50,expected -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -DOMP50 %s -// RUN: %clang_cc1 -verify=omp51-simd,expected -fopenmp-simd -ferror-limit 100 %s -// RUN: %clang_cc1 -verify=expected,omp52 -fopenmp-simd -fopenmp-version=52 -ferror-limit 100 -DOMP52 %s -// RUN: %clang_cc1 -verify=expected,omp60-simd -fopenmp-simd -fopenmp-version=60 -ferror-limit 100 -DOMP60 %s +// RUN: %clang_cc1 -verify=omp50,omp5x,expected -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -DOMP50 %s +// RUN: %clang_cc1 -verify=omp51-simd,omp5x,expected -fopenmp-simd -ferror-limit 100 %s +// RUN: %clang_cc1 -verify=expected,omp52,omp5x -fopenmp-simd -fopenmp-version=52 -ferror-limit 100 -DOMP52 %s +// RUN: %clang_cc1 -verify=expected,omp60-simd,omp60-maybe-simd -fopenmp-simd -fopenmp-version=60 -ferror-limit 100 -DOMP60 %s int temp; // expected-note {{'temp' declared here}} @@ -16,17 
+16,38 @@ struct vec { // expec double *data; }; -#pragma omp declare mapper // expected-error {{expected '(' after 'declare mapper'}} -#pragma omp declare mapper { // expected-error {{expected '(' after 'declare mapper'}} -#pragma omp declare mapper( // expected-error {{expected a type}} expected-error {{expected declarator on 'omp declare mapper' directive}} -#pragma omp declare mapper(# // expected-error {{expected a type}} expected-error {{expected declarator on 'omp declare mapper' directive}} -#pragma omp declare mapper(struct v // expected-error {{expected declarator on 'omp declare mapper' directive}} -#pragma omp declare mapper(struct vec // expected-error {{expected declarator on 'omp declare mapper' directive}} -#pragma omp declare mapper(S v // expected-error {{unknown type name 'S'}} -#pragma omp declare mapper(struct vec v // expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare mapper(aa:struct vec v) // expected-error {{expected at least one clause on '#pragma omp declare mapper' directive}} -#pragma omp declare mapper(bb:struct vec v) private(v) // expected-error {{expected at least one clause on '#pragma omp declare mapper' directive}} // expected-error {{unexpected OpenMP clause 'private' in directive '#pragma omp declare mapper'}} -#pragma omp declare mapper(cc:struct vec v) map(v) ( // expected-warning {{extra tokens at the end of '#pragma omp declare mapper' are ignored}} +// omp60-maybe-simd-error@+2 {{expected '(' after 'declare_mapper'}} +// omp5x-error@+1 {{expected '(' after 'declare mapper'}} +#pragma omp declare mapper +// omp60-maybe-simd-error@+2 {{expected '(' after 'declare_mapper'}} +// omp5x-error@+1 {{expected '(' after 'declare mapper'}} +#pragma omp declare mapper { +// expected-error@+2 {{expected a type}} +// expected-error@+1 {{expected declarator on 'omp declare mapper' directive}} +#pragma omp declare mapper( +// expected-error@+2 {{expected a type}} +// expected-error@+1 {{expected 
declarator on 'omp declare mapper' directive}} +#pragma omp declare mapper(# +// expected-error@+1 {{expected declarator on 'omp declare mapper' directive}} +#pragma omp declare mapper(struct v +// expected-error@+1 {{expected declarator on 'omp declare mapper' directive}} +#pragma omp declare mapper(struct vec +// expected-error@+1 {{unknown type name 'S'}} +#pragma omp declare mapper(S v +// expected-error@+2 {{expected ')'}} +// expected-note@+1 {{to match this '('}} +#pragma omp declare mapper(struct vec v +// omp60-maybe-simd-error@+2 {{expected at least one clause on '#pragma omp declare_mapper' directive}} +// omp5x-error@+1 {{expected at least one clause on '#pragma omp declare mapper' directive}} +#pragma omp declare mapper(aa:struct vec v) +// omp60-maybe-simd-error@+4 {{expected at least one clause on '#pragma omp declare_mapper' directive}} +// omp60-maybe-simd-error@+3 {{unexpected OpenMP clause 'private' in directive '#pragma omp declare_mapper'}} +// omp5x-error@+2 {{expected at least one clause on '#pragma omp declare mapper' directive}} +// omp5x-error@+1 {{unexpected OpenMP clause 'private' in directive '#pragma omp declare mapper'}} +#pragma omp declare mapper(bb:struct vec v) private(v) +// omp60-maybe-simd-warning@+2 {{extra tokens at the end of '#pragma omp declare_mapper' are ignored}} +// omp5x-warning@+1 {{extra tokens at the end of '#pragma omp declare mapper' are ignored}} +#pragma omp declare mapper(cc:struct vec v) map(v) ( #pragma omp declare mapper(++: struct vec v) map(v.len) // expected-error {{illegal OpenMP user-defined mapper identifier}} #pragma omp declare mapper(id1: struct vec v) map(v.len, temp) // expected-error {{only variable 'v' is allowed in map clauses of this 'omp declare mapper' directive}} @@ -58,7 +79,9 @@ int fun(int arg) { #pragma omp declare mapper(id: struct vec v) map(v.len) // expected-note {{previous definition is here}} #pragma omp declare mapper(id: struct vec v) map(v.len) // expected-error {{redefinition 
of user-defined mapper for type 'struct vec' with name 'id'}} { -#pragma omp declare mapper(id: struct vec v) map(v.len) allocate(v) // expected-error {{unexpected OpenMP clause 'allocate' in directive '#pragma omp declare mapper'}} +// omp60-maybe-simd-error@+2 {{unexpected OpenMP clause 'allocate' in directive '#pragma omp declare_mapper'}} +// omp5x-error@+1 {{unexpected OpenMP clause 'allocate' in directive '#pragma omp declare mapper'}} +#pragma omp declare mapper(id: struct vec v) map(v.len) allocate(v) struct vec vv, v1; struct vec arr[10]; double d; diff --git a/clang/test/OpenMP/declare_target_messages.cpp b/clang/test/OpenMP/declare_target_messages.cpp index 3c0e766cf72ca..4aa4d686eaaf3 100644 --- a/clang/test/OpenMP/declare_target_messages.cpp +++ b/clang/test/OpenMP/declare_target_messages.cpp @@ -11,14 +11,14 @@ // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp45,omp45-to-51,omp45-to-51-var,omp45-to-51-clause,omp45-to-51-clause %{openmp45} %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host5,host-5-and-51,no-host5-and-51 %{openmp50} %{target_mac} %{limit} -o - %s -// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} %{target_mac} %{limit} -o - %s +// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} %{target_mac} %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51,dev5 %{openmp50} -fopenmp-is-target-device %{target_mac} %{aux_triple} %{limit} -o - %s -// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} -fopenmp-is-target-device %{target_mac} %{aux_triple} %{limit} -o - %s +// 
RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} -fopenmp-is-target-device %{target_mac} %{aux_triple} %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host5,host-5-and-51,no-host5-and-51 %{openmp50_simd} %{target_mac} %{limit} -o - %s -// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp60_simd} %{target_mac} %{limit} -o - %s +// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60_simd} %{target_mac} %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host5,host-5-and-51,no-host5-and-51 %{openmp50_simd} -fopenmp-is-target-device %{target_mac} %{limit} -o - %s -// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp60_simd} -fopenmp-is-target-device %{target_mac} %{limit} -o - %s +// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60_simd} -fopenmp-is-target-device %{target_mac} %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp45,omp45-to-51,omp45-to-51-var,omp45-to-51-clause -fopenmp-version=45 -fopenmp-simd %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp51,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51 -fopenmp %{limit} -o - %s @@ -26,12 +26,12 @@ // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp51,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51 -fopenmp %{limit} -I%S/Inputs 
-DTESTENDINC=1 -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp51,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51 -fopenmp-simd %{limit} -o - %s -// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp52} -DVERBOSE_MODE=1 %{limit} -o - %s -// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} -DVERBOSE_MODE=1 %{limit} -o - %s +// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp52} -DVERBOSE_MODE=1 %{limit} -o - %s +// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} -DVERBOSE_MODE=1 %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51 %{openmp50} %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp51,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51 -fopenmp %{limit} -o - %s -// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} %{limit} -o - %s +// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} %{limit} -o - %s #pragma omp begin declare target static int gg; @@ -39,7 +39,9 @@ static int gg; int recursive = recursive ^ 3 + gg; #pragma omp end declare target -// expected-error@+1 {{unexpected OpenMP directive '#pragma omp end declare target'}} +// omp60-error@+3 {{unexpected OpenMP directive '#pragma omp end declare_target'}} +// omp52-error@+2 {{unexpected OpenMP directive '#pragma omp end declare target'}} +// omp45-to-51-error@+1 
{{unexpected OpenMP directive '#pragma omp end declare target'}} #pragma omp end declare target // ompvar-error@+1 {{variable captured in declare target region must appear in a to clause}} @@ -47,14 +49,20 @@ int a, b, z; // expected-note@+1 {{defined as threadprivate or thread local}} __thread int t; -// expected-error@+1 {{expected '(' after 'declare target'}} +// omp60-error@+3 {{expected '(' after 'declare_target'}} +// omp52-error@+2 {{expected '(' after 'declare target'}} +// omp45-to-51-error@+1 {{expected '(' after 'declare target'}} #pragma omp declare target . #pragma omp declare target void f(); -// expected-warning@+1 {{extra tokens at the end of '#pragma omp end declare target' are ignored}} +// omp60-warning@+3 {{extra tokens at the end of '#pragma omp end declare_target' are ignored}} +// omp52-warning@+2 {{extra tokens at the end of '#pragma omp end declare target' are ignored}} +// omp45-to-51-warning@+1 {{extra tokens at the end of '#pragma omp end declare target' are ignored}} #pragma omp end declare target shared(a) +// omp60-error@+10 {{unexpected 'map' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}} +// omp60-error@+9 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp52-error@+8 {{unexpected 'map' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}} // omp52-error@+7 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp51-error@+6 {{unexpected 'map' clause, only 'to', 'link', 'device_type' or 'indirect' clauses expected}} @@ -65,6 +73,8 @@ void f(); // omp45-error@+1 {{expected at least one 'to' or 'link' clause}} #pragma omp declare target map(a) +// omp60-error@+5 {{unexpected 'to' clause, use 'enter' instead}} +// omp60-error@+4 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp52-error@+3 {{unexpected 'to' clause, use 'enter' instead}} // omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} // 
omp45-to-51-error@+1 {{use of undeclared identifier 'foo1'}} @@ -73,6 +83,8 @@ void f(); // expected-error@+1 {{use of undeclared identifier 'foo2'}} #pragma omp declare target link(foo2) +// omp60-error@+6 {{unexpected 'to' clause, use 'enter' instead}} +// omp60-error@+5 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}} // omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} // dev5-note@+2 {{marked as 'device_type(host)' here}} @@ -80,8 +92,8 @@ void f(); #pragma omp declare target to(f) device_type(host) void q(); -// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp5-and-51-warning@+2 {{more than one 'device_type' clause is specified}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} #pragma omp declare target to(q) device_type(any) device_type(any) device_type(host) @@ -121,7 +133,7 @@ void c(); // expected-note@+1 {{'func' defined here}} void func() {} -// omp52-error@+5 {{unexpected 'allocate' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}} +// omp52-or-later-error@+5 {{unexpected 'allocate' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}} // omp51-error@+4 {{unexpected 'allocate' clause, only 'to', 'link', 'device_type' or 'indirect' clauses expected}} // omp5-error@+3 {{unexpected 'allocate' clause, only 'to', 'link' or 'device_type' clauses expected}} // expected-error@+2 {{function name is not allowed in 'link' clause}} @@ -171,7 +183,9 @@ void t2() { void abc(); #pragma omp end declare target void cba(); -// expected-error@+1 {{unexpected OpenMP directive '#pragma omp end declare 
target'}} +// omp60-error@+3 {{unexpected OpenMP directive '#pragma omp end declare_target'}} +// omp52-error@+2 {{unexpected OpenMP directive '#pragma omp end declare target'}} +// omp45-to-51-error@+1 {{unexpected OpenMP directive '#pragma omp end declare target'}} #pragma omp end declare target #pragma omp declare target @@ -234,7 +248,9 @@ void foo1() { #pragma omp end declare target #pragma omp end declare target -// expected-error@+1 {{unexpected OpenMP directive '#pragma omp end declare target'}} +// omp60-error@+3 {{unexpected OpenMP directive '#pragma omp end declare_target'}} +// omp52-error@+2 {{unexpected OpenMP directive '#pragma omp end declare target'}} +// omp45-to-51-error@+1 {{unexpected OpenMP directive '#pragma omp end declare target'}} #pragma omp end declare target int C::method() { @@ -255,18 +271,22 @@ int *y; int **w = &y; int main (int argc, char **argv) { int a = 2; -// expected-error@+1 {{unexpected OpenMP directive '#pragma omp declare target'}} +// omp60-error@+3 {{unexpected OpenMP directive '#pragma omp declare_target'}} +// omp52-error@+2 {{unexpected OpenMP directive '#pragma omp declare target'}} +// omp45-to-51-error@+1 {{unexpected OpenMP directive '#pragma omp declare target'}} #pragma omp declare target int v; -// expected-error@+1 {{unexpected OpenMP directive '#pragma omp end declare target'}} +// omp60-error@+3 {{unexpected OpenMP directive '#pragma omp end declare_target'}} +// omp52-error@+2 {{unexpected OpenMP directive '#pragma omp end declare target'}} +// omp45-to-51-error@+1 {{unexpected OpenMP directive '#pragma omp end declare target'}} #pragma omp end declare target foo(v); - // omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} - // omp52-error@+1 {{unexpected 'to' clause, use 'enter' instead}} + // omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} + // omp52-or-later-error@+1 {{unexpected 'to' clause, use 'enter' instead}} #pragma omp declare target 
to(foo3) link(w) - // omp52-error@+3 {{unexpected 'to' clause, use 'enter' instead}} - // omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} + // omp52-or-later-error@+3 {{unexpected 'to' clause, use 'enter' instead}} + // omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp45-to-51-var-error@+1 {{local variable 'a' should not be used in 'declare target' directive}} #pragma omp declare target to(a) return (0); @@ -283,48 +303,48 @@ namespace { // expected-error@+1 {{'x' appears multiple times in clauses on the same declare target directive}} #pragma omp declare target (x, x) -// omp52-error@+3 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+3 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp45-to-51-clause-error@+1 {{'x' appears multiple times in clauses on the same declare target directive}} #pragma omp declare target to(x) to(x) // expected-error@+1 {{'x' must not appear in both clauses 'to' and 'link'}} #pragma omp declare target link(x) void bazz() {} -// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} // host5-note@+2 3 {{marked as 'device_type(nohost)' here}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} #pragma omp declare target to(bazz) device_type(nohost) void bazzz() {bazz();} -// omp52-error@+3 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+3 {{unexpected 'to' clause, use 
'enter' instead}} +// omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} #pragma omp declare target to(bazzz) device_type(nohost) // host5-error@+1 {{function with 'device_type(nohost)' is not available on host}} void any() {bazz();} // host5-error@+1 {{function with 'device_type(nohost)' is not available on host}} void host1() {bazz();} -// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} // dev5-note@+2 3 {{marked as 'device_type(host)' here}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} #pragma omp declare target to(host1) device_type(host) //host5-error@+1 {{function with 'device_type(nohost)' is not available on host}} void host2() {bazz();} -// omp52-error@+2 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+2 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}} #pragma omp declare target to(host2) // dev5-error@+1 {{function with 'device_type(host)' is not available on device}} void device() {host1();} -// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} // host5-note@+2 2 {{marked as 'device_type(nohost)' here}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 
'link' clauses expected}} #pragma omp declare target to(device) device_type(nohost) void host3() {host1();} // dev5-error {{function with 'device_type(host)' is not available on device}} -// omp52-error@+2 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+2 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}} #pragma omp declare target to(host3) #pragma omp declare target @@ -343,17 +363,17 @@ void any7() {device();} void any8() {any2();} int MultiDevTy; -// omp52-error@+3 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+3 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} #pragma omp declare target to(MultiDevTy) device_type(any) -// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} // host-5-and-51-error@+2 {{'device_type(host)' does not match previously specified 'device_type(any)' for the same declaration}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} #pragma omp declare target to(MultiDevTy) device_type(host) -// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}} -// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}} +// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}} +// omp52-or-later-error@+3 {{expected at least one 
'enter', 'link' or 'indirect' clause}} // no-host5-and-51-error@+2 {{'device_type(nohost)' does not match previously specified 'device_type(any)' for the same declaration}} // omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} #pragma omp declare target to(MultiDevTy) device_type(nohost) @@ -414,6 +434,8 @@ target *S1 = &S; // expected-warning@+1 {{expected '#pragma omp end declare target' at end of file to match '#pragma omp declare target'}} #pragma omp declare target #else -// expected-warning@+1 {{expected '#pragma omp end declare target' at end of file to match '#pragma omp begin declare target'}} +// omp60-warning@+3 {{expected '#pragma omp end declare target' at end of file to match '#pragma omp begin declare_target'}} +// omp52-warning@+2 {{expected '#pragma omp end declare target' at end of file to match '#pragma omp begin declare target'}} +// omp45-to-51-warning@+1 {{expected '#pragma omp end declare target' at end of file to match '#pragma omp begin declare target'}} #pragma omp begin declare target #endif diff --git a/clang/test/OpenMP/declare_variant_clauses_messages.cpp b/clang/test/OpenMP/declare_variant_clauses_messages.cpp index aadded7699ea1..bca91481220ff 100644 --- a/clang/test/OpenMP/declare_variant_clauses_messages.cpp +++ b/clang/test/OpenMP/declare_variant_clauses_messages.cpp @@ -152,7 +152,7 @@ void vararg_bar2(const char *fmt) { return; } #pragma omp declare variant(foo_v1) match(construct={dispatch}) \ append_args(foobar(target)) -// expected-error@+2 {{directive '#pragma omp declare variant' cannot contain more than one 'append_args' clause}} +// expected-error@+2 {{directive '#pragma omp declare_variant' cannot contain more than one 'append_args' clause}} #pragma omp declare variant(foo_v1) match(construct={dispatch}) \ append_args(interop(target)) \ append_args(interop(targetsync)) diff --git a/clang/test/OpenMP/target_data_ast_print.cpp b/clang/test/OpenMP/target_data_ast_print.cpp index 
a41c7f1a0da53..3f939549dcb54 100644 --- a/clang/test/OpenMP/target_data_ast_print.cpp +++ b/clang/test/OpenMP/target_data_ast_print.cpp @@ -102,7 +102,7 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: T i, j, b, c, d, e, x[20]; // CHECK-NEXT: #pragma omp target data map(to: c){{$}} // CHECK-NEXT: i = argc; -// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0) +// CHECK-NEXT: #pragma omp target data map(to: c) if(target{{[ _]}}data: j > 0) // CHECK-NEXT: foo(); // CHECK-NEXT: #pragma omp target data map(to: c) if(b) // CHECK-NEXT: foo(); @@ -140,7 +140,7 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: int i, j, b, c, d, e, x[20]; // CHECK-NEXT: #pragma omp target data map(to: c) // CHECK-NEXT: i = argc; -// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0) +// CHECK-NEXT: #pragma omp target data map(to: c) if(target{{[ _]}}data: j > 0) // CHECK-NEXT: foo(); // CHECK-NEXT: #pragma omp target data map(to: c) if(b) // CHECK-NEXT: foo(); @@ -178,7 +178,7 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: char i, j, b, c, d, e, x[20]; // CHECK-NEXT: #pragma omp target data map(to: c) // CHECK-NEXT: i = argc; -// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0) +// CHECK-NEXT: #pragma omp target data map(to: c) if(target{{[ _]}}data: j > 0) // CHECK-NEXT: foo(); // CHECK-NEXT: #pragma omp target data map(to: c) if(b) // CHECK-NEXT: foo(); @@ -225,7 +225,7 @@ int main (int argc, char **argv) { a=2; // CHECK-NEXT: a = 2; #pragma omp target data map(to: c) if (target data: b) -// CHECK: #pragma omp target data map(to: c) if(target data: b) +// CHECK: #pragma omp target data map(to: c) if(target{{[ _]}}data: b) foo(); // CHECK-NEXT: foo(); diff --git a/clang/test/OpenMP/target_map_messages.cpp b/clang/test/OpenMP/target_map_messages.cpp index 911031d5412a9..4a026584a47cb 100644 --- a/clang/test/OpenMP/target_map_messages.cpp +++ b/clang/test/OpenMP/target_map_messages.cpp @@ -1,35 +1,35 @@ // -fopenmp, -fno-openmp-extensions 
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,omp,ge51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=51 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,ge51,ge52,lt60,omp,ge52-omp,omp52 -fopenmp -fno-openmp-extensions -fopenmp-version=52 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,ge52,ge60,omp,ge60-omp,omp60 -fopenmp -fno-openmp-extensions -fopenmp-version=60 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,ge51,lt60,omp,ge51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=51 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,ge51,ge52,lt60,omp,ge52-omp,omp52 -fopenmp -fno-openmp-extensions -fopenmp-version=52 
-ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,ge60,omp,ge60-omp,omp60 -fopenmp -fno-openmp-extensions -fopenmp-version=60 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -DCCODE -verify -fopenmp -fno-openmp-extensions -ferror-limit 300 -x c %s -Wno-openmp -Wuninitialized -Wno-vla // -fopenmp-simd, -fno-openmp-extensions -// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,omp,ge51-omp -fopenmp-simd -fno-openmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,ge51,lt60,omp,ge51-omp -fopenmp-simd -fno-openmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -DCCODE -verify -fopenmp-simd -fno-openmp-extensions -ferror-limit 300 -x c %s 
-Wno-openmp-mapping -Wuninitialized -Wno-vla // -fopenmp -fopenmp-extensions -// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,ompx,ge51-ompx -fopenmp -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,ge51,lt60,ompx,ge51-ompx -fopenmp -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -DCCODE -verify -fopenmp -fopenmp-extensions -ferror-limit 300 -x c %s -Wno-openmp -Wuninitialized -Wno-vla // -fopenmp-simd -fopenmp-extensions -// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s 
-Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla -// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,ompx,ge51-ompx -fopenmp-simd -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla +// RUN: %clang_cc1 -verify=expected,omp5x,ge50,ge51,lt60,ompx,ge51-ompx -fopenmp-simd -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla // RUN: %clang_cc1 -DCCODE -verify -fopenmp-simd -fopenmp-extensions -ferror-limit 300 -x c %s -Wno-openmp-mapping -Wuninitialized -Wno-vla // Check @@ -87,7 +87,7 @@ struct SA { {} #pragma omp target map(arg[2:2],a,d) // expected-error {{subscripted value is not an array or pointer}} {} - #pragma omp target map(arg,a*2) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}} + #pragma omp target map(arg,a*2) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}} ge60-error {{expected addressable lvalue in 'map' clause}} {} #pragma omp target map(arg,(c+1)[2]) // lt50-error {{expected expression containing only member 
accesses and/or array sections based on named variables}} {} @@ -170,27 +170,32 @@ struct SA { // expected-error@+1 {{use of undeclared identifier 'present'}} #pragma omp target map(present) {} + // ge60-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator', 'self}} // ge52-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}} // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}} // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(ompx_hold, tofrom: c,f) {} + // ge60-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator', 'self}} // ge52-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}} // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}} // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(ompx_hold, tofrom: c[1:2],f) {} + // ge60-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator', 'self}} // ge52-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}} // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}} // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(ompx_hold, tofrom: c,f[1:2]) {} + // ge60-error@+5 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator', 'self}} // ge52-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}} // expected-error@+3 {{section length 
is unspecified and cannot be inferred because subscripted value is not an array}} // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}} // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(ompx_hold, tofrom: c[:],f) {} + // ge60-error@+5 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator', 'self}} // ge52-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}} // expected-error@+3 {{section length is unspecified and cannot be inferred because subscripted value is not an array}} // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}} @@ -211,12 +216,14 @@ struct SA { // lt51-error@+1 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(present, present, tofrom: a) {} + // ge60-error@+5 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator', 'self}} // ge52-error@+4 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}} // ompx-error@+3 {{same map type modifier has been specified more than once}} // ge51-omp-error@+2 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}} // lt51-omp-error@+1 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(ompx_hold, ompx_hold, tofrom: a) {} + // ge60-error@+10 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator', 'self}} // ge60-error@+9 {{same map type modifier has been specified more than once}} // ge52-error@+8 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}} // expected-error@+7 2 {{same map type modifier has been specified 
more than once}} @@ -247,10 +254,11 @@ struct SA { // lt60-error@+1 {{missing map type}} #pragma omp target map( , f, : a) {} - #pragma omp target map(always close: a) // lt60-error {{missing map type}} ge52-error{{missing ',' after map type modifier}} + #pragma omp target map(always close: a) // lt60-error {{missing map type}} ge52-error{{missing ',' after map type modifier}} ge60-error {{missing ',' after map type modifier}} {} - #pragma omp target map(always close bf: a) // ge52-error 2 {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} + #pragma omp target map(always close bf: a) // ge52-error 2 {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} ge60-error 2 {{missing ',' after map type modifier}} {} + // ge60-error@+5 {{missing ',' after map type modifier}} // ge52-error@+4 {{missing ',' after map type modifier}} // ge51-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}} // lt51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} @@ -263,6 +271,7 @@ struct SA { // lt51-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(tofrom from: a) {} + // ge60-error@+6 {{missing ',' after map type modifier}} // ge60-note@+5 {{map type 'to' is previous specified here}} // ge60-error@+4 {{map type is already specified}} // ge52-error@+3 {{missing ',' after map type modifier}} @@ -270,7 +279,7 @@ struct SA { // lt51-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}} #pragma omp target map(to always from: a) {} - #pragma omp target map(close bf: a) // ge52-error {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 
'delete'}} + #pragma omp target map(close bf: a) // ge52-error {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} ge60-error {{missing ',' after map type modifier}} {} #pragma omp target map(([b[I]][bf])f) // lt50-error {{expected ',' or ']' in lambda capture list}} lt50-error {{expected ')'}} lt50-note {{to match this '('}} {} @@ -476,7 +485,7 @@ void SAclient(int arg) { {} #pragma omp target map(r.S.Arr[:12]) {} -#pragma omp target map(r.S.foo() [:12]) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}} +#pragma omp target map(r.S.foo() [:12]) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}} ge60-error {{expected addressable lvalue in 'map' clause}} {} #pragma omp target map(r.C, r.D) {} @@ -673,7 +682,7 @@ T tmain(T argc) { foo(); #pragma omp target map(T) // expected-error {{'T' does not refer to a value}} foo(); -#pragma omp target map(I) // lt50-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error 2 {{expected addressable lvalue in 'map' clause}} +#pragma omp target map(I) // lt50-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error 2 {{expected addressable lvalue in 'map' clause}} ge60-error 2 {{expected addressable lvalue in 'map' clause}} foo(); #pragma omp target map(S2::S2s) foo(); @@ -690,7 +699,7 @@ T tmain(T argc) { #pragma omp target map(to, x) foo(); #pragma omp target data map(to x) // expected-error {{expected ',' or ')' in 'map' clause}} -#pragma omp target data map(tofrom: argc > 0 ? 
x : y) // lt50-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error 2 {{expected addressable lvalue in 'map' clause}} +#pragma omp target data map(tofrom: argc > 0 ? x : y) // lt50-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error 2 {{expected addressable lvalue in 'map' clause}} ge60-error 2 {{expected addressable lvalue in 'map' clause}} #pragma omp target data map(argc) #pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}} #pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning 2 {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}} @@ -809,7 +818,11 @@ int main(int argc, char **argv) { SC1 s; SC1 *p; int Arr[10]; -#pragma omp target data map // expected-error {{expected '(' after 'map'}} lt50-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}} ge50-error {{expected at least one 'map', 'use_device_ptr', or 'use_device_addr' clause for '#pragma omp target data'}} +// expected-error@+4 {{expected '(' after 'map'}} +// lt50-error@+3 {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}} +// omp5x-error@+2 {{expected at least one 'map', 'use_device_ptr', or 'use_device_addr' clause for '#pragma omp target data'}} +// ge60-error@+1 {{expected at least one 'map', 'use_device_ptr', or 'use_device_addr' clause for '#pragma omp target_data'}} +#pragma omp target data map #pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} #pragma omp target data map() // expected-error {{expected expression}} #pragma omp target data map(alloc) // 
expected-error {{use of undeclared identifier 'alloc'}} @@ -828,7 +841,7 @@ int main(int argc, char **argv) { #pragma omp target map(to, x) foo(); #pragma omp target data map(to x) // expected-error {{expected ',' or ')' in 'map' clause}} -#pragma omp target data map(tofrom: argc > 0 ? argv[1] : argv[2]) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}} +#pragma omp target data map(tofrom: argc > 0 ? argv[1] : argv[2]) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}} ge60-error {{expected addressable lvalue in 'map' clause}} #pragma omp target data map(argc) #pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}} #pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}} diff --git a/clang/test/Sema/builtins-bcd-transform.c b/clang/test/Sema/builtins-bcd-transform.c new file mode 100644 index 0000000000000..103a6be6452b5 --- /dev/null +++ b/clang/test/Sema/builtins-bcd-transform.c @@ -0,0 +1,30 @@ +// Testfile to verify the semantics and the error handling for BCD builtins national2packed, packed2zoned and zoned2packed. 
+// REQUIRES: powerpc-registered-target +// RUN: %clang_cc1 -target-feature +altivec -triple powerpc64-unknown-unknown -fsyntax-only -verify %s +// RUN: %clang_cc1 -target-feature +altivec -triple powerpc64le-unknown-unknown -fsyntax-only -verify %s +// RUN: %clang_cc1 -target-feature +altivec -triple powerpc-unknown-unknown -fsyntax-only -verify %s + +#include +vector unsigned char test_national2packed(void) +{ + vector unsigned char a = {1,2,3,4}; + vector unsigned char res_a = __builtin_ppc_national2packed(a, 2); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vector unsigned char res_b = __builtin_ppc_national2packed(a, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return __builtin_ppc_national2packed(a, 0); +} + +vector unsigned char test_packed2zoned(void) +{ + vector unsigned char a = {1,2,3,4}; + vector unsigned char res_a = __builtin_ppc_packed2zoned(a,2); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vector unsigned char res_b = __builtin_ppc_packed2zoned(a, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return __builtin_ppc_packed2zoned(a,1); +} + +vector unsigned char test_zoned2packed(void) +{ + vector unsigned char a = {1,2,3,4}; + vector unsigned char res_a = __builtin_ppc_zoned2packed(a,2); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vector unsigned char res_b = __builtin_ppc_zoned2packed(a, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return __builtin_ppc_zoned2packed(a,0); +} \ No newline at end of file diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp index 7152a5937d9b7..6f4003f525930 100644 --- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp @@ -410,3 +410,39 @@ C& C::operator=(const C&) = default; static_assert 
(!__builtin_is_cpp_trivially_relocatable(C)); static_assert (!__builtin_is_replaceable(C)); } + +namespace GH144232 { + +struct E trivially_relocatable_if_eligible replaceable_if_eligible { + E (E &&); + E &operator= (E &&) = default; +}; + +struct F trivially_relocatable_if_eligible replaceable_if_eligible { + F (F &&) = default; + F &operator= (F &&); +}; + +struct G trivially_relocatable_if_eligible replaceable_if_eligible { G (G const &) = default; }; + +struct I trivially_relocatable_if_eligible replaceable_if_eligible { I &operator= (const I &) = default; }; + +struct J trivially_relocatable_if_eligible replaceable_if_eligible { J (J const &); }; +struct K trivially_relocatable_if_eligible replaceable_if_eligible { K (K const &); }; + + + +static_assert (__builtin_is_replaceable (E)); +static_assert (__builtin_is_cpp_trivially_relocatable(E)); +static_assert (__builtin_is_replaceable (F)); +static_assert (__builtin_is_cpp_trivially_relocatable(F)); +static_assert (__builtin_is_replaceable (G)); +static_assert (__builtin_is_cpp_trivially_relocatable(G)); +static_assert (__builtin_is_replaceable (I)); +static_assert (__builtin_is_cpp_trivially_relocatable(I)); +static_assert (__builtin_is_replaceable (J)); +static_assert (__builtin_is_cpp_trivially_relocatable(J)); +static_assert (__builtin_is_replaceable (K)); +static_assert (__builtin_is_cpp_trivially_relocatable(K)); + +} diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp index 329b611110c1d..cf33ac283ab42 100644 --- a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp +++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp @@ -20,6 +20,28 @@ struct is_trivially_copyable { template constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T); + +template +struct is_assignable { + static constexpr bool value = __is_assignable(T, U); +}; + +template +constexpr bool is_assignable_v = __is_assignable(T, U); + 
+template +struct is_empty { + static constexpr bool value = __is_empty(T); +}; +template +constexpr bool is_empty_v = __is_empty(T); + +template +struct is_standard_layout { +static constexpr bool value = __is_standard_layout(T); +}; +template +constexpr bool is_standard_layout_v = __is_standard_layout(T); #endif #ifdef STD2 @@ -44,6 +66,37 @@ using is_trivially_copyable = __details_is_trivially_copyable; template constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T); + +template +struct __details_is_assignable { + static constexpr bool value = __is_assignable(T, U); +}; + +template +using is_assignable = __details_is_assignable; + +template +constexpr bool is_assignable_v = __is_assignable(T, U); + +template +struct __details_is_empty { + static constexpr bool value = __is_empty(T); +}; +template +using is_empty = __details_is_empty; +template +constexpr bool is_empty_v = __is_empty(T); + +template +struct __details_is_standard_layout { +static constexpr bool value = __is_standard_layout(T); + + +}; +template +using is_standard_layout = __details_is_standard_layout; +template +constexpr bool is_standard_layout_v = __is_standard_layout(T); #endif @@ -73,6 +126,29 @@ using is_trivially_copyable = __details_is_trivially_copyable; template constexpr bool is_trivially_copyable_v = is_trivially_copyable::value; + +template +struct __details_is_assignable : bool_constant<__is_assignable(T, U)> {}; + +template +using is_assignable = __details_is_assignable; + +template +constexpr bool is_assignable_v = is_assignable::value; + +template +struct __details_is_empty : bool_constant<__is_empty(T)> {}; +template +using is_empty = __details_is_empty; +template +constexpr bool is_empty_v = is_empty::value; + +template +struct __details_is_standard_layout : bool_constant<__is_standard_layout(T)> {}; +template +using is_standard_layout = __details_is_standard_layout; +template +constexpr bool is_standard_layout_v = is_standard_layout::value; #endif } @@ -100,6 
+176,41 @@ static_assert(std::is_trivially_copyable_v); // expected-note@-1 {{because it is a reference type}} + // Direct tests + static_assert(std::is_standard_layout::value); + static_assert(std::is_standard_layout_v); + + static_assert(std::is_standard_layout::value); + // expected-error-re@-1 {{static assertion failed due to requirement 'std::{{.*}}is_standard_layout::value'}} \ + // expected-note@-1 {{'int &' is not standard-layout}} \ + // expected-note@-1 {{because it is a reference type}} + + static_assert(std::is_standard_layout_v); + // expected-error@-1 {{static assertion failed due to requirement 'std::is_standard_layout_v'}} \ + // expected-note@-1 {{'int &' is not standard-layout}} \ + // expected-note@-1 {{because it is a reference type}} + +static_assert(!std::is_empty::value); + +static_assert(std::is_empty::value); +// expected-error-re@-1 {{static assertion failed due to requirement 'std::{{.*}}is_empty::value'}} \ +// expected-note@-1 {{'int &' is not empty}} \ +// expected-note@-1 {{because it is a reference type}} +static_assert(std::is_empty_v); +// expected-error@-1 {{static assertion failed due to requirement 'std::is_empty_v'}} \ +// expected-note@-1 {{'int &' is not empty}} \ +// expected-note@-1 {{because it is a reference type}} + + +static_assert(std::is_assignable::value); + +static_assert(std::is_assignable::value); +// expected-error-re@-1 {{static assertion failed due to requirement 'std::{{.*}}is_assignable::value'}} \ +// expected-error@-1 {{assigning to 'int' from incompatible type 'void'}} +static_assert(std::is_assignable_v); +// expected-error@-1 {{static assertion failed due to requirement 'std::is_assignable_v'}} \ +// expected-error@-1 {{assigning to 'int' from incompatible type 'void'}} + namespace test_namespace { using namespace std; static_assert(is_trivially_relocatable::value); @@ -119,6 +230,32 @@ namespace test_namespace { // expected-error@-1 {{static assertion failed due to requirement 
'is_trivially_copyable_v'}} \ // expected-note@-1 {{'int &' is not trivially copyable}} \ // expected-note@-1 {{because it is a reference type}} + + static_assert(is_standard_layout::value); + // expected-error-re@-1 {{static assertion failed due to requirement '{{.*}}is_standard_layout::value'}} \ + // expected-note@-1 {{'int &' is not standard-layout}} \ + // expected-note@-1 {{because it is a reference type}} + + static_assert(is_standard_layout_v); + // expected-error@-1 {{static assertion failed due to requirement 'is_standard_layout_v'}} \ + // expected-note@-1 {{'int &' is not standard-layout}} \ + // expected-note@-1 {{because it is a reference type}} + + static_assert(is_assignable::value); + // expected-error-re@-1 {{static assertion failed due to requirement '{{.*}}is_assignable::value'}} \ + // expected-error@-1 {{assigning to 'int' from incompatible type 'void'}} + static_assert(is_assignable_v); + // expected-error@-1 {{static assertion failed due to requirement 'is_assignable_v'}} \ + // expected-error@-1 {{assigning to 'int' from incompatible type 'void'}} + + static_assert(is_empty::value); + // expected-error-re@-1 {{static assertion failed due to requirement '{{.*}}is_empty::value'}} \ + // expected-note@-1 {{'int &' is not empty}} \ + // expected-note@-1 {{because it is a reference type}} + static_assert(is_empty_v); + // expected-error@-1 {{static assertion failed due to requirement 'is_empty_v'}} \ + // expected-note@-1 {{'int &' is not empty}} \ + // expected-note@-1 {{because it is a reference type}} } @@ -139,6 +276,14 @@ concept C2 = std::is_trivially_copyable_v; // #concept4 template void g2(); // #cand4 +template +requires std::is_assignable::value void f4(); // #cand7 + +template +concept C4 = std::is_assignable_v; // #concept8 + +template T> void g4(); // #cand8 + void test() { f(); // expected-error@-1 {{no matching function for call to 'f'}} \ @@ -169,6 +314,19 @@ void test() { // expected-note@#concept4 {{because 
'std::is_trivially_copyable_v' evaluated to false}} \ // expected-note@#concept4 {{'int &' is not trivially copyable}} \ // expected-note@#concept4 {{because it is a reference type}} + + f4(); + // expected-error@-1 {{no matching function for call to 'f4'}} \ + // expected-note@#cand7 {{candidate template ignored: constraints not satisfied [with T = int &, U = void]}} \ + // expected-note-re@#cand7 {{because '{{.*}}is_assignable::value' evaluated to false}} \ + // expected-error@#cand7 {{assigning to 'int' from incompatible type 'void'}} + + g4(); + // expected-error@-1 {{no matching function for call to 'g4'}} \ + // expected-note@#cand8 {{candidate template ignored: constraints not satisfied [with T = int &]}} \ + // expected-note@#cand8 {{because 'C4' evaluated to false}} \ + // expected-note@#concept8 {{because 'std::is_assignable_v' evaluated to false}} \ + // expected-error@#concept8 {{assigning to 'int' from incompatible type 'void'}} } } diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp index 5210354a66d43..cc923d206ab35 100644 --- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp +++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp @@ -488,3 +488,282 @@ static_assert(__is_trivially_copyable(S12)); // expected-note@-1 {{'S12' is not trivially copyable}} \ // expected-note@#tc-S12 {{'S12' defined here}} } + +namespace assignable { +struct S1; +static_assert(__is_assignable(S1&, const S1&)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(assignable::S1 &, const assignable::S1 &)'}} \ +// expected-error@-1 {{no viable overloaded '='}} \ +// expected-note@-1 {{type 'S1' is incomplete}} + +static_assert(__is_assignable(void, int)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(void, int)'}} \ +// expected-error@-1 {{expression is not assignable}} + +static_assert(__is_assignable(int, int)); +// 
expected-error@-1 {{static assertion failed due to requirement '__is_assignable(int, int)'}} \ +// expected-error@-1 {{expression is not assignable}} + +static_assert(__is_assignable(int*, int)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(int *, int)'}} \ +// expected-error@-1 {{expression is not assignable}} + +static_assert(__is_assignable(int[], int)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(int[], int)'}} \ +// expected-error@-1 {{expression is not assignable}} + +static_assert(__is_assignable(int&, void)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(int &, void)'}} \ +// expected-error@-1 {{assigning to 'int' from incompatible type 'void'}} + +static_assert(__is_assignable(int*&, float*)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(int *&, float *)'}} \ +// expected-error@-1 {{incompatible pointer types assigning to 'int *' from 'float *'}} + +static_assert(__is_assignable(const int&, int)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(const int &, int)'}} \ +// expected-error@-1 {{read-only variable is not assignable}} + +struct S2 {}; // #a-S2 +static_assert(__is_assignable(const S2, S2)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(const assignable::S2, assignable::S2)'}} \ +// expected-error@-1 {{no viable overloaded '='}} \ +// expected-note@#a-S2 {{candidate function (the implicit copy assignment operator) not viable: 'this' argument has type 'const S2', but method is not marked const}} \ +// expected-note@#a-S2 {{candidate function (the implicit move assignment operator) not viable: 'this' argument has type 'const S2', but method is not marked const}} \ +// expected-note@#a-S2 {{'S2' defined here}} + +struct S3 { // #a-S3 + S3& operator=(const S3&) = delete; // #aca-S3 + S3& operator=(S3&&) = delete; // #ama-S3 
+}; +static_assert(__is_assignable(S3, const S3&)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(assignable::S3, const assignable::S3 &)'}} \ +// expected-error@-1 {{overload resolution selected deleted operator '='}} \ +// expected-note@#aca-S3 {{candidate function has been explicitly deleted}} \ +// expected-note@#ama-S3 {{candidate function not viable: 1st argument ('const S3') would lose const qualifier}} \ +// expected-note@#a-S3 {{'S3' defined here}} +static_assert(__is_assignable(S3, S3&&)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(assignable::S3, assignable::S3 &&)'}} \ +// expected-error@-1 {{overload resolution selected deleted operator '='}} \ +// expected-note@#aca-S3 {{candidate function has been explicitly deleted}} \ +// expected-note@#ama-S3 {{candidate function has been explicitly deleted}} \ +// expected-note@#a-S3 {{'S3' defined here}} + +class C1 { // #a-C1 + C1& operator=(const C1&) = default; + C1& operator=(C1&&) = default; // #ama-C1 +}; +static_assert(__is_assignable(C1, C1)); +// expected-error@-1 {{static assertion failed due to requirement '__is_assignable(assignable::C1, assignable::C1)'}} \ +// expected-error@-1 {{'operator=' is a private member of 'assignable::C1'}} \ +// expected-note@#ama-C1 {{implicitly declared private here}} \ +// expected-note@#a-C1 {{'C1' defined here}} +} + +namespace is_empty_tests { + // Non-static data member. + struct A { int x; }; // #e-A + static_assert(__is_empty(A)); + // expected-error@-1 {{static assertion failed due to requirement '__is_empty(is_empty_tests::A)'}} \ + // expected-note@-1 {{'A' is not empty}} \ + // expected-note@-1 {{because it has a non-static data member 'x' of type 'int'}} \ + // expected-note@#e-A {{'A' defined here}} + + // Reference member. 
+ struct R {int &r; }; // #e-R + static_assert(__is_empty(R)); + // expected-error@-1 {{static assertion failed due to requirement '__is_empty(is_empty_tests::R)'}} \ + // expected-note@-1 {{'R' is not empty}} \ + // expected-note@-1 {{because it has a non-static data member 'r' of type 'int &'}} \ + // expected-note@#e-R {{'R' defined here}} + + // Virtual function. + struct VirtualFunc {virtual void f(); }; // #e-VirtualFunc + static_assert(__is_empty(VirtualFunc)); + // expected-error@-1 {{static assertion failed due to requirement '__is_empty(is_empty_tests::VirtualFunc)'}} \ + // expected-note@-1 {{'VirtualFunc' is not empty}} \ + // expected-note@-1 {{because it has a virtual function 'f'}} \ + // expected-note@#e-VirtualFunc {{'VirtualFunc' defined here}} + + // Virtual base class. + struct EB {}; + struct VB: virtual EB {}; // #e-VB + static_assert(__is_empty(VB)); + // expected-error@-1 {{static assertion failed due to requirement '__is_empty(is_empty_tests::VB)'}} \ + // expected-note@-1 {{'VB' is not empty}} \ + // expected-note@-1 {{because it has a virtual base 'EB'}} \ + // expected-note@#e-VB {{'VB' defined here}} + + // Non-empty base class. + struct Base { int b; }; // #e-Base + struct Derived : Base {}; // #e-Derived + static_assert(__is_empty(Derived)); + // expected-error@-1 {{static assertion failed due to requirement '__is_empty(is_empty_tests::Derived)'}} \ + // expected-note@-1 {{'Derived' is not empty}} \ + // expected-note@-1 {{because it has a base class 'Base' that is not empty}} \ + // expected-note@#e-Derived {{'Derived' defined here}} + + // Combination of the above. 
+ struct Multi : Base, virtual EB { // #e-Multi + int z; + virtual void g(); + }; + static_assert(__is_empty(Multi)); + // expected-error@-1 {{static assertion failed due to requirement '__is_empty(is_empty_tests::Multi)'}} \ + // expected-note@-1 {{'Multi' is not empty}} \ + // expected-note@-1 {{because it has a non-static data member 'z' of type 'int'}} \ + // expected-note@-1 {{because it has a virtual function 'g'}} \ + // expected-note@-1 {{because it has a base class 'Base' that is not empty}} \ + // expected-note@-1 {{because it has a virtual base 'EB'}} \ + // expected-note@#e-Multi {{'Multi' defined here}} + + // Zero-width bit-field. + struct BitField { int : 0; }; // #e-BitField + static_assert(__is_empty(BitField)); // no diagnostics + + // Dependent bit-field width. + template + struct DependentBitField { int : N; }; // #e-DependentBitField + + static_assert(__is_empty(DependentBitField<0>)); // no diagnostics + + static_assert(__is_empty(DependentBitField<2>)); + // expected-error@-1 {{static assertion failed due to requirement '__is_empty(is_empty_tests::DependentBitField<2>)'}} \ + // expected-note@-1 {{'DependentBitField<2>' is not empty}} \ + // expected-note@-1 {{because it field '' is a non-zero-length bit-field}} \ + // expected-note@#e-DependentBitField {{'DependentBitField<2>' defined here}} + +} + +namespace standard_layout_tests { +struct WithVirtual { // #sl-Virtual + virtual void foo(); // #sl-Virtual-Foo +}; +static_assert(__is_standard_layout(WithVirtual)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::WithVirtual)'}} \ +// expected-note@-1 {{'WithVirtual' is not standard-layout}} \ +// expected-note@-1 {{because it has a virtual function 'foo'}} \ +// expected-note@#sl-Virtual-Foo {{'foo' defined here}} \ +// expected-note@#sl-Virtual {{'WithVirtual' defined here}} + +struct MixedAccess { // #sl-Mixed +public: + int a; // #sl-MixedF1 +private: + int b; // #sl-MixedF2 
+}; +static_assert(__is_standard_layout(MixedAccess)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::MixedAccess)'}} \ +// expected-note@-1 {{'MixedAccess' is not standard-layout}} \ +// expected-note@-1 {{because it has mixed access specifiers}} \ +// expected-note@#sl-MixedF1 {{'a' defined here}} +// expected-note@#sl-MixedF2 {{field 'b' has a different access specifier than field 'a'}} +// expected-note@#sl-Mixed {{'MixedAccess' defined here}} + +struct VirtualBase { virtual ~VirtualBase(); }; // #sl-VirtualBase +struct VB : virtual VirtualBase {}; // #sl-VB +static_assert(__is_standard_layout(VB)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::VB)'}} \ +// expected-note@-1 {{'VB' is not standard-layout}} \ +// expected-note@-1 {{because it has a virtual base 'VirtualBase'}} \ +// expected-note@-1 {{because it has a non-standard-layout base 'VirtualBase'}} \ +// expected-note@-1 {{because it has a virtual function '~VB'}} \ +// expected-note@#sl-VB {{'VB' defined here}} +// expected-note@#sl-VB {{'~VB' defined here}} + +union U { // #sl-U +public: + int x; // #sl-UF1 +private: + int y; // #sl-UF2 +}; +static_assert(__is_standard_layout(U)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::U)'}} \ +// expected-note@-1 {{'U' is not standard-layout}} \ +// expected-note@-1 {{because it has mixed access specifiers}} +// expected-note@#sl-UF1 {{'x' defined here}} +// expected-note@#sl-UF2 {{field 'y' has a different access specifier than field 'x'}} +// expected-note@#sl-U {{'U' defined here}} + +// Single base class is OK +struct BaseClass{ int a; }; // #sl-BaseClass +struct DerivedOK : BaseClass {}; // #sl-DerivedOK +static_assert(__is_standard_layout(DerivedOK)); + +// Primitive types should be standard layout +static_assert(__is_standard_layout(int)); // #sl-Int 
+static_assert(__is_standard_layout(float)); // #sl-Float + +// Multi-level inheritance: Non-standard layout +struct Base1 { int a; }; // #sl-Base1 +struct Base2 { int b; }; // #sl-Base2 +struct DerivedClass : Base1, Base2 {}; // #sl-DerivedClass +static_assert(__is_standard_layout(DerivedClass)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::DerivedClass)'}} \ +// expected-note@-1 {{'DerivedClass' is not standard-layout}} \ +// expected-note@-1 {{because it has multiple base classes with data members}} \ +// expected-note@#sl-DerivedClass {{'DerivedClass' defined here}} + +// Inheritance hierarchy with multiple classes having data members +struct BaseA { int a; }; // #sl-BaseA +struct BaseB : BaseA {}; // inherits BaseA, has no new members +struct BaseC: BaseB { int c; }; // #sl-BaseC +static_assert(__is_standard_layout(BaseC)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::BaseC)'}} \ +// expected-note@-1 {{'BaseC' is not standard-layout}} \ +// expected-note@-1 {{because it has an indirect base 'BaseA' with data members}} \ +// expected-note@#sl-BaseC {{'BaseC' defined here}} \ +// Multiple direct base classes with no data members --> standard layout +struct BaseX {}; // #sl-BaseX +struct BaseY {}; // #sl-BaseY +struct MultiBase : BaseX, BaseY {}; // #sl-MultiBase +static_assert(__is_standard_layout(MultiBase)); + +struct A { + int x; +}; + +struct B : A { +}; +// Indirect base with data members +struct C : B { int y; }; // #sl-C +static_assert(__is_standard_layout(C)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::C)'}} \ +// expected-note@-1 {{'C' is not standard-layout}} \ +// expected-note@-1 {{because it has an indirect base 'A' with data members}} \ +// expected-note@#sl-C {{'C' defined here}} + +struct D { + union { int a; float b; }; + }; // #sl-D 
+static_assert(__is_standard_layout(D)); // no diagnostics + +// E inherits D but adds a new member +struct E : D { int x; }; // #sl-E +static_assert(__is_standard_layout(E)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::E)'}} \ +// expected-note@-1 {{'E' is not standard-layout}} \ +// expected-note@-1 {{because it has an indirect base 'D' with data members}} \ +// expected-note@#sl-E {{'E' defined here}} + +// F inherits D but only an unnamed bitfield +// This should still fail because F ends up with a +// base class with a data member and its own unnamed bitfield +// which is not allowed in standard layout +struct F : D { int : 0; }; // #sl-F +static_assert(__is_standard_layout(F)); +// expected-error@-1 {{static assertion failed due to requirement '__is_standard_layout(standard_layout_tests::F)'}} \ +// expected-note@-1 {{'F' is not standard-layout}} \ +// expected-note@#sl-F {{'F' defined here}} + +struct Empty {}; +struct G { Empty a, b; }; // #sl-G +static_assert(__is_standard_layout(G)); // no diagnostics + +struct H { Empty a; int x; }; // #sl-H +static_assert(__is_standard_layout(H)); // no diagnostics + + struct I { Empty a; int : 0; int x; }; // #sl-I +static_assert(__is_standard_layout(I)); // no diagnostics +} + diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index 4a887cd0c1e2e..cb3245756a394 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -1988,6 +1988,51 @@ static enum CXChildVisitResult PrintDeclAttributes(CXCursor cursor, CXCursor p, return CXChildVisit_Continue; } +/******************************************************************************/ +/* Inline assembly cursor testing */ +/******************************************************************************/ + +static enum CXChildVisitResult +PrintGCCInlineAssembly(CXCursor cursor, CXCursor p, CXClientData d) { + CXString 
Constraint, Template, Clobber; + CXCursor Expr; + unsigned hasGoto, i, e; + if (clang_getCursorKind(cursor) != CXCursor_AsmStmt) + return CXChildVisit_Recurse; + + hasGoto = clang_Cursor_isGCCAssemblyHasGoto(cursor); + printf("===ASM TEMPLATE%s===\n", hasGoto ? " (WITH GOTO)" : ""); + Template = clang_Cursor_getGCCAssemblyTemplate(cursor); + printf("%s", clang_getCString(Template)); + clang_disposeString(Template); + printf("\n===ASM TEMPLATE END===\n"); + + printf("volatile: %s\n", + clang_Cursor_isGCCAssemblyVolatile(cursor) ? "true" : "false"); + + for (i = 0, e = clang_Cursor_getGCCAssemblyNumOutputs(cursor); i < e; ++i) { + clang_Cursor_getGCCAssemblyOutput(cursor, i, &Constraint, &Expr); + printf("Output #%d Constraint (%s): ", i, clang_getCString(Constraint)); + PrintCursor(Expr, NULL); + printf("\n"); + clang_disposeString(Constraint); + } + for (i = 0, e = clang_Cursor_getGCCAssemblyNumInputs(cursor); i < e; ++i) { + clang_Cursor_getGCCAssemblyInput(cursor, i, &Constraint, &Expr); + printf("Input #%d Constraint (%s): ", i, clang_getCString(Constraint)); + PrintCursor(Expr, NULL); + printf("\n"); + clang_disposeString(Constraint); + } + for (i = 0, e = clang_Cursor_getGCCAssemblyNumClobbers(cursor); i < e; ++i) { + Clobber = clang_Cursor_getGCCAssemblyClobber(cursor, i); + printf("Clobber #%d: %s\n", i, clang_getCString(Clobber)); + clang_disposeString(Clobber); + } + printf("===ASM END===\n"); + return CXChildVisit_Recurse; +} + /******************************************************************************/ /* Target information testing. 
*/ /******************************************************************************/ @@ -5010,6 +5055,7 @@ static void print_usage(void) { " c-index-test -test-annotate-tokens= {}*\n" " c-index-test -test-inclusion-stack-source {}*\n" " c-index-test -test-inclusion-stack-tu \n"); + fprintf(stderr, " c-index-test -test-inline-assembly \n"); fprintf(stderr, " c-index-test -test-print-linkage-source {}*\n" " c-index-test -test-print-visibility {}*\n" @@ -5167,6 +5213,10 @@ int cindextest_main(int argc, const char **argv) { else if (argc > 2 && strstr(argv[1], "-single-symbol-sgf-for=") == argv[1]) return perform_test_single_symbol_sgf(argv[1], argc - 2, argv + 2); + if (argc > 2 && strstr(argv[1], "-test-inline-assembly") == argv[1]) + return perform_test_load_source(argc - 2, argv + 2, "all", + PrintGCCInlineAssembly, NULL); + print_usage(); return 1; } diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 3068621d9c004..e239ffae547aa 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -8648,6 +8648,100 @@ void clang_annotateTokens(CXTranslationUnit TU, CXToken *Tokens, } } +//===----------------------------------------------------------------------===// +// Operations for querying information of a GCC inline assembly block under a +// cursor. 
+//===----------------------------------------------------------------------===// +CXString clang_Cursor_getGCCAssemblyTemplate(CXCursor Cursor) { + if (!clang_isStatement(Cursor.kind)) + return cxstring::createEmpty(); + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor))) { + ASTContext const &C = getCursorContext(Cursor); + std::string AsmTemplate = S->generateAsmString(C); + return cxstring::createDup(AsmTemplate); + } + return cxstring::createEmpty(); +} + +unsigned clang_Cursor_isGCCAssemblyHasGoto(CXCursor Cursor) { + if (!clang_isStatement(Cursor.kind)) + return 0; + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor))) + return S->isAsmGoto(); + return 0; +} + +unsigned clang_Cursor_getGCCAssemblyNumOutputs(CXCursor Cursor) { + if (!clang_isStatement(Cursor.kind)) + return 0; + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor))) + return S->getNumOutputs(); + return 0; +} + +unsigned clang_Cursor_getGCCAssemblyNumInputs(CXCursor Cursor) { + if (!clang_isStatement(Cursor.kind)) + return 0; + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor))) + return S->getNumInputs(); + return 0; +} + +unsigned clang_Cursor_getGCCAssemblyInput(CXCursor Cursor, unsigned Index, + CXString *Constraint, + CXCursor *ExprCursor) { + if (!clang_isStatement(Cursor.kind) || !Constraint || !ExprCursor) + return 0; + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor)); + S && Index < S->getNumInputs()) { + *Constraint = cxstring::createDup(S->getInputConstraint(Index)); + *ExprCursor = MakeCXCursor(S->getInputExpr(Index), getCursorDecl(Cursor), + cxcursor::getCursorTU(Cursor)); + return 1; + } + return 0; +} + +unsigned clang_Cursor_getGCCAssemblyOutput(CXCursor Cursor, unsigned Index, + CXString *Constraint, + CXCursor *ExprCursor) { + if (!clang_isStatement(Cursor.kind) || !Constraint || !ExprCursor) + return 0; + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor)); + S && Index < S->getNumOutputs()) { + *Constraint = 
cxstring::createDup(S->getOutputConstraint(Index)); + *ExprCursor = MakeCXCursor(S->getOutputExpr(Index), getCursorDecl(Cursor), + cxcursor::getCursorTU(Cursor)); + return 1; + } + return 0; +} + +unsigned clang_Cursor_getGCCAssemblyNumClobbers(CXCursor Cursor) { + if (!clang_isStatement(Cursor.kind)) + return 0; + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor))) + return S->getNumClobbers(); + return 0; +} + +CXString clang_Cursor_getGCCAssemblyClobber(CXCursor Cursor, unsigned Index) { + if (!clang_isStatement(Cursor.kind)) + return cxstring::createEmpty(); + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor)); + S && Index < S->getNumClobbers()) + return cxstring::createDup(S->getClobber(Index)); + return cxstring::createEmpty(); +} + +unsigned clang_Cursor_isGCCAssemblyVolatile(CXCursor Cursor) { + if (!clang_isStatement(Cursor.kind)) + return 0; + if (auto const *S = dyn_cast_or_null(getCursorStmt(Cursor))) + return S->isVolatile(); + return 0; +} + //===----------------------------------------------------------------------===// // Operations for querying linkage of a cursor. //===----------------------------------------------------------------------===// diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index f08d13c3da9e1..d140a71e771a0 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -441,6 +441,15 @@ LLVM_20 { LLVM_21 { global: clang_getFullyQualifiedName; + clang_Cursor_getGCCAssemblyTemplate; + clang_Cursor_isGCCAssemblyHasGoto; + clang_Cursor_getGCCAssemblyNumOutputs; + clang_Cursor_getGCCAssemblyNumInputs; + clang_Cursor_getGCCAssemblyInput; + clang_Cursor_getGCCAssemblyOutput; + clang_Cursor_getGCCAssemblyNumClobbers; + clang_Cursor_getGCCAssemblyClobber; + clang_Cursor_isGCCAssemblyVolatile; }; # Example of how to add a new symbol version entry. 
If you do add a new symbol diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index c0633ba3c29b3..a05bf8305716b 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -25768,6 +25768,21 @@ TEST_F(FormatTest, OperatorPassedAsAFunctionPtr) { verifyFormat("foo(operator, , -42);", Style); } +TEST_F(FormatTest, LineSpliceWithTrailingWhitespace) { + auto Style = getLLVMStyle(); + Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign; + Style.UseTab = FormatStyle::UT_Never; + + verifyFormat("int i;", " \\ \n" + " int i;"); + verifyFormat("#define FOO(args) \\\n" + " struct a {};", + "#define FOO( args ) \\ \n" + "struct a{\\\t\t\t\n" + " };", + Style); +} + TEST_F(FormatTest, WhitespaceSensitiveMacros) { FormatStyle Style = getLLVMStyle(); Style.WhitespaceSensitiveMacros.push_back("FOO"); diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 3eae39f267c3e..f42f2e307f713 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1122,14 +1122,17 @@ TEST_F(QualifierFixerTest, IsQualifierType) { } TEST_F(QualifierFixerTest, IsMacro) { - auto Tokens = annotate("INT INTPR Foo int"); ASSERT_EQ(Tokens.size(), 5u) << Tokens; - EXPECT_TRUE(isPossibleMacro(Tokens[0])); EXPECT_TRUE(isPossibleMacro(Tokens[1])); EXPECT_FALSE(isPossibleMacro(Tokens[2])); EXPECT_FALSE(isPossibleMacro(Tokens[3])); + + Tokens = annotate("FOO::BAR"); + ASSERT_EQ(Tokens.size(), 4u) << Tokens; + EXPECT_FALSE(isPossibleMacro(Tokens[0])); + EXPECT_FALSE(isPossibleMacro(Tokens[2])); } TEST_F(QualifierFixerTest, OverlappingQualifier) { diff --git a/clang/unittests/Serialization/SourceLocationEncodingTest.cpp b/clang/unittests/Serialization/SourceLocationEncodingTest.cpp index c80a8fd0e52b1..18fedd4de3973 100644 --- a/clang/unittests/Serialization/SourceLocationEncodingTest.cpp +++ 
b/clang/unittests/Serialization/SourceLocationEncodingTest.cpp @@ -16,7 +16,6 @@ using namespace llvm; using namespace clang; namespace { -using LocSeq = SourceLocationSequence; // Convert a single source location into encoded form and back. // If ExpectedEncoded is provided, verify the encoded value too. @@ -34,37 +33,9 @@ void roundTrip(SourceLocation::UIntTy Loc, ASSERT_EQ(DecodedEncoded, Loc) << "Decoding " << ActualEncoded; } -// As above, but use sequence encoding for a series of locations. -void roundTrip(std::vector Locs, - std::vector ExpectedEncoded = {}) { - std::vector ActualEncoded; - { - LocSeq::State Seq; - for (auto L : Locs) - ActualEncoded.push_back(SourceLocationEncoding::encode( - SourceLocation::getFromRawEncoding(L), /*BaseOffset=*/0, - /*BaseModuleFileIndex=*/0, Seq)); - if (!ExpectedEncoded.empty()) { - ASSERT_EQ(ActualEncoded, ExpectedEncoded) - << "Encoding " << testing::PrintToString(Locs); - } - } - std::vector DecodedEncoded; - { - LocSeq::State Seq; - for (auto L : ActualEncoded) { - SourceLocation Loc = SourceLocationEncoding::decode(L, Seq).first; - DecodedEncoded.push_back(Loc.getRawEncoding()); - } - ASSERT_EQ(DecodedEncoded, Locs) - << "Decoding " << testing::PrintToString(ActualEncoded); - } -} - constexpr SourceLocation::UIntTy MacroBit = 1 << (sizeof(SourceLocation::UIntTy) * CHAR_BIT - 1); constexpr SourceLocation::UIntTy Big = MacroBit >> 1; -constexpr SourceLocation::UIntTy Biggest = -1; TEST(SourceLocationEncoding, Individual) { roundTrip(1, 2); @@ -77,33 +48,4 @@ TEST(SourceLocationEncoding, Individual) { roundTrip(MacroBit | (Big + 1)); } -TEST(SourceLocationEncoding, Sequence) { - roundTrip({1, 2, 3, 3, 2, 1}, - {2, // 1 - 5, // +2 (+1 of non-raw) - 5, // +2 - 1, // +0 - 4, // -2 - 4} // -2 - ); - roundTrip({100, 0, 100}, - {200, // 100 - 0, // 0 - 1} // +0 - ); - - roundTrip({1, Big}, {2, ((Big - 1) << 2) + 1}); - roundTrip({2, MacroBit | Big}, {4, ((Big - 1) << 2) - 1}); - - roundTrip({3, MacroBit | 5, MacroBit | 4, 
3}, - {6, // 3 - 11, // +5 (+2 of non-raw + set macro bit) - 4, // -2 - 6} // -3 (-2 of non-raw, clear macro bit) - ); - - roundTrip( - {123 | MacroBit, 1, 9, Biggest, Big, Big + 1, 0, MacroBit | Big, 0}); -} - } // namespace diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index 86e19e08270d7..45a2f5c0a61fc 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -582,6 +582,24 @@ macro(add_compiler_rt_script name) DESTINATION ${COMPILER_RT_INSTALL_BINARY_DIR}) endmacro(add_compiler_rt_script src name) + +macro(add_compiler_rt_cfg target_name file_name component arch) + set(src_file "${CMAKE_CURRENT_SOURCE_DIR}/${file_name}") + get_compiler_rt_output_dir(${arch} output_dir) + set(dst_file "${output_dir}/${file_name}") + add_custom_command(OUTPUT ${dst_file} + DEPENDS ${src_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_file} ${dst_file} + COMMENT "Copying ${file_name}...") + add_custom_target(${target_name} DEPENDS ${dst_file}) + install(FILES ${file_name} + DESTINATION ${COMPILER_RT_INSTALL_LIBRARY_DIR} + COMPONENT ${component}) + add_dependencies(${component} ${target_name}) + + set_target_properties(${target_name} PROPERTIES FOLDER "Compiler-RT Misc") +endmacro() + # Builds custom version of libc++ and installs it in . # Can be used to build sanitized versions of libc++ for running unit tests. # add_custom_libcxx( diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt index e2f39f224df9c..97cc5c85703e1 100644 --- a/compiler-rt/lib/asan/CMakeLists.txt +++ b/compiler-rt/lib/asan/CMakeLists.txt @@ -281,6 +281,8 @@ else() PARENT_TARGET asan) endif() + # On AIX, we only need the static libraries. 
+ if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") foreach(arch ${ASAN_SUPPORTED_ARCH}) if (COMPILER_RT_HAS_VERSION_SCRIPT) if(WIN32) @@ -382,10 +384,21 @@ else() endif() endif() endforeach() + endif() endif() add_compiler_rt_resource_file(asan_ignorelist asan_ignorelist.txt asan) +# On AIX, static sanitizer libraries are not added to the DSO, so we need to put +# asan.link_with_main_exec.txt and asan_cxx.link_with_main_exec.txt to the build +# and install dir for use in resolving undefined sanitizer symbols at runtime. +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + foreach(arch ${ASAN_SUPPORTED_ARCH}) + add_compiler_rt_cfg(asan_symbols_${arch} asan.link_with_main_exec.txt asan ${arch}) + add_compiler_rt_cfg(asan_cxx_symbols_${arch} asan_cxx.link_with_main_exec.txt asan ${arch}) + endforeach() +endif() + add_subdirectory(scripts) if(COMPILER_RT_INCLUDE_TESTS) diff --git a/compiler-rt/lib/asan/asan.link_with_main_exec.txt b/compiler-rt/lib/asan/asan.link_with_main_exec.txt new file mode 100644 index 0000000000000..5efc48c262369 --- /dev/null +++ b/compiler-rt/lib/asan/asan.link_with_main_exec.txt @@ -0,0 +1,115 @@ +#! . 
+__asan_report_load_n +__asan_loadN +__asan_report_load1 +__asan_load1 +__asan_report_load2 +__asan_load2 +__asan_report_load4 +__asan_load4 +__asan_report_load8 +__asan_load8 +__asan_report_load16 +__asan_load16 +__asan_report_store_n +__asan_storeN +__asan_report_store1 +__asan_store1 +__asan_report_store2 +__asan_store2 +__asan_report_store4 +__asan_store4 +__asan_report_store8 +__asan_store8 +__asan_report_store16 +__asan_store16 +__asan_report_exp_load_n +__asan_exp_loadN +__asan_report_exp_load1 +__asan_exp_load1 +__asan_report_exp_load2 +__asan_exp_load2 +__asan_report_exp_load4 +__asan_exp_load4 +__asan_report_exp_load8 +__asan_exp_load8 +__asan_report_exp_load16 +__asan_exp_load16 +__asan_report_exp_store_n +__asan_exp_storeN +__asan_report_exp_store1 +__asan_exp_store1 +__asan_report_exp_store2 +__asan_exp_store2 +__asan_report_exp_store4 +__asan_exp_store4 +__asan_report_exp_store8 +__asan_exp_store8 +__asan_report_exp_store16 +__asan_exp_store16 +__asan_memmove +__asan_memcpy +__asan_memset +__asan_handle_no_return +__sanitizer_ptr_cmp +__sanitizer_ptr_sub +__asan_before_dynamic_init +__asan_after_dynamic_init +__asan_register_globals +__asan_unregister_globals +__asan_register_image_globals +__asan_unregister_image_globals +__asan_register_elf_globals +__asan_unregister_elf_globals +__asan_init +__asan_version_mismatch_check_v8 +__asan_stack_malloc_0 +__asan_stack_malloc_1 +__asan_stack_malloc_2 +__asan_stack_malloc_3 +__asan_stack_malloc_4 +__asan_stack_malloc_5 +__asan_stack_malloc_6 +__asan_stack_malloc_7 +__asan_stack_malloc_8 +__asan_stack_malloc_9 +__asan_stack_malloc_10 +__asan_stack_malloc_always_0 +__asan_stack_malloc_always_1 +__asan_stack_malloc_always_2 +__asan_stack_malloc_always_3 +__asan_stack_malloc_always_4 +__asan_stack_malloc_always_5 +__asan_stack_malloc_always_6 +__asan_stack_malloc_always_7 +__asan_stack_malloc_always_8 +__asan_stack_malloc_always_9 +__asan_stack_malloc_always_10 +__asan_stack_free_0 +__asan_stack_free_1 
+__asan_stack_free_2 +__asan_stack_free_3 +__asan_stack_free_4 +__asan_stack_free_5 +__asan_stack_free_6 +__asan_stack_free_7 +__asan_stack_free_8 +__asan_stack_free_9 +__asan_stack_free_10 +__asan_set_shadow_00 +__asan_set_shadow_01 +__asan_set_shadow_02 +__asan_set_shadow_03 +__asan_set_shadow_04 +__asan_set_shadow_05 +__asan_set_shadow_06 +__asan_set_shadow_07 +__asan_set_shadow_f1 +__asan_set_shadow_f2 +__asan_set_shadow_f3 +__asan_set_shadow_f5 +__asan_set_shadow_f8 +__asan_poison_stack_memory +__asan_unpoison_stack_memory +__asan_option_detect_stack_use_after_return +__asan_shadow_memory_dynamic_address diff --git a/compiler-rt/lib/asan/asan_cxx.link_with_main_exec.txt b/compiler-rt/lib/asan/asan_cxx.link_with_main_exec.txt new file mode 100644 index 0000000000000..7387f8173e859 --- /dev/null +++ b/compiler-rt/lib/asan/asan_cxx.link_with_main_exec.txt @@ -0,0 +1,21 @@ +#! . +_ZdaPv +_ZdaPvRKSt9nothrow_t +_ZdaPvSt11align_val_t +_ZdaPvSt11align_val_tRKSt9nothrow_t +_ZdaPvm +_ZdaPvmSt11align_val_t +_ZdlPv +_ZdlPvRKSt9nothrow_t +_ZdlPvSt11align_val_t +_ZdlPvSt11align_val_tRKSt9nothrow_t +_ZdlPvm +_ZdlPvmSt11align_val_t +_Znam +_ZnamRKSt9nothrow_t +_ZnamSt11align_val_t +_ZnamSt11align_val_tRKSt9nothrow_t +_Znwm +_ZnwmRKSt9nothrow_t +_ZnwmSt11align_val_t +_ZnwmSt11align_val_tRKSt9nothrow_t diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index c9f3e4d682d95..0c30959b23e28 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -211,10 +211,10 @@ bool GetStackAddressInformation(uptr addr, uptr access_size, descr->frame_pc = access.frame_pc; descr->frame_descr = access.frame_descr; -#if SANITIZER_PPC64V1 - // On PowerPC64 ELFv1, the address of a function actually points to a - // three-doubleword data structure with the first field containing - // the address of the function's code. 
+#if SANITIZER_PPC64V1 || SANITIZER_AIX + // On PowerPC64 ELFv1 or AIX, the address of a function actually points to a + // three-doubleword (or three-word for 32-bit AIX) data structure with + // the first field containing the address of the function's code. descr->frame_pc = *reinterpret_cast(descr->frame_pc); #endif descr->frame_pc += 16; @@ -444,6 +444,16 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, data.kind = kAddressKindShadow; return; } + + // Check global first. On AIX, some global data defined in shared libraries + // are put to the STACK region for unknown reasons. Check global first can + // workaround this issue. + // TODO: Look into whether there's a different solution to this problem. + if (GetGlobalAddressInformation(addr, access_size, &data.global)) { + data.kind = kAddressKindGlobal; + return; + } + if (GetHeapAddressInformation(addr, access_size, &data.heap)) { data.kind = kAddressKindHeap; return; @@ -461,10 +471,6 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, return; } - if (GetGlobalAddressInformation(addr, access_size, &data.global)) { - data.kind = kAddressKindGlobal; - return; - } data.kind = kAddressKindWild; data.wild.addr = addr; data.wild.access_size = access_size; diff --git a/compiler-rt/test/tsan/java_heap_init2.cpp b/compiler-rt/test/tsan/java_heap_init2.cpp new file mode 100644 index 0000000000000..2e5724d930e8f --- /dev/null +++ b/compiler-rt/test/tsan/java_heap_init2.cpp @@ -0,0 +1,34 @@ +// RUN: %clangxx_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s +// XFAIL: * + +#include "java.h" +#include +#include + +int main() { + // Test a non-regular kHeapSize + // Previously __tsan_java_init failed because it encountered non-zero meta + // shadow for the destination. 
+ size_t const kPageSize = sysconf(_SC_PAGESIZE); + int const kSize = kPageSize - 1; + jptr jheap2 = (jptr)mmap(0, kSize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (jheap2 == (jptr)MAP_FAILED) + return printf("mmap failed with %d\n", errno); + __atomic_store_n((int *)(jheap2 + kSize - 3), 1, __ATOMIC_RELEASE); + // Due to the previous incorrect meta-end calculation, the following munmap + // did not clear the tail meta shadow. + munmap((void *)jheap2, kSize); + int const kHeapSize2 = kSize + 1; + jheap2 = (jptr)mmap((void *)jheap2, kHeapSize2, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (jheap2 == (jptr)MAP_FAILED) + return printf("second mmap failed with %d\n", errno); + __tsan_java_init(jheap2, kHeapSize2); + __tsan_java_move(jheap2, jheap2 + kHeapSize2 - 8, 8); + fprintf(stderr, "DONE\n"); + return __tsan_java_fini(); +} + +// CHECK-NOT: WARNING: ThreadSanitizer: data race +// CHECK: DONE diff --git a/compiler-rt/test/tsan/munmap_clear_shadow.c b/compiler-rt/test/tsan/munmap_clear_shadow.c new file mode 100644 index 0000000000000..8a435a84258f5 --- /dev/null +++ b/compiler-rt/test/tsan/munmap_clear_shadow.c @@ -0,0 +1,59 @@ +// RUN: %clang_tsan %s -o %t && %run %t | FileCheck %s +// XFAIL: * + +#include "test.h" +#include +#include +#include +#include +#include +#include + +void __tsan_read1(void *addr); + +struct thread_params { + char *buf; + unsigned int size; +}; + +static void *thread_func(void *arg) { + struct thread_params *p = (struct thread_params *)arg; + // Access 1 + p->buf[0] = 0x42; + p->buf[p->size - 1] = 0x42; + barrier_wait(&barrier); + return 0; +} + +int main() { + const unsigned int kPageSize = sysconf(_SC_PAGESIZE); + // The relevant shadow memory size should be exactly multiple of kPageSize, + // even if Size = kPageSize - 1. 
+ const unsigned int Size = kPageSize - 1; + + barrier_init(&barrier, 2); + char *buf = (char *)mmap(NULL, Size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + assert(buf != MAP_FAILED); + assert(((uintptr_t)buf % kPageSize) == 0); + + pthread_t t; + struct thread_params p = {buf, Size}; + pthread_create(&t, 0, thread_func, &p); + + barrier_wait(&barrier); + // Should clear all the shadow memory related to the mmaped memory. + munmap(buf, Size); + + // If the shadow memory is cleared completely, the following reads should not + // cause races and behave the same. However, previously, __tsan_read1(&buf[0]) + // would not report a race, while __tsan_read1(&buf[Size - 1]) did. + // CHECK-NOT: WARNING: ThreadSanitizer: data race + __tsan_read1(&buf[0]); // Access 2 + __tsan_read1(&buf[Size - 1]); // Access 2 + pthread_join(t, 0); + + puts("DONE"); + + return 0; +} diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md index 36be369595ffd..35da8323e0a10 100644 --- a/flang/docs/ReleaseNotes.md +++ b/flang/docs/ReleaseNotes.md @@ -54,6 +54,11 @@ page](https://llvm.org/releases/). now be emitted into Clang's per-target resource directory (next to libclang_rt.*.*) where it is also found by Flang's driver. + * Flang on AArch64 now always depends on compiler-rt to provide the + `__trampoline_setup` function. This dependency will be automatically added + in in-tree builds with the AArch64 target, but compiler-rt will need to be + manually added to LLVM builds when building flang out-of-tree. 
+ ## New Issues Found diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index 6fad77dffd9bc..0ead54df3ca97 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -330,7 +330,8 @@ def fir_RecordType : FIR_Type<"Record", "type"> { let extraClassDeclaration = [{ using TypePair = std::pair; - using TypeList = std::vector; + using TypeList = llvm::ArrayRef; + using TypeVector = llvm::SmallVector; TypeList getTypeList() const; TypeList getLenParamList() const; diff --git a/flang/include/flang/Support/OpenMP-features.h b/flang/include/flang/Support/OpenMP-features.h index 349cd19c1224f..5e722930ae1b2 100644 --- a/flang/include/flang/Support/OpenMP-features.h +++ b/flang/include/flang/Support/OpenMP-features.h @@ -15,16 +15,8 @@ namespace Fortran::common { template void setOpenMPMacro(int version, FortranPredefinitions &predefinitions) { switch (version) { - case 20: - predefinitions.emplace_back("_OPENMP", "200011"); - break; - case 25: - predefinitions.emplace_back("_OPENMP", "200505"); - break; - case 30: - predefinitions.emplace_back("_OPENMP", "200805"); - break; case 31: + default: predefinitions.emplace_back("_OPENMP", "201107"); break; case 40: @@ -45,10 +37,6 @@ void setOpenMPMacro(int version, FortranPredefinitions &predefinitions) { case 60: predefinitions.emplace_back("_OPENMP", "202411"); break; - case 11: - default: - predefinitions.emplace_back("_OPENMP", "199911"); - break; } } } // namespace Fortran::common diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 2603a3f6dc643..07d6814da8671 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -26,6 +26,7 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/OptionUtils.h" #include "clang/Driver/Options.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" 
#include "llvm/ADT/StringSwitch.h" #include "llvm/Frontend/Debug/Options.h" @@ -44,6 +45,7 @@ #include #include #include +#include using namespace Fortran::frontend; @@ -1140,11 +1142,43 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.getLangOpts().OpenMPVersion = 31; res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenMP); - if (int Version = getLastArgIntValue( - args, clang::driver::options::OPT_fopenmp_version_EQ, - res.getLangOpts().OpenMPVersion, diags)) { - res.getLangOpts().OpenMPVersion = Version; + if (auto *arg = + args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) { + llvm::ArrayRef ompVersions = llvm::omp::getOpenMPVersions(); + unsigned oldVersions[] = {11, 20, 25, 30}; + unsigned version = 0; + + auto reportBadVersion = [&](llvm::StringRef value) { + const unsigned diagID = + diags.getCustomDiagID(clang::DiagnosticsEngine::Error, + "'%0' is not a valid OpenMP version in '%1', " + "valid versions are %2"); + std::string buffer; + llvm::raw_string_ostream versions(buffer); + llvm::interleaveComma(ompVersions, versions); + + diags.Report(diagID) << value << arg->getAsString(args) << versions.str(); + }; + + llvm::StringRef value = arg->getValue(); + if (!value.getAsInteger(/*radix=*/10, version)) { + if (llvm::is_contained(ompVersions, version)) { + res.getLangOpts().OpenMPVersion = version; + } else if (llvm::is_contained(oldVersions, version)) { + const unsigned diagID = + diags.getCustomDiagID(clang::DiagnosticsEngine::Warning, + "OpenMP version %0 is no longer supported, " + "assuming version %1"); + std::string assumed = std::to_string(res.getLangOpts().OpenMPVersion); + diags.Report(diagID) << value << assumed; + } else { + reportBadVersion(value); + } + } else { + reportBadVersion(value); + } } + if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) { res.getLangOpts().OpenMPForceUSM = 1; } diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp 
index 336a6f82319e6..8506b9a984e58 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -68,8 +68,8 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" -#include "mlir/IR/StateStack.h" #include "mlir/Parser/Parser.h" +#include "mlir/Support/StateStack.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 8575d8cf352fd..60b6366c184d4 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -39,7 +39,7 @@ #include "flang/Support/OpenMP-utils.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" -#include "mlir/IR/StateStack.h" +#include "mlir/Support/StateStack.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index 78571f1f4bc2d..2ff1d6d945ba3 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -261,6 +261,8 @@ mlir::Type dyn_cast_ptrOrBoxEleTy(mlir::Type t) { } static bool hasDynamicSize(fir::RecordType recTy) { + if (recTy.getLenParamList().empty()) + return false; for (auto field : recTy.getTypeList()) { if (auto arr = mlir::dyn_cast(field.second)) { if (sequenceWithNonConstantShape(arr)) @@ -1006,7 +1008,7 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { return {}; RecordType result = RecordType::get(parser.getContext(), name); - RecordType::TypeList lenParamList; + RecordType::TypeVector lenParamList; if (!parser.parseOptionalLParen()) { while (true) { llvm::StringRef lenparam; @@ -1024,7 +1026,7 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { return {}; } - RecordType::TypeList typeList; + RecordType::TypeVector 
typeList; if (!parser.parseOptionalLess()) { result.pack(true); } diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index f7efaa736a279..33f687db08f9a 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -326,11 +326,14 @@ class DeclareOpConversion : public mlir::OpRewritePattern { auto genHlfirBox = [&]() -> mlir::Value { if (auto baseBoxType = mlir::dyn_cast(firBase.getType())) { - // Rebox so that lower bounds are correct. + // Rebox so that lower bounds and attributes are correct. if (baseBoxType.isAssumedRank()) return builder.create( loc, hlfirBaseType, firBase, fir::LowerBoundModifierAttribute::SetToOnes); + if (!fir::extractSequenceType(baseBoxType.getEleTy()) && + baseBoxType == hlfirBaseType) + return firBase; return builder.create(loc, hlfirBaseType, firBase, declareOp.getShape(), /*slice=*/mlir::Value{}); diff --git a/flang/test/Driver/bbc-openmp-version-macro.f90 b/flang/test/Driver/bbc-openmp-version-macro.f90 index 6fa19e1672ad8..193c9d297de4f 100644 --- a/flang/test/Driver/bbc-openmp-version-macro.f90 +++ b/flang/test/Driver/bbc-openmp-version-macro.f90 @@ -1,29 +1,22 @@ ! Test predefined _OPENMP macro which denotes OpenMP version ! RUN: bbc -fopenmp -o - %s | FileCheck %s --check-prefix=DEFAULT-OPENMP-VERSION -! RUN: bbc -fopenmp -fopenmp-version=11 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-11 -! RUN: bbc -fopenmp -fopenmp-version=11 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-11 -! RUN: bbc -fopenmp -fopenmp-version=20 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-20 -! RUN: bbc -fopenmp -fopenmp-version=25 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-25 -! RUN: bbc -fopenmp -fopenmp-version=30 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-30 ! RUN: bbc -fopenmp -fopenmp-version=31 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-31 ! 
RUN: bbc -fopenmp -fopenmp-version=40 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-40 ! RUN: bbc -fopenmp -fopenmp-version=45 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-45 ! RUN: bbc -fopenmp -fopenmp-version=50 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-50 ! RUN: bbc -fopenmp -fopenmp-version=51 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-51 ! RUN: bbc -fopenmp -fopenmp-version=52 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-52 +! RUN: bbc -fopenmp -fopenmp-version=60 -o - %s | FileCheck %s --check-prefix=OPENMP-VERSION-60 ! DEFAULT-OPENMP-VERSION: {{.*}} = arith.constant 201107 : i32 -! OPENMP-VERSION-11: {{.*}} = arith.constant 199911 : i32 -! OPENMP-VERSION-20: {{.*}} = arith.constant 200011 : i32 -! OPENMP-VERSION-25: {{.*}} = arith.constant 200505 : i32 -! OPENMP-VERSION-30: {{.*}} = arith.constant 200805 : i32 ! OPENMP-VERSION-31: {{.*}} = arith.constant 201107 : i32 ! OPENMP-VERSION-40: {{.*}} = arith.constant 201307 : i32 ! OPENMP-VERSION-45: {{.*}} = arith.constant 201511 : i32 ! OPENMP-VERSION-50: {{.*}} = arith.constant 201811 : i32 ! OPENMP-VERSION-51: {{.*}} = arith.constant 202011 : i32 ! OPENMP-VERSION-52: {{.*}} = arith.constant 202111 : i32 +! OPENMP-VERSION-60: {{.*}} = arith.constant 202411 : i32 #if _OPENMP integer :: var1 = _OPENMP diff --git a/flang/test/Driver/flang-openmp-version-macro.f90 b/flang/test/Driver/flang-openmp-version-macro.f90 index f690ab3819482..fcabfefca7f18 100644 --- a/flang/test/Driver/flang-openmp-version-macro.f90 +++ b/flang/test/Driver/flang-openmp-version-macro.f90 @@ -1,10 +1,6 @@ ! Test predefined _OPENMP macro which denotes OpenMP version ! RUN: %flang_fc1 -fopenmp -cpp -E %s | FileCheck %s --check-prefix=DEFAULT-OPENMP-VERSION -! RUN: %flang_fc1 -fopenmp -fopenmp-version=11 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-11 -! RUN: %flang_fc1 -fopenmp -fopenmp-version=20 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-20 -! 
RUN: %flang_fc1 -fopenmp -fopenmp-version=25 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-25 -! RUN: %flang_fc1 -fopenmp -fopenmp-version=30 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-30 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=31 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-31 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=40 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-40 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=45 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-45 @@ -14,10 +10,6 @@ ! RUN: %flang_fc1 -fopenmp -fopenmp-version=60 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-60 ! DEFAULT-OPENMP-VERSION: integer :: var1 = 201107 -! OPENMP-VERSION-11: integer :: var1 = 199911 -! OPENMP-VERSION-20: integer :: var1 = 200011 -! OPENMP-VERSION-25: integer :: var1 = 200505 -! OPENMP-VERSION-30: integer :: var1 = 200805 ! OPENMP-VERSION-31: integer :: var1 = 201107 ! OPENMP-VERSION-40: integer :: var1 = 201307 ! OPENMP-VERSION-45: integer :: var1 = 201511 diff --git a/flang/test/Driver/fopenmp-version.F90 b/flang/test/Driver/fopenmp-version.F90 new file mode 100644 index 0000000000000..c2866561461b7 --- /dev/null +++ b/flang/test/Driver/fopenmp-version.F90 @@ -0,0 +1,25 @@ +!RUN: %flang -dM -E -o - -fopenmp -fopenmp-version=31 %s | FileCheck --check-prefix=V31 %s +!RUN: %flang -dM -E -o - -fopenmp -fopenmp-version=40 %s | FileCheck --check-prefix=V40 %s +!RUN: %flang -dM -E -o - -fopenmp -fopenmp-version=45 %s | FileCheck --check-prefix=V45 %s +!RUN: %flang -dM -E -o - -fopenmp -fopenmp-version=50 %s | FileCheck --check-prefix=V50 %s +!RUN: %flang -dM -E -o - -fopenmp -fopenmp-version=51 %s | FileCheck --check-prefix=V51 %s +!RUN: %flang -dM -E -o - -fopenmp -fopenmp-version=52 %s | FileCheck --check-prefix=V52 %s +!RUN: %flang -dM -E -o - -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix=V60 %s + +!V31: #define _OPENMP 201107 +!V40: #define _OPENMP 201307 +!V45: #define _OPENMP 201511 +!V50: 
#define _OPENMP 201811 +!V51: #define _OPENMP 202011 +!V52: #define _OPENMP 202111 +!V60: #define _OPENMP 202411 + + +!RUN: %flang -c -fopenmp -fopenmp-version=25 %s 2>&1 | FileCheck --check-prefix=WARN-ASSUMED %s + +!WARN-ASSUMED: warning: OpenMP version 25 is no longer supported, assuming version 31 + + +!RUN: not %flang -c -fopenmp -fopenmp-version=29 %s 2>&1 | FileCheck --check-prefix=ERR-BAD %s + +!ERR-BAD: error: '29' is not a valid OpenMP version in '-fopenmp-version=29', valid versions are 31, 40, 45, 50, 51, 52, 60 diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 6d8a8bb606b90..0e2bfe48a807d 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -1817,8 +1817,8 @@ func.func private @custom_typeP.field_1.offset() -> i32 func.func private @custom_typeP.field_2.offset() -> i32 func.func @field_index_dynamic_size() -> () { - %1 = fir.field_index field_1, !fir.type}> - %2 = fir.field_index field_2, !fir.type}> + %1 = fir.field_index field_1, !fir.type}> + %2 = fir.field_index field_2, !fir.type}> return } diff --git a/flang/test/HLFIR/declare-codegen.fir b/flang/test/HLFIR/declare-codegen.fir index bd0d61a2559db..a4edb630c4adb 100644 --- a/flang/test/HLFIR/declare-codegen.fir +++ b/flang/test/HLFIR/declare-codegen.fir @@ -219,3 +219,21 @@ func.func @assumed_rank_declare(%arg0: !fir.box>) { // CHECK-SAME: %[[VAL_0:.*]]: !fir.box>) { // CHECK: %[[VAL_1:.*]] = fir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box>) -> !fir.box> // CHECK: %[[VAL_2:.*]] = fir.rebox_assumed_rank %[[VAL_1]] lbs ones : (!fir.box>) -> !fir.box> + +func.func @no_useless_rebox(%arg0: !fir.class>) { + %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.class>) -> (!fir.class>, !fir.class>) + fir.call @takes_class(%0#0) : (!fir.class>) -> () + return +} +// CHECK-LABEL: @no_useless_rebox +// CHECK-NOT: fir.rebox +// CHECK: return + +func.func @rebox_scalar_attrs(%arg0: !fir.class>>) { + %0:2 = hlfir.declare %arg0 
{uniq_name = "x"} : (!fir.class>>) -> (!fir.class>, !fir.class>) + fir.call @takes_class(%0#0) : (!fir.class>) -> () + return +} +// CHECK-LABEL: @rebox_scalar_attrs +// CHECK: fir.rebox %{{.*}} : (!fir.class>>) -> !fir.class> +// CHECK: return diff --git a/flang/test/Lower/OpenMP/target-data-skip-mapper-calls.f90 b/flang/test/Lower/OpenMP/target-data-skip-mapper-calls.f90 new file mode 100644 index 0000000000000..f1a150d5dfabc --- /dev/null +++ b/flang/test/Lower/OpenMP/target-data-skip-mapper-calls.f90 @@ -0,0 +1,30 @@ +!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix=NORT +!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix=LLVM + +!Make sure that there are no calls to the mapper. +!NORT-NOT: call{{.*}}__tgt_target_data_begin_mapper +!NORT-NOT: call{{.*}}__tgt_target_data_end_mapper + +!Make sure we generate the body +!LLVM: define internal void @_QFPf(ptr %[[A0:[0-9]+]], ptr %[[A1:[0-9]+]]) +!LLVM: %[[V0:[0-9]+]] = load i32, ptr %[[A0]], align 4 +!LLVM: %[[V1:[0-9]+]] = load i32, ptr %[[A1]], align 4 +!LLVM: %[[V2:[0-9]+]] = add i32 %[[V0]], %[[V1]] +!LLVM: store i32 %[[V2]], ptr %[[A0]], align 4 +!LLVM: ret void +!LLVM: } + + +program test + +call f(1, 2) + +contains + +subroutine f(x, y) + integer :: x, y + !$omp target data map(tofrom: x, y) + x = x + y + !$omp end target data +end subroutine +end diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index c8a6d6e648af9..6b3fc9485ec1a 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1251,6 +1251,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.wchar.mbrtowc libc.src.wchar.mbtowc libc.src.wchar.wcrtomb + libc.src.wchar.wctomb ) endif() diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 98cb3bdaf0ac9..397296894829d 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -175,6 +175,13 @@ functions: - type: char *__restrict - type: wchar_t - 
type: mbstate_t *__restrict + - name: wctomb + standards: + - stdc + return_type: int + arguments: + - type: char * + - type: wchar_t - name: wcscpy standards: - stdc diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 1f81de4248ff0..c54a1b751f402 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -25,6 +25,9 @@ constexpr size_t ENCODED_BITS_PER_UTF8 = 6; // Information not metadata (# of bits excluding the byte headers) constexpr uint32_t MASK_ENCODED_BITS = mask_trailing_ones(); +// Maximum value for utf-32 for a utf-8 sequence of a given length +constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff}; +constexpr int MAX_UTF8_LENGTH = 4; CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; } @@ -40,6 +43,17 @@ bool CharacterConverter::isFull() { bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; } +bool CharacterConverter::isValidState() { + if (state->total_bytes > MAX_UTF8_LENGTH) + return false; + + const char32_t max_utf32_value = + state->total_bytes == 0 ? 
0 + : MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1]; + return state->bytes_stored <= state->total_bytes && + state->partial <= max_utf32_value; +} + int CharacterConverter::push(char8_t utf8_byte) { uint8_t num_ones = static_cast(cpp::countl_one(utf8_byte)); // Checking the first byte if first push @@ -90,9 +104,7 @@ int CharacterConverter::push(char32_t utf32) { state->partial = utf32; // determine number of utf-8 bytes needed to represent this utf32 value - constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff}; - constexpr int NUM_RANGES = 4; - for (uint8_t i = 0; i < NUM_RANGES; i++) { + for (uint8_t i = 0; i < MAX_UTF8_LENGTH; i++) { if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) { state->total_bytes = i + 1; state->bytes_stored = i + 1; diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index be0e6129df236..d9a63fdc0522c 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -28,6 +28,7 @@ class CharacterConverter { void clear(); bool isFull(); bool isEmpty(); + bool isValidState(); int push(char8_t utf8_byte); int push(char32_t utf32); diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index f390785e5817b..16664100d42c7 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -48,6 +48,19 @@ add_entrypoint_object( libc.src.__support.wchar.mbstate ) +add_entrypoint_object( + wctomb + SRCS + wctomb.cpp + HDRS + wctomb.h + DEPENDS + libc.hdr.types.wchar_t + libc.src.__support.wchar.wcrtomb + libc.src.__support.wchar.mbstate + libc.src.__support.libc_errno +) + add_entrypoint_object( mbrtowc SRCS diff --git a/libc/src/wchar/wctomb.cpp b/libc/src/wchar/wctomb.cpp new file mode 100644 index 0000000000000..142302e6ae09b --- /dev/null +++ b/libc/src/wchar/wctomb.cpp @@ -0,0 +1,35 @@ +//===-- Implementation of wctomb ------------------------------------------===// +// +// Part 
of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/wctomb.h" + +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/mbstate.h" +#include "src/__support/wchar/wcrtomb.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, wctomb, (char *s, wchar_t wc)) { + static internal::mbstate internal_mbstate; + if (s == nullptr) + return 0; + + auto result = internal::wcrtomb(s, wc, &internal_mbstate); + + if (!result.has_value()) { // invalid wide character + libc_errno = EILSEQ; + return -1; + } + + return static_cast(result.value()); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/wctomb.h b/libc/src/wchar/wctomb.h new file mode 100644 index 0000000000000..02a34e5ad229f --- /dev/null +++ b/libc/src/wchar/wctomb.h @@ -0,0 +1,22 @@ +//===-- Implementation header for wctomb ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_WCTOMB_H +#define LLVM_LIBC_SRC_WCHAR_WCTOMB_H + +#include "hdr/types/mbstate_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int wctomb(char *s, wchar_t wc); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_WCTOMB_H diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp index a6a7bc4aa6f4c..1ad523e148845 100644 --- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp +++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp @@ -186,3 +186,45 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) { int err = cr.push(utf32); ASSERT_EQ(err, -1); } + +TEST(LlvmLibcCharacterConverterUTF32To8Test, InvalidState) { + LIBC_NAMESPACE::internal::mbstate s1; + LIBC_NAMESPACE::internal::CharacterConverter c1(&s1); + ASSERT_TRUE(c1.isValidState()); + + LIBC_NAMESPACE::internal::mbstate s2{0, 2, 0}; + LIBC_NAMESPACE::internal::CharacterConverter c2(&s2); + ASSERT_FALSE(c2.isValidState()); + + LIBC_NAMESPACE::internal::mbstate s3{0x7f, 1, 1}; + LIBC_NAMESPACE::internal::CharacterConverter c3(&s3); + ASSERT_TRUE(c3.isValidState()); + LIBC_NAMESPACE::internal::mbstate s4{0x80, 1, 1}; + LIBC_NAMESPACE::internal::CharacterConverter c4(&s4); + ASSERT_FALSE(c4.isValidState()); + + LIBC_NAMESPACE::internal::mbstate s5{0x7ff, 1, 2}; + LIBC_NAMESPACE::internal::CharacterConverter c5(&s5); + ASSERT_TRUE(c5.isValidState()); + LIBC_NAMESPACE::internal::mbstate s6{0x800, 1, 2}; + LIBC_NAMESPACE::internal::CharacterConverter c6(&s6); + ASSERT_FALSE(c6.isValidState()); + + LIBC_NAMESPACE::internal::mbstate s7{0xffff, 1, 3}; + LIBC_NAMESPACE::internal::CharacterConverter c7(&s7); + ASSERT_TRUE(c7.isValidState()); + LIBC_NAMESPACE::internal::mbstate s8{0x10000, 
1, 3}; + LIBC_NAMESPACE::internal::CharacterConverter c8(&s8); + ASSERT_FALSE(c8.isValidState()); + + LIBC_NAMESPACE::internal::mbstate s9{0x10ffff, 1, 4}; + LIBC_NAMESPACE::internal::CharacterConverter c9(&s9); + ASSERT_TRUE(c9.isValidState()); + LIBC_NAMESPACE::internal::mbstate s10{0x110000, 1, 2}; + LIBC_NAMESPACE::internal::CharacterConverter c10(&s10); + ASSERT_FALSE(c10.isValidState()); + + LIBC_NAMESPACE::internal::mbstate s11{0, 0, 5}; + LIBC_NAMESPACE::internal::CharacterConverter c11(&s11); + ASSERT_FALSE(c11.isValidState()); +} diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index 48688b3bdd1f3..ddf8709a6a2a2 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -74,6 +74,17 @@ add_libc_test( libc.src.__support.libc_errno ) +add_libc_test( + wctomb_test + SUITE + libc_wchar_unittests + SRCS + wctomb_test.cpp + DEPENDS + libc.src.wchar.wctomb + libc.hdr.types.wchar_t +) + add_libc_test( wmemset_test SUITE diff --git a/libc/test/src/wchar/wctomb_test.cpp b/libc/test/src/wchar/wctomb_test.cpp new file mode 100644 index 0000000000000..09fbf52806224 --- /dev/null +++ b/libc/test/src/wchar/wctomb_test.cpp @@ -0,0 +1,73 @@ +//===-- Unittests for wctomb ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/wchar_t.h" +#include "src/__support/libc_errno.h" +#include "src/wchar/wctomb.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcWCToMBTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST(LlvmLibcWCToMBTest, OneByte) { + wchar_t wc = L'U'; + char mb[4]; + int cnt = LIBC_NAMESPACE::wctomb(mb, wc); + ASSERT_EQ(cnt, 1); + ASSERT_EQ(mb[0], 'U'); +} + +TEST(LlvmLibcWCToMBTest, TwoByte) { + // testing utf32: 0xff -> utf8: 0xc3 0xbf + wchar_t wc = 0xff; + char mb[4]; + int cnt = LIBC_NAMESPACE::wctomb(mb, wc); + ASSERT_EQ(cnt, 2); + ASSERT_EQ(mb[0], static_cast(0xc3)); + ASSERT_EQ(mb[1], static_cast(0xbf)); +} + +TEST(LlvmLibcWCToMBTest, ThreeByte) { + // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95 + wchar_t wc = 0xac15; + char mb[4]; + int cnt = LIBC_NAMESPACE::wctomb(mb, wc); + ASSERT_EQ(cnt, 3); + ASSERT_EQ(mb[0], static_cast(0xea)); + ASSERT_EQ(mb[1], static_cast(0xb0)); + ASSERT_EQ(mb[2], static_cast(0x95)); +} + +TEST(LlvmLibcWCToMBTest, FourByte) { + // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1 + wchar_t wc = 0x1f921; + char mb[4]; + int cnt = LIBC_NAMESPACE::wctomb(mb, wc); + ASSERT_EQ(cnt, 4); + ASSERT_EQ(mb[0], static_cast(0xf0)); + ASSERT_EQ(mb[1], static_cast(0x9f)); + ASSERT_EQ(mb[2], static_cast(0xa4)); + ASSERT_EQ(mb[3], static_cast(0xa1)); +} + +TEST(LlvmLibcWCToMBTest, NullString) { + wchar_t wc = L'A'; + + int cnt = LIBC_NAMESPACE::wctomb(nullptr, wc); + + // no state-dependent encoding + ASSERT_EQ(cnt, 0); +} + +TEST(LlvmLibcWCToMBTest, InvalidWchar) { + wchar_t wc = 0x12ffff; + char mb[4]; + int cnt = LIBC_NAMESPACE::wctomb(mb, wc); + ASSERT_EQ(cnt, -1); + ASSERT_ERRNO_EQ(EILSEQ); +} diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h index de7b977021f8b..b712fe5cf326c 100644 --- 
a/libclc/clc/include/clc/clcmacro.h +++ b/libclc/clc/include/clc/clcmacro.h @@ -179,109 +179,4 @@ _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) -// FIXME: Make _CLC_DEFINE_BINARY_BUILTIN avoid scalarization by default, and -// introduce an explicit scalarizing version. -#define _CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(RET_TYPE, FUNCTION, BUILTIN, \ - ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ - return BUILTIN(x, y); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, \ - ARG2_TYPE##2 y) { \ - return BUILTIN(x, y); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, \ - ARG2_TYPE##3 y) { \ - return BUILTIN(x, y); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, \ - ARG2_TYPE##4 y) { \ - return BUILTIN(x, y); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, \ - ARG2_TYPE##8 y) { \ - return BUILTIN(x, y); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, \ - ARG2_TYPE##16 y) { \ - return BUILTIN(x, y); \ - } - -#define _CLC_DEFINE_BINARY_BUILTIN_WITH_SCALAR_SECOND_ARG( \ - RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \ - ARG2_TYPE) \ - _CLC_BINARY_VECTORIZE_SCALAR_SECOND_ARG(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, \ - FUNCTION, ARG1_TYPE, ARG2_TYPE) - -#define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { return BUILTIN(x); } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ - return BUILTIN(x); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ - return BUILTIN(x); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \ - return BUILTIN(x); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \ - return BUILTIN(x); \ - } \ - _CLC_DEF _CLC_OVERLOAD 
RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \ - return BUILTIN(x); \ - } - -#define _CLC_DEFINE_TERNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \ - ARG2_TYPE, ARG3_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y, \ - ARG3_TYPE z) { \ - return BUILTIN(x, y, z); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ - ARG3_TYPE##2 z) { \ - return BUILTIN(x, y, z); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \ - ARG3_TYPE##3 z) { \ - return BUILTIN(x, y, z); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \ - ARG3_TYPE##4 z) { \ - return BUILTIN(x, y, z); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \ - ARG3_TYPE##8 z) { \ - return BUILTIN(x, y, z); \ - } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION( \ - ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \ - return BUILTIN(x, y, z); \ - } - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define _CLC_DEFINE_UNARY_BUILTIN_FP16(FUNCTION) \ - _CLC_DEF _CLC_OVERLOAD half FUNCTION(half x) { \ - return (half)FUNCTION((float)x); \ - } \ - _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, FUNCTION, half) - -#define _CLC_DEFINE_BINARY_BUILTIN_FP16(FUNCTION) \ - _CLC_DEF _CLC_OVERLOAD half FUNCTION(half x, half y) { \ - return (half)FUNCTION((float)x, (float)y); \ - } \ - _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, FUNCTION, half, half) - -#pragma OPENCL EXTENSION cl_khr_fp16 : disable - -#else - -#define _CLC_DEFINE_UNARY_BUILTIN_FP16(FUNCTION) -#define _CLC_DEFINE_BINARY_BUILTIN_FP16(FUNCTION) - -#endif - #endif // __CLC_CLCMACRO_H__ diff --git a/libclc/clc/include/clc/math/clc_pown.h b/libclc/clc/include/clc/math/clc_pown.h index 67475503f92b7..30628efb19001 100644 --- a/libclc/clc/include/clc/math/clc_pown.h +++ b/libclc/clc/include/clc/math/clc_pown.h @@ -9,7 +9,7 @@ #ifndef __CLC_MATH_CLC_POWN_H__ 
#define __CLC_MATH_CLC_POWN_H__ -#define __CLC_BODY +#define __CLC_BODY #define __CLC_FUNCTION __clc_pown #include diff --git a/libclc/clc/include/clc/math/clc_rootn.h b/libclc/clc/include/clc/math/clc_rootn.h index bf9dd5413c3de..90a25ad52d867 100644 --- a/libclc/clc/include/clc/math/clc_rootn.h +++ b/libclc/clc/include/clc/math/clc_rootn.h @@ -9,7 +9,7 @@ #ifndef __CLC_MATH_CLC_ROOTN_H__ #define __CLC_MATH_CLC_ROOTN_H__ -#define __CLC_BODY +#define __CLC_BODY #define __CLC_FUNCTION __clc_rootn #include diff --git a/libclc/clc/include/clc/math/binary_decl_with_int_second_arg.inc b/libclc/clc/include/clc/shared/binary_decl_with_int_second_arg.inc similarity index 100% rename from libclc/clc/include/clc/math/binary_decl_with_int_second_arg.inc rename to libclc/clc/include/clc/shared/binary_decl_with_int_second_arg.inc diff --git a/libclc/clc/include/clc/math/binary_def_with_int_second_arg.inc b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg.inc similarity index 100% rename from libclc/clc/include/clc/math/binary_def_with_int_second_arg.inc rename to libclc/clc/include/clc/shared/binary_def_with_int_second_arg.inc diff --git a/libclc/clc/lib/generic/math/clc_copysign.cl b/libclc/clc/lib/generic/math/clc_copysign.cl index d336985ebf967..b066c14bcf3f5 100644 --- a/libclc/clc/lib/generic/math/clc_copysign.cl +++ b/libclc/clc/lib/generic/math/clc_copysign.cl @@ -6,30 +6,10 @@ // //===----------------------------------------------------------------------===// -#include #include -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(float, __clc_copysign, - __builtin_elementwise_copysign, float, - float) - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(double, __clc_copysign, - __builtin_elementwise_copysign, double, - double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(half, __clc_copysign, - __builtin_elementwise_copysign, half, 
- half) - -#endif +#define FUNCTION __clc_copysign +#define __CLC_FUNCTION(x) __builtin_elementwise_copysign +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_pow.inc b/libclc/clc/lib/generic/math/clc_pow.inc index 98e154984aaa3..8b1f820268ba0 100644 --- a/libclc/clc/lib/generic/math/clc_pow.inc +++ b/libclc/clc/lib/generic/math/clc_pow.inc @@ -330,6 +330,15 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081; const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10; + // If v is so large that we need to return INFINITY, or so small that we + // need to return 0, set v to known values that will produce that result. Do + // not try to continue the computation with the original v and patch it up + // afterwards because v may be so large that temp is out of range of int, in + // which case that conversion, and a value based on that conversion being + // passed to __clc_ldexp, results in undefined behavior. + v = v > max_exp_arg ? 1000.0 : v; + v = v < min_exp_arg ? -1000.0 : v; + __CLC_GENTYPE temp = v * sixtyfour_by_lnof2; __CLC_INTN n = __CLC_CONVERT_INTN(temp); __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n); @@ -357,10 +366,6 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, expv = __clc_fma(f, q, f2) + f1; expv = __clc_ldexp(expv, m); - - expv = v > max_exp_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)0x7FF0000000000000L) - : expv; - expv = v < min_exp_arg ? 0.0 : expv; } // See whether y is an integer. 
diff --git a/libclc/clc/lib/generic/math/clc_pown.inc b/libclc/clc/lib/generic/math/clc_pown.inc index 8bdc407e9ac82..483fd2faf2717 100644 --- a/libclc/clc/lib/generic/math/clc_pown.inc +++ b/libclc/clc/lib/generic/math/clc_pown.inc @@ -317,6 +317,15 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081; const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10; + // If v is so large that we need to return INFINITY, or so small that we + // need to return 0, set v to known values that will produce that result. Do + // not try to continue the computation with the original v and patch it up + // afterwards because v may be so large that temp is out of range of int, in + // which case that conversion, and a value based on that conversion being + // passed to __clc_ldexp, results in undefined behavior. + v = v > max_exp_arg ? 1000.0 : v; + v = v < min_exp_arg ? -1000.0 : v; + __CLC_GENTYPE temp = v * sixtyfour_by_lnof2; __CLC_INTN n = __CLC_CONVERT_INTN(temp); __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n); @@ -344,10 +353,6 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, expv = __clc_fma(f, q, f2) + f1; expv = __clc_ldexp(expv, m); - - expv = v > max_exp_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)0x7FF0000000000000L) - : expv; - expv = v < min_exp_arg ? 0.0 : expv; } // See whether y is an integer. diff --git a/libclc/clc/lib/generic/math/clc_powr.inc b/libclc/clc/lib/generic/math/clc_powr.inc index fbdf3d85de2b7..1244f7f6ac5d6 100644 --- a/libclc/clc/lib/generic/math/clc_powr.inc +++ b/libclc/clc/lib/generic/math/clc_powr.inc @@ -316,6 +316,15 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081; const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10; + // If v is so large that we need to return INFINITY, or so small that we + // need to return 0, set v to known values that will produce that result. 
Do + // not try to continue the computation with the original v and patch it up + // afterwards because v may be so large that temp is out of range of int, in + // which case that conversion, and a value based on that conversion being + // passed to __clc_ldexp, results in undefined behavior. + v = v > max_exp_arg ? 1000.0 : v; + v = v < min_exp_arg ? -1000.0 : v; + __CLC_GENTYPE temp = v * sixtyfour_by_lnof2; __CLC_INTN n = __CLC_CONVERT_INTN(temp); __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n); @@ -343,10 +352,6 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, expv = __clc_fma(f, q, f2) + f1; expv = __clc_ldexp(expv, m); - - expv = v > max_exp_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)0x7FF0000000000000L) - : expv; - expv = v < min_exp_arg ? 0.0 : expv; } // See whether y is an integer. diff --git a/libclc/clc/lib/generic/math/clc_rootn.inc b/libclc/clc/lib/generic/math/clc_rootn.inc index 0c459ae5c3cbb..996f88f145357 100644 --- a/libclc/clc/lib/generic/math/clc_rootn.inc +++ b/libclc/clc/lib/generic/math/clc_rootn.inc @@ -323,6 +323,15 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081; const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10; + // If v is so large that we need to return INFINITY, or so small that we + // need to return 0, set v to known values that will produce that result. Do + // not try to continue the computation with the original v and patch it up + // afterwards because v may be so large that temp is out of range of int, in + // which case that conversion, and a value based on that conversion being + // passed to __clc_ldexp, results in undefined behavior. + v = v > max_exp_arg ? 1000.0 : v; + v = v < min_exp_arg ? 
-1000.0 : v; + __CLC_GENTYPE temp = v * sixtyfour_by_lnof2; __CLC_INTN n = __CLC_CONVERT_INTN(temp); __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n); @@ -350,10 +359,6 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, expv = __clc_fma(f, q, f2) + f1; expv = __clc_ldexp(expv, m); - - expv = v > max_exp_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)0x7FF0000000000000L) - : expv; - expv = v < min_exp_arg ? 0.0 : expv; } // See whether y is an integer. diff --git a/libclc/opencl/include/clc/opencl/math/ldexp.h b/libclc/opencl/include/clc/opencl/math/ldexp.h index 6dcd2a9548d09..ca50ae6a98312 100644 --- a/libclc/opencl/include/clc/opencl/math/ldexp.h +++ b/libclc/opencl/include/clc/opencl/math/ldexp.h @@ -6,5 +6,10 @@ // //===----------------------------------------------------------------------===// +#define __CLC_FUNCTION ldexp +#define __CLC_BODY +#include +#undef __CLC_FUNCTION + #define __CLC_BODY #include diff --git a/libclc/opencl/include/clc/opencl/math/ldexp.inc b/libclc/opencl/include/clc/opencl/math/ldexp.inc index 116acdff41d37..b5a5cfcafdd53 100644 --- a/libclc/opencl/include/clc/opencl/math/ldexp.inc +++ b/libclc/opencl/include/clc/opencl/math/ldexp.inc @@ -6,10 +6,8 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE ldexp(__CLC_GENTYPE x, int n); - #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE ldexp(__CLC_GENTYPE x, __CLC_INTN n); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE ldexp(__CLC_GENTYPE x, int n); #endif diff --git a/libclc/opencl/include/clc/opencl/math/pown.h b/libclc/opencl/include/clc/opencl/math/pown.h index 1d38c68947ba1..bbdf8f8b6e91e 100644 --- a/libclc/opencl/include/clc/opencl/math/pown.h +++ b/libclc/opencl/include/clc/opencl/math/pown.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #define __CLC_FUNCTION pown -#define __CLC_BODY +#define __CLC_BODY #include diff --git 
a/libclc/opencl/include/clc/opencl/math/rootn.h b/libclc/opencl/include/clc/opencl/math/rootn.h index 789f31596d1cd..669aeefb273a9 100644 --- a/libclc/opencl/include/clc/opencl/math/rootn.h +++ b/libclc/opencl/include/clc/opencl/math/rootn.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#define __CLC_BODY +#define __CLC_BODY #define __CLC_FUNCTION rootn #include diff --git a/libclc/opencl/lib/clspv/math/fma.cl b/libclc/opencl/lib/clspv/math/fma.cl index 2722018121224..0f3141a0e09ee 100644 --- a/libclc/opencl/lib/clspv/math/fma.cl +++ b/libclc/opencl/lib/clspv/math/fma.cl @@ -6,8 +6,12 @@ // //===----------------------------------------------------------------------===// -#include #include #include -_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_sw_fma, float, float, float) +#define __FLOAT_ONLY +#define FUNCTION fma +#define __CLC_FUNCTION(x) __clc_sw_fma +#define __CLC_BODY + +#include diff --git a/libclc/opencl/lib/generic/common/degrees.cl b/libclc/opencl/lib/generic/common/degrees.cl index 8b17fe4321297..a86003c170bff 100644 --- a/libclc/opencl/lib/generic/common/degrees.cl +++ b/libclc/opencl/lib/generic/common/degrees.cl @@ -6,22 +6,10 @@ // //===----------------------------------------------------------------------===// -#include #include #include -_CLC_DEFINE_UNARY_BUILTIN(float, degrees, __clc_degrees, float) +#define FUNCTION degrees +#define __CLC_BODY -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEFINE_UNARY_BUILTIN(double, degrees, __clc_degrees, double) - -#endif - -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN(half, degrees, __clc_degrees, half) - -#endif +#include diff --git a/libclc/opencl/lib/generic/common/radians.cl b/libclc/opencl/lib/generic/common/radians.cl index 1c58c6c4da6f3..b45653be2e782 100644 --- a/libclc/opencl/lib/generic/common/radians.cl +++ b/libclc/opencl/lib/generic/common/radians.cl @@ -6,22 +6,10 
@@ // //===----------------------------------------------------------------------===// -#include #include #include -_CLC_DEFINE_UNARY_BUILTIN(float, radians, __clc_radians, float) +#define FUNCTION radians +#define __CLC_BODY -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEFINE_UNARY_BUILTIN(double, radians, __clc_radians, double) - -#endif - -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN(half, radians, __clc_radians, half) - -#endif +#include diff --git a/libclc/opencl/lib/generic/math/fma.cl b/libclc/opencl/lib/generic/math/fma.cl index ee3395bb2c648..c077357a44f0d 100644 --- a/libclc/opencl/lib/generic/math/fma.cl +++ b/libclc/opencl/lib/generic/math/fma.cl @@ -6,23 +6,11 @@ // //===----------------------------------------------------------------------===// -#include #include #include #include -_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_fma, float, float, float) +#define FUNCTION fma +#define __CLC_BODY -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEFINE_TERNARY_BUILTIN(double, fma, __clc_fma, double, double, double) - -#endif - -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_TERNARY_BUILTIN(half, fma, __clc_fma, half, half, half) - -#endif +#include diff --git a/libclc/opencl/lib/generic/math/ldexp.cl b/libclc/opencl/lib/generic/math/ldexp.cl index e3b9b2b3f1363..069ba8251feba 100644 --- a/libclc/opencl/lib/generic/math/ldexp.cl +++ b/libclc/opencl/lib/generic/math/ldexp.cl @@ -6,27 +6,14 @@ // //===----------------------------------------------------------------------===// -#include #include #include -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(float, ldexp, __clc_ldexp, float, int) +#define FUNCTION ldexp +#define __CLC_FUNCTION(x) __clc_ldexp +#define __CLC_BODY -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(double, ldexp, __clc_ldexp, double, int) - 
-#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(half, ldexp, __clc_ldexp, half, int) - -#endif +#include // This defines all the ldexp(GENTYPE, int) variants #define __CLC_BODY diff --git a/libclc/opencl/lib/generic/math/mad.cl b/libclc/opencl/lib/generic/math/mad.cl index 20e6903094454..39aa8e884cc03 100644 --- a/libclc/opencl/lib/generic/math/mad.cl +++ b/libclc/opencl/lib/generic/math/mad.cl @@ -6,22 +6,10 @@ // //===----------------------------------------------------------------------===// -#include #include #include -_CLC_DEFINE_TERNARY_BUILTIN(float, mad, __clc_mad, float, float, float) +#define FUNCTION mad +#define __CLC_BODY -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEFINE_TERNARY_BUILTIN(double, mad, __clc_mad, double, double, double) - -#endif - -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_TERNARY_BUILTIN(half, mad, __clc_mad, half, half, half) - -#endif +#include diff --git a/libclc/opencl/lib/generic/math/nextafter.cl b/libclc/opencl/lib/generic/math/nextafter.cl index ecb187c53069e..6a5a745f82526 100644 --- a/libclc/opencl/lib/generic/math/nextafter.cl +++ b/libclc/opencl/lib/generic/math/nextafter.cl @@ -6,27 +6,11 @@ // //===----------------------------------------------------------------------===// -#include #include #include -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(float, nextafter, __clc_nextafter, - float, float) +#define FUNCTION nextafter +#define __CLC_FUNCTION(x) __clc_nextafter +#define __CLC_BODY -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(double, nextafter, __clc_nextafter, - double, double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_BINARY_BUILTIN_NO_SCALARIZE(half, nextafter, __clc_nextafter, half, - half) - -#endif +#include diff --git 
a/libclc/opencl/lib/generic/math/pown.cl b/libclc/opencl/lib/generic/math/pown.cl index a2ed523a41f74..115bae3406f0e 100644 --- a/libclc/opencl/lib/generic/math/pown.cl +++ b/libclc/opencl/lib/generic/math/pown.cl @@ -10,5 +10,5 @@ #include #define FUNCTION pown -#define __CLC_BODY +#define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/rootn.cl b/libclc/opencl/lib/generic/math/rootn.cl index 9f737151b3903..0e1acc95470df 100644 --- a/libclc/opencl/lib/generic/math/rootn.cl +++ b/libclc/opencl/lib/generic/math/rootn.cl @@ -10,5 +10,5 @@ #include #define FUNCTION rootn -#define __CLC_BODY +#define __CLC_BODY #include diff --git a/libclc/opencl/lib/spirv/math/fma.cl b/libclc/opencl/lib/spirv/math/fma.cl index 2722018121224..0f3141a0e09ee 100644 --- a/libclc/opencl/lib/spirv/math/fma.cl +++ b/libclc/opencl/lib/spirv/math/fma.cl @@ -6,8 +6,12 @@ // //===----------------------------------------------------------------------===// -#include #include #include -_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_sw_fma, float, float, float) +#define __FLOAT_ONLY +#define FUNCTION fma +#define __CLC_FUNCTION(x) __clc_sw_fma +#define __CLC_BODY + +#include diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 2eb1921069776..00fad3ff802a8 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -59,7 +59,7 @@ "`P2248R8 `__","Enabling list-initialization for algorithms","2024-03 (Tokyo)","","","" "`P2810R4 `__","``is_debugger_present`` ``is_replaceable``","2024-03 (Tokyo)","","","" "`P1068R11 `__","Vector API for random number generation","2024-03 (Tokyo)","","","" -"`P2944R3 `__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","The changes to ``optional``, ``tuple`` and ``variant`` are not yet implemented" +"`P2944R3 `__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","The changes to ``optional`` and ``tuple`` are not yet implemented" "`P2642R6 
`__","Padded ``mdspan`` layouts","2024-03 (Tokyo)","","","" "`P3029R1 `__","Better ``mdspan``'s CTAD","2024-03 (Tokyo)","|Complete|","19","" "","","","","","" diff --git a/libcxx/include/variant b/libcxx/include/variant index dac6f786cc198..ede9f486ecc2e 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -242,6 +242,7 @@ namespace std { # include <__type_traits/is_assignable.h> # include <__type_traits/is_constructible.h> # include <__type_traits/is_convertible.h> +# include <__type_traits/is_core_convertible.h> # include <__type_traits/is_destructible.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> @@ -1442,6 +1443,11 @@ struct __convert_to_bool { }; template +# if _LIBCPP_STD_VER >= 26 + requires(requires(const _Types& __t) { + { __t == __t } -> __core_convertible_to; + } && ...) +# endif _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const variant<_Types...>& __lhs, const variant<_Types...>& __rhs) { using __variant_detail::__visitation::__variant; if (__lhs.index() != __rhs.index()) @@ -1474,6 +1480,11 @@ operator<=>(const variant<_Types...>& __lhs, const variant<_Types...>& __rhs) { # endif // _LIBCPP_STD_VER >= 20 template +# if _LIBCPP_STD_VER >= 26 + requires(requires(const _Types& __t) { + { __t != __t } -> __core_convertible_to; + } && ...) +# endif _LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const variant<_Types...>& __lhs, const variant<_Types...>& __rhs) { using __variant_detail::__visitation::__variant; if (__lhs.index() != __rhs.index()) @@ -1484,6 +1495,11 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const variant<_Types...>& __lhs, } template +# if _LIBCPP_STD_VER >= 26 + requires(requires(const _Types& __t) { + { __t < __t } -> __core_convertible_to; + } && ...) 
+# endif _LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const variant<_Types...>& __lhs, const variant<_Types...>& __rhs) { using __variant_detail::__visitation::__variant; if (__rhs.valueless_by_exception()) @@ -1498,6 +1514,11 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const variant<_Types...>& __lhs, } template +# if _LIBCPP_STD_VER >= 26 + requires(requires(const _Types& __t) { + { __t > __t } -> __core_convertible_to; + } && ...) +# endif _LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const variant<_Types...>& __lhs, const variant<_Types...>& __rhs) { using __variant_detail::__visitation::__variant; if (__lhs.valueless_by_exception()) @@ -1512,6 +1533,11 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const variant<_Types...>& __lhs, } template +# if _LIBCPP_STD_VER >= 26 + requires(requires(const _Types& __t) { + { __t <= __t } -> __core_convertible_to; + } && ...) +# endif _LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const variant<_Types...>& __lhs, const variant<_Types...>& __rhs) { using __variant_detail::__visitation::__variant; if (__lhs.valueless_by_exception()) @@ -1526,6 +1552,11 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const variant<_Types...>& __lhs, } template +# if _LIBCPP_STD_VER >= 26 + requires(requires(const _Types& __t) { + { __t >= __t } -> __core_convertible_to; + } && ...) 
+# endif _LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const variant<_Types...>& __lhs, const variant<_Types...>& __rhs) { using __variant_detail::__visitation::__variant; if (__rhs.valueless_by_exception()) diff --git a/libcxx/test/std/utilities/variant/variant.relops/relops.pass.cpp b/libcxx/test/std/utilities/variant/variant.relops/relops.pass.cpp index c1a5b8e474a74..2c00703662687 100644 --- a/libcxx/test/std/utilities/variant/variant.relops/relops.pass.cpp +++ b/libcxx/test/std/utilities/variant/variant.relops/relops.pass.cpp @@ -39,8 +39,57 @@ #include #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. + +// == +static_assert(HasOperatorEqual>); +static_assert(HasOperatorEqual>); + +static_assert(!HasOperatorEqual>); +static_assert(!HasOperatorEqual>); + +// > +static_assert(HasOperatorGreaterThan>); +static_assert(HasOperatorGreaterThan>); + +static_assert(!HasOperatorGreaterThan>); +static_assert(!HasOperatorGreaterThan>); + +// >= +static_assert(HasOperatorGreaterThanEqual>); +static_assert(HasOperatorGreaterThanEqual>); + +static_assert(!HasOperatorGreaterThanEqual>); +static_assert(!HasOperatorGreaterThanEqual>); + +// < +static_assert(HasOperatorLessThan>); +static_assert(HasOperatorLessThan>); + +static_assert(!HasOperatorLessThan>); +static_assert(!HasOperatorLessThan>); + +// <= +static_assert(HasOperatorLessThanEqual>); +static_assert(HasOperatorLessThanEqual>); + +static_assert(!HasOperatorLessThanEqual>); +static_assert(!HasOperatorLessThanEqual>); + +// != +static_assert(HasOperatorNotEqual>); +static_assert(HasOperatorNotEqual>); + +static_assert(!HasOperatorNotEqual>); +static_assert(!HasOperatorNotEqual>); + +#endif + #ifndef TEST_HAS_NO_EXCEPTIONS struct MakeEmptyT { MakeEmptyT() = default; diff --git a/libcxx/test/std/utilities/variant/variant.relops/relops_bool_conv.verify.cpp b/libcxx/test/std/utilities/variant/variant.relops/relops_bool_conv.verify.cpp index 
64248171d1146..392a234e6a9b2 100644 --- a/libcxx/test/std/utilities/variant/variant.relops/relops_bool_conv.verify.cpp +++ b/libcxx/test/std/utilities/variant/variant.relops/relops_bool_conv.verify.cpp @@ -41,7 +41,9 @@ #include "test_macros.h" - +#if TEST_STD_VER >= 26 +// expected-no-diagnostics +#else struct MyBoolExplicit { bool value; constexpr explicit MyBoolExplicit(bool v) : value(v) {} @@ -70,8 +72,7 @@ inline constexpr MyBoolExplicit operator>=(const ComparesToMyBoolExplicit& LHS, return MyBoolExplicit(LHS.value >= RHS.value); } - -int main(int, char**) { +void test() { using V = std::variant; V v1(42); V v2(101); @@ -83,6 +84,6 @@ int main(int, char**) { (void)(v1 <= v2); // expected-note {{here}} (void)(v1 > v2); // expected-note {{here}} (void)(v1 >= v2); // expected-note {{here}} - - return 0; } + +#endif diff --git a/libcxx/test/support/test_comparisons.h b/libcxx/test/support/test_comparisons.h index d9729e0451b49..e37ab44828c70 100644 --- a/libcxx/test/support/test_comparisons.h +++ b/libcxx/test/support/test_comparisons.h @@ -271,12 +271,31 @@ struct PartialOrder { template concept HasOperatorEqual = requires(T1 t1, T2 t2) { t1 == t2; }; +template +concept HasOperatorGreaterThan = requires(T1 t1, T2 t2) { t1 > t2; }; + +template +concept HasOperatorGreaterThanEqual = requires(T1 t1, T2 t2) { t1 >= t2; }; +template +concept HasOperatorLessThan = requires(T1 t1, T2 t2) { t1 < t2; }; + +template +concept HasOperatorLessThanEqual = requires(T1 t1, T2 t2) { t1 <= t2; }; + +template +concept HasOperatorNotEqual = requires(T1 t1, T2 t2) { t1 != t2; }; + template concept HasOperatorSpaceship = requires(T1 t1, T2 t2) { t1 <=> t2; }; struct NonComparable {}; static_assert(!std::equality_comparable); static_assert(!HasOperatorEqual); +static_assert(!HasOperatorGreaterThan); +static_assert(!HasOperatorGreaterThanEqual); +static_assert(!HasOperatorLessThan); +static_assert(!HasOperatorLessThanEqual); +static_assert(!HasOperatorNotEqual); 
static_assert(!HasOperatorSpaceship); class EqualityComparable { @@ -290,6 +309,28 @@ class EqualityComparable { }; static_assert(std::equality_comparable); static_assert(HasOperatorEqual); +static_assert(HasOperatorNotEqual); + +class ThreeWayComparable { +public: + constexpr ThreeWayComparable(int value) : value_{value} {}; + + friend constexpr bool operator==(const ThreeWayComparable&, const ThreeWayComparable&) noexcept = default; + friend constexpr std::strong_ordering + operator<=>(const ThreeWayComparable&, const ThreeWayComparable&) noexcept = default; + +private: + int value_; +}; +static_assert(std::equality_comparable); +static_assert(std::three_way_comparable); +static_assert(HasOperatorEqual); +static_assert(HasOperatorGreaterThan); +static_assert(HasOperatorGreaterThanEqual); +static_assert(HasOperatorLessThan); +static_assert(HasOperatorLessThanEqual); +static_assert(HasOperatorNotEqual); +static_assert(HasOperatorSpaceship); #endif // TEST_STD_VER >= 20 diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 4e7f92dd1991a..b306b2013445c 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -4468,7 +4468,9 @@ Node *AbstractManglingParser::parseType() { return nullptr; if (!consumeIf('_')) return nullptr; - return make(Size, Signed); + // The front end expects this to be available for Substitution + Result = make(Size, Signed); + break; } // ::= Di # char32_t case 'i': diff --git a/libcxxabi/test/DemangleTestCases.inc b/libcxxabi/test/DemangleTestCases.inc index 1e3f7459deaa2..2721d2aa5504e 100644 --- a/libcxxabi/test/DemangleTestCases.inc +++ b/libcxxabi/test/DemangleTestCases.inc @@ -6,6 +6,7 @@ {"_Z1fDU10_", "f(unsigned _BitInt(10))"}, {"_Z1fIfEvDUstPT__", "void f(unsigned _BitInt(sizeof (float*)))"}, {"_Z1fIiEvDBstPT__", "void f(_BitInt(sizeof (int*)))"}, +{"_Z6myfuncRDB8_S0_", "myfunc(_BitInt(8)&, _BitInt(8)&)"}, {"_Z4testI1A1BE1Cv", "C test()"}, 
{"_Z4testI1A1BET0_T_S3_", "B test(A, A)"}, {"_ZN1SgtEi", "S::operator>(int)"}, diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index 9f82466a83417..2087ef2a11562 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -602,10 +602,6 @@ class Debugger : public std::enable_shared_from_this, void FlushProcessOutput(Process &process, bool flush_stdout, bool flush_stderr); - void AddProtocolServer(lldb::ProtocolServerSP protocol_server_sp); - void RemoveProtocolServer(lldb::ProtocolServerSP protocol_server_sp); - lldb::ProtocolServerSP GetProtocolServer(llvm::StringRef protocol) const; - SourceManager::SourceFileCache &GetSourceFileCache() { return m_source_file_cache; } @@ -776,8 +772,6 @@ class Debugger : public std::enable_shared_from_this, mutable std::mutex m_progress_reports_mutex; /// @} - llvm::SmallVector m_protocol_servers; - std::mutex m_destroy_callback_mutex; lldb::callback_token_t m_destroy_callback_next_token = 0; struct DestroyCallbackInfo { diff --git a/lldb/include/lldb/Core/ProtocolServer.h b/lldb/include/lldb/Core/ProtocolServer.h index fafe460904323..937256c10aec1 100644 --- a/lldb/include/lldb/Core/ProtocolServer.h +++ b/lldb/include/lldb/Core/ProtocolServer.h @@ -20,8 +20,9 @@ class ProtocolServer : public PluginInterface { ProtocolServer() = default; virtual ~ProtocolServer() = default; - static lldb::ProtocolServerSP Create(llvm::StringRef name, - Debugger &debugger); + static ProtocolServer *GetOrCreate(llvm::StringRef name); + + static std::vector GetSupportedProtocols(); struct Connection { Socket::SocketProtocol protocol; diff --git a/lldb/include/lldb/Target/MemoryTagManager.h b/lldb/include/lldb/Target/MemoryTagManager.h index 6bd4180fff703..5b7219692d77f 100644 --- a/lldb/include/lldb/Target/MemoryTagManager.h +++ b/lldb/include/lldb/Target/MemoryTagManager.h @@ -122,11 +122,15 @@ class MemoryTagManager { // // 'reader' will always be a wrapper around a CoreFile in real 
use // but allows testing without having to mock a CoreFile. + // + // This call will fail in the case that the core file segment does not contain + // enough data to read all the tags. typedef std::function CoreReaderFn; - std::vector virtual UnpackTagsFromCoreFileSegment( - CoreReaderFn reader, lldb::addr_t tag_segment_virtual_address, - lldb::addr_t tag_segment_data_address, lldb::addr_t addr, - size_t len) const = 0; + llvm:: + Expected> virtual UnpackTagsFromCoreFileSegment( + CoreReaderFn reader, lldb::addr_t tag_segment_virtual_address, + lldb::addr_t tag_segment_data_address, lldb::addr_t addr, + size_t len) const = 0; // Pack uncompressed tags into their storage format (e.g. for gdb QMemTags). // Checks that each tag is within the expected value range. diff --git a/lldb/include/lldb/Utility/XcodeSDK.h b/lldb/include/lldb/Utility/XcodeSDK.h index ceb8abb8c502d..a1a0ec415b90e 100644 --- a/lldb/include/lldb/Utility/XcodeSDK.h +++ b/lldb/include/lldb/Utility/XcodeSDK.h @@ -93,19 +93,6 @@ class XcodeSDK { static bool SDKSupportsModules(Type type, llvm::VersionTuple version); static bool SDKSupportsModules(Type desired_type, const FileSpec &sdk_path); - /// Returns true if the SDK for the specified triple supports - /// builtin modules in system headers. - /// - /// NOTE: should be kept in sync with sdkSupportsBuiltinModules in - /// Toolchains/Darwin.cpp - /// - /// FIXME: this function will be removed once LLDB's ClangExpressionParser - /// constructs the compiler instance through the driver/toolchain. See \ref - /// SetupImportStdModuleLangOpts - /// - static bool SDKSupportsBuiltinModules(const llvm::Triple &target_triple, - llvm::VersionTuple sdk_version); - /// Return the canonical SDK name, such as "macosx" for the macOS SDK. static std::string GetCanonicalName(Info info); /// Return the best-matching SDK type for a specific triple. 
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 558818e8e2309..2bc85a2d2afa6 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -391,7 +391,7 @@ typedef std::shared_ptr PlatformSP; typedef std::shared_ptr ProcessSP; typedef std::shared_ptr ProcessAttachInfoSP; typedef std::shared_ptr ProcessLaunchInfoSP; -typedef std::shared_ptr ProtocolServerSP; +typedef std::unique_ptr ProtocolServerUP; typedef std::weak_ptr ProcessWP; typedef std::shared_ptr RegisterCheckpointSP; typedef std::shared_ptr RegisterContextSP; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index 34eaaa8e581e9..249b25c251ac2 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -81,8 +81,7 @@ typedef lldb::PlatformSP (*PlatformCreateInstance)(bool force, typedef lldb::ProcessSP (*ProcessCreateInstance)( lldb::TargetSP target_sp, lldb::ListenerSP listener_sp, const FileSpec *crash_file_path, bool can_connect); -typedef lldb::ProtocolServerSP (*ProtocolServerCreateInstance)( - Debugger &debugger); +typedef lldb::ProtocolServerUP (*ProtocolServerCreateInstance)(); typedef lldb::RegisterTypeBuilderSP (*RegisterTypeBuilderCreateInstance)( Target &target); typedef lldb::ScriptInterpreterSP (*ScriptInterpreterCreateInstance)( diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 753de22b9cfee..b603c35c8df09 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -92,6 +92,9 @@ class MockGDBServerResponder: class RESPONSE_DISCONNECT: pass + class RESPONSE_NONE: + pass + def __init__(self): self.packetLog = [] @@ -181,6 +184,8 @@ def respond(self, packet): return self.qQueryGDBServer() if packet == "qHostInfo": return self.qHostInfo() + if packet.startswith("qEcho"): + return 
self.qEcho(int(packet.split(":")[1])) if packet == "qGetWorkingDir": return self.qGetWorkingDir() if packet == "qOffsets": @@ -237,6 +242,9 @@ def qProcessInfo(self): def qHostInfo(self): return "ptrsize:8;endian:little;" + def qEcho(self): + return "E04" + def qQueryGDBServer(self): return "E04" @@ -655,6 +663,8 @@ def _handlePacket(self, packet): if not isinstance(response, list): response = [response] for part in response: + if part is MockGDBServerResponder.RESPONSE_NONE: + continue if part is MockGDBServerResponder.RESPONSE_DISCONNECT: raise self.TerminateConnectionException() self._sendPacket(part) diff --git a/lldb/source/Commands/CommandObjectProtocolServer.cpp b/lldb/source/Commands/CommandObjectProtocolServer.cpp index 115754769f3e3..55bd42ed1a533 100644 --- a/lldb/source/Commands/CommandObjectProtocolServer.cpp +++ b/lldb/source/Commands/CommandObjectProtocolServer.cpp @@ -23,20 +23,6 @@ using namespace lldb_private; #define LLDB_OPTIONS_mcp #include "CommandOptions.inc" -static std::vector GetSupportedProtocols() { - std::vector supported_protocols; - size_t i = 0; - - for (llvm::StringRef protocol_name = - PluginManager::GetProtocolServerPluginNameAtIndex(i++); - !protocol_name.empty(); - protocol_name = PluginManager::GetProtocolServerPluginNameAtIndex(i++)) { - supported_protocols.push_back(protocol_name); - } - - return supported_protocols; -} - class CommandObjectProtocolServerStart : public CommandObjectParsed { public: CommandObjectProtocolServerStart(CommandInterpreter &interpreter) @@ -57,12 +43,11 @@ class CommandObjectProtocolServerStart : public CommandObjectParsed { } llvm::StringRef protocol = args.GetArgumentAtIndex(0); - std::vector supported_protocols = GetSupportedProtocols(); - if (llvm::find(supported_protocols, protocol) == - supported_protocols.end()) { + ProtocolServer *server = ProtocolServer::GetOrCreate(protocol); + if (!server) { result.AppendErrorWithFormatv( "unsupported protocol: {0}. 
Supported protocols are: {1}", protocol, - llvm::join(GetSupportedProtocols(), ", ")); + llvm::join(ProtocolServer::GetSupportedProtocols(), ", ")); return; } @@ -72,10 +57,6 @@ class CommandObjectProtocolServerStart : public CommandObjectParsed { } llvm::StringRef connection_uri = args.GetArgumentAtIndex(1); - ProtocolServerSP server_sp = GetDebugger().GetProtocolServer(protocol); - if (!server_sp) - server_sp = ProtocolServer::Create(protocol, GetDebugger()); - const char *connection_error = "unsupported connection specifier, expected 'accept:///path' or " "'listen://[host]:port', got '{0}'."; @@ -98,14 +79,12 @@ class CommandObjectProtocolServerStart : public CommandObjectParsed { formatv("[{0}]:{1}", uri->hostname.empty() ? "0.0.0.0" : uri->hostname, uri->port.value_or(0)); - if (llvm::Error error = server_sp->Start(connection)) { + if (llvm::Error error = server->Start(connection)) { result.AppendErrorWithFormatv("{0}", llvm::fmt_consume(std::move(error))); return; } - GetDebugger().AddProtocolServer(server_sp); - - if (Socket *socket = server_sp->GetSocket()) { + if (Socket *socket = server->GetSocket()) { std::string address = llvm::join(socket->GetListeningConnectionURI(), ", "); result.AppendMessageWithFormatv( @@ -134,30 +113,18 @@ class CommandObjectProtocolServerStop : public CommandObjectParsed { } llvm::StringRef protocol = args.GetArgumentAtIndex(0); - std::vector supported_protocols = GetSupportedProtocols(); - if (llvm::find(supported_protocols, protocol) == - supported_protocols.end()) { + ProtocolServer *server = ProtocolServer::GetOrCreate(protocol); + if (!server) { result.AppendErrorWithFormatv( "unsupported protocol: {0}. 
Supported protocols are: {1}", protocol, - llvm::join(GetSupportedProtocols(), ", ")); + llvm::join(ProtocolServer::GetSupportedProtocols(), ", ")); return; } - Debugger &debugger = GetDebugger(); - - ProtocolServerSP server_sp = debugger.GetProtocolServer(protocol); - if (!server_sp) { - result.AppendError( - llvm::formatv("no {0} protocol server running", protocol).str()); - return; - } - - if (llvm::Error error = server_sp->Stop()) { + if (llvm::Error error = server->Stop()) { result.AppendErrorWithFormatv("{0}", llvm::fmt_consume(std::move(error))); return; } - - debugger.RemoveProtocolServer(server_sp); } }; diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 33d1053fd8a65..445baf1f63785 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -2376,26 +2376,3 @@ llvm::ThreadPoolInterface &Debugger::GetThreadPool() { "Debugger::GetThreadPool called before Debugger::Initialize"); return *g_thread_pool; } - -void Debugger::AddProtocolServer(lldb::ProtocolServerSP protocol_server_sp) { - assert(protocol_server_sp && - GetProtocolServer(protocol_server_sp->GetPluginName()) == nullptr); - m_protocol_servers.push_back(protocol_server_sp); -} - -void Debugger::RemoveProtocolServer(lldb::ProtocolServerSP protocol_server_sp) { - auto it = llvm::find(m_protocol_servers, protocol_server_sp); - if (it != m_protocol_servers.end()) - m_protocol_servers.erase(it); -} - -lldb::ProtocolServerSP -Debugger::GetProtocolServer(llvm::StringRef protocol) const { - for (ProtocolServerSP protocol_server_sp : m_protocol_servers) { - if (!protocol_server_sp) - continue; - if (protocol_server_sp->GetPluginName() == protocol) - return protocol_server_sp; - } - return nullptr; -} diff --git a/lldb/source/Core/ProtocolServer.cpp b/lldb/source/Core/ProtocolServer.cpp index d57a047afa7b2..41636cdacdecc 100644 --- a/lldb/source/Core/ProtocolServer.cpp +++ b/lldb/source/Core/ProtocolServer.cpp @@ -12,10 +12,36 @@ using namespace 
lldb_private; using namespace lldb; -ProtocolServerSP ProtocolServer::Create(llvm::StringRef name, - Debugger &debugger) { +ProtocolServer *ProtocolServer::GetOrCreate(llvm::StringRef name) { + static std::mutex g_mutex; + static llvm::StringMap g_protocol_server_instances; + + std::lock_guard guard(g_mutex); + + auto it = g_protocol_server_instances.find(name); + if (it != g_protocol_server_instances.end()) + return it->second.get(); + if (ProtocolServerCreateInstance create_callback = - PluginManager::GetProtocolCreateCallbackForPluginName(name)) - return create_callback(debugger); + PluginManager::GetProtocolCreateCallbackForPluginName(name)) { + auto pair = + g_protocol_server_instances.try_emplace(name, create_callback()); + return pair.first->second.get(); + } + return nullptr; } + +std::vector ProtocolServer::GetSupportedProtocols() { + std::vector supported_protocols; + size_t i = 0; + + for (llvm::StringRef protocol_name = + PluginManager::GetProtocolServerPluginNameAtIndex(i++); + !protocol_name.empty(); + protocol_name = PluginManager::GetProtocolServerPluginNameAtIndex(i++)) { + supported_protocols.push_back(protocol_name); + } + + return supported_protocols; +} diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 7aa9cae5a5614..ffc76e6e93498 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -319,49 +319,6 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { StringRef m_filename; }; -/// Returns true if the SDK for the specified triple supports -/// builtin modules in system headers. This is used to decide -/// whether to pass -fbuiltin-headers-in-system-modules to -/// the compiler instance when compiling the `std` module. 
-static llvm::Expected -sdkSupportsBuiltinModules(lldb_private::Target &target) { - auto arch_spec = target.GetArchitecture(); - auto const &triple = arch_spec.GetTriple(); - auto module_sp = target.GetExecutableModule(); - if (!module_sp) - return llvm::createStringError("Executable module not found."); - - // Get SDK path that the target was compiled against. - auto platform_sp = target.GetPlatform(); - if (!platform_sp) - return llvm::createStringError("No Platform plugin found on target."); - - auto sdk_or_err = platform_sp->GetSDKPathFromDebugInfo(*module_sp); - if (!sdk_or_err) - return sdk_or_err.takeError(); - - // Use the SDK path from debug-info to find a local matching SDK directory. - auto sdk_path_or_err = - HostInfo::GetSDKRoot(HostInfo::SDKOptions{std::move(sdk_or_err->first)}); - if (!sdk_path_or_err) - return sdk_path_or_err.takeError(); - - auto VFS = FileSystem::Instance().GetVirtualFileSystem(); - if (!VFS) - return llvm::createStringError("No virtual filesystem available."); - - // Extract SDK version from the /path/to/some.sdk/SDKSettings.json - auto parsed_or_err = clang::parseDarwinSDKInfo(*VFS, *sdk_path_or_err); - if (!parsed_or_err) - return parsed_or_err.takeError(); - - auto maybe_sdk = *parsed_or_err; - if (!maybe_sdk) - return llvm::createStringError("Couldn't find Darwin SDK info."); - - return XcodeSDK::SDKSupportsBuiltinModules(triple, maybe_sdk->getVersion()); -} - static void SetupModuleHeaderPaths(CompilerInstance *compiler, std::vector include_directories, lldb::TargetSP target_sp) { @@ -705,7 +662,6 @@ static void SetupLangOpts(CompilerInstance &compiler, static void SetupImportStdModuleLangOpts(CompilerInstance &compiler, lldb_private::Target &target) { - Log *log = GetLog(LLDBLog::Expressions); LangOptions &lang_opts = compiler.getLangOpts(); lang_opts.Modules = true; // We want to implicitly build modules. 
@@ -723,12 +679,7 @@ static void SetupImportStdModuleLangOpts(CompilerInstance &compiler, lang_opts.GNUKeywords = true; lang_opts.CPlusPlus11 = true; - if (auto supported_or_err = sdkSupportsBuiltinModules(target)) - lang_opts.BuiltinHeadersInSystemModules = !*supported_or_err; - else - LLDB_LOG_ERROR(log, supported_or_err.takeError(), - "Failed to determine BuiltinHeadersInSystemModules when " - "setting up import-std-module: {0}"); + lang_opts.BuiltinHeadersInSystemModules = false; // The Darwin libc expects this macro to be set. lang_opts.GNUCVersion = 40201; diff --git a/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.cpp b/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.cpp index 7e25bc4ea2a28..9f60675e51904 100644 --- a/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.cpp +++ b/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.cpp @@ -247,7 +247,7 @@ MemoryTagManagerAArch64MTE::UnpackTagsData(const std::vector &tags, return unpacked; } -std::vector +llvm::Expected> MemoryTagManagerAArch64MTE::UnpackTagsFromCoreFileSegment( CoreReaderFn reader, lldb::addr_t tag_segment_virtual_address, lldb::addr_t tag_segment_data_address, lldb::addr_t addr, @@ -290,8 +290,12 @@ MemoryTagManagerAArch64MTE::UnpackTagsFromCoreFileSegment( const size_t bytes_copied = reader(tag_segment_data_address + file_offset_in_bytes, tag_bytes_to_read, tag_data.data()); - UNUSED_IF_ASSERT_DISABLED(bytes_copied); - assert(bytes_copied == tag_bytes_to_read); + if (bytes_copied != tag_bytes_to_read) { + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "Could not read tags from core file segment. 
Segment " + "is missing some or all tag data."); + } std::vector tags; tags.reserve(2 * tag_data.size()); diff --git a/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.h b/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.h index 365e176e5b1da..79d24ce78ecee 100644 --- a/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.h +++ b/lldb/source/Plugins/Process/Utility/MemoryTagManagerAArch64MTE.h @@ -44,7 +44,7 @@ class MemoryTagManagerAArch64MTE : public MemoryTagManager { UnpackTagsData(const std::vector &tags, size_t granules = 0) const override; - std::vector + llvm::Expected> UnpackTagsFromCoreFileSegment(CoreReaderFn reader, lldb::addr_t tag_segment_virtual_address, lldb::addr_t tag_segment_data_address, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.cpp index 394b62559da76..406fa06ea011a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.cpp @@ -180,7 +180,7 @@ bool GDBRemoteClientBase::Interrupt(std::chrono::seconds interrupt_timeout) { GDBRemoteCommunication::PacketResult GDBRemoteClientBase::SendPacketAndWaitForResponse( llvm::StringRef payload, StringExtractorGDBRemote &response, - std::chrono::seconds interrupt_timeout) { + std::chrono::seconds interrupt_timeout, bool sync_on_timeout) { Lock lock(*this, interrupt_timeout); if (!lock) { if (Log *log = GetLog(GDBRLog::Process)) @@ -191,7 +191,7 @@ GDBRemoteClientBase::SendPacketAndWaitForResponse( return PacketResult::ErrorSendFailed; } - return SendPacketAndWaitForResponseNoLock(payload, response); + return SendPacketAndWaitForResponseNoLock(payload, response, sync_on_timeout); } GDBRemoteCommunication::PacketResult @@ -236,14 +236,15 @@ GDBRemoteClientBase::SendPacketAndReceiveResponseWithOutputSupport( GDBRemoteCommunication::PacketResult 
GDBRemoteClientBase::SendPacketAndWaitForResponseNoLock( - llvm::StringRef payload, StringExtractorGDBRemote &response) { + llvm::StringRef payload, StringExtractorGDBRemote &response, + bool sync_on_timeout) { PacketResult packet_result = SendPacketNoLock(payload); if (packet_result != PacketResult::Success) return packet_result; const size_t max_response_retries = 3; for (size_t i = 0; i < max_response_retries; ++i) { - packet_result = ReadPacket(response, GetPacketTimeout(), true); + packet_result = ReadPacket(response, GetPacketTimeout(), sync_on_timeout); // Make sure we received a response if (packet_result != PacketResult::Success) return packet_result; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.h index af2abdf4da5cf..9c17a8c1de057 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.h @@ -61,7 +61,8 @@ class GDBRemoteClientBase : public GDBRemoteCommunication, public Broadcaster { // ErrorReplyTimeout. 
PacketResult SendPacketAndWaitForResponse( llvm::StringRef payload, StringExtractorGDBRemote &response, - std::chrono::seconds interrupt_timeout = std::chrono::seconds(0)); + std::chrono::seconds interrupt_timeout = std::chrono::seconds(0), + bool sync_on_timeout = true); PacketResult ReadPacketWithOutputSupport( StringExtractorGDBRemote &response, Timeout timeout, @@ -104,7 +105,8 @@ class GDBRemoteClientBase : public GDBRemoteCommunication, public Broadcaster { protected: PacketResult SendPacketAndWaitForResponseNoLock(llvm::StringRef payload, - StringExtractorGDBRemote &response); + StringExtractorGDBRemote &response, + bool sync_on_timeout = true); virtual void OnRunPacketSent(bool first); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp index 0d3ead840b080..2ca7099544bcc 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #if defined(__APPLE__) #define DEBUGSERVER_BASENAME "debugserver" @@ -354,8 +355,9 @@ GDBRemoteCommunication::WaitForPacketNoLock(StringExtractorGDBRemote &packet, disconnected = true; Disconnect(); } + } else { + timed_out = true; } - timed_out = true; break; case eConnectionStatusSuccess: // printf ("status = success but error = %s\n", @@ -894,11 +896,9 @@ FileSpec GDBRemoteCommunication::GetDebugserverPath(Platform *platform) { } Status GDBRemoteCommunication::StartDebugserverProcess( - const char *url, Platform *platform, ProcessLaunchInfo &launch_info, - uint16_t *port, const Args *inferior_args, shared_fd_t pass_comm_fd) { + std::variant comm, Platform *platform, + ProcessLaunchInfo &launch_info, const Args *inferior_args) { Log *log = GetLog(GDBRLog::Process); - LLDB_LOG(log, "Starting debug server: url={0}, port={1}", - url ? url : "", port ? 
*port : uint16_t(0)); FileSpec debugserver_file_spec = GetDebugserverPath(platform); if (!debugserver_file_spec) @@ -911,89 +911,58 @@ Status GDBRemoteCommunication::StartDebugserverProcess( #if !defined(__APPLE__) // First argument to lldb-server must be mode in which to run. - debugserver_args.AppendArgument(llvm::StringRef("gdbserver")); + debugserver_args.AppendArgument("gdbserver"); #endif - // If a url is supplied then use it - if (url && url[0]) - debugserver_args.AppendArgument(llvm::StringRef(url)); - - if (pass_comm_fd != SharedSocket::kInvalidFD) { - StreamString fd_arg; - fd_arg.Printf("--fd=%" PRIi64, (int64_t)pass_comm_fd); - debugserver_args.AppendArgument(fd_arg.GetString()); - // Send "pass_comm_fd" down to the inferior so it can use it to - // communicate back with this process. Ignored on Windows. - launch_info.AppendDuplicateFileAction((int64_t)pass_comm_fd, - (int64_t)pass_comm_fd); - } - // use native registers, not the GDB registers - debugserver_args.AppendArgument(llvm::StringRef("--native-regs")); + debugserver_args.AppendArgument("--native-regs"); if (launch_info.GetLaunchInSeparateProcessGroup()) - debugserver_args.AppendArgument(llvm::StringRef("--setsid")); + debugserver_args.AppendArgument("--setsid"); llvm::SmallString<128> named_pipe_path; // socket_pipe is used by debug server to communicate back either - // TCP port or domain socket name which it listens on. - // The second purpose of the pipe to serve as a synchronization point - + // TCP port or domain socket name which it listens on. However, we're not + // interested in the actual value here. + // The only reason for using the pipe is to serve as a synchronization point - // once data is written to the pipe, debug server is up and running. 
Pipe socket_pipe; - std::unique_ptr sock_up; + // If a url is supplied then use it + if (shared_fd_t *comm_fd = std::get_if(&comm)) { + LLDB_LOG(log, "debugserver communicates over fd {0}", comm_fd); + assert(*comm_fd != SharedSocket::kInvalidFD); + debugserver_args.AppendArgument(llvm::formatv("--fd={0}", *comm_fd).str()); + // Send "comm_fd" down to the inferior so it can use it to communicate back + // with this process. + launch_info.AppendDuplicateFileAction((int64_t)*comm_fd, (int64_t)*comm_fd); + } else { + llvm::StringRef url = std::get(comm); + LLDB_LOG(log, "debugserver listens on: {0}", url); + debugserver_args.AppendArgument(url); - // port is null when debug server should listen on domain socket - we're - // not interested in port value but rather waiting for debug server to - // become available. - if (pass_comm_fd == SharedSocket::kInvalidFD) { - if (url) { -// Create a temporary file to get the stdout/stderr and redirect the output of -// the command into this file. We will later read this file if all goes well -// and fill the data into "command_output_ptr" #if defined(__APPLE__) - // Binding to port zero, we need to figure out what port it ends up - // using using a named pipe... - Status error = socket_pipe.CreateWithUniqueName("debugserver-named-pipe", - false, named_pipe_path); - if (error.Fail()) { - LLDB_LOG(log, "named pipe creation failed: {0}", error); - return error; - } - debugserver_args.AppendArgument(llvm::StringRef("--named-pipe")); - debugserver_args.AppendArgument(named_pipe_path); + // Using a named pipe as debugserver does not support --pipe. 
+ Status error = socket_pipe.CreateWithUniqueName("debugserver-named-pipe", + false, named_pipe_path); + if (error.Fail()) { + LLDB_LOG(log, "named pipe creation failed: {0}", error); + return error; + } + debugserver_args.AppendArgument(llvm::StringRef("--named-pipe")); + debugserver_args.AppendArgument(named_pipe_path); #else - // Binding to port zero, we need to figure out what port it ends up - // using using an unnamed pipe... - Status error = socket_pipe.CreateNew(true); - if (error.Fail()) { - LLDB_LOG(log, "unnamed pipe creation failed: {0}", error); - return error; - } - pipe_t write = socket_pipe.GetWritePipe(); - debugserver_args.AppendArgument(llvm::StringRef("--pipe")); - debugserver_args.AppendArgument(llvm::to_string(write)); - launch_info.AppendCloseFileAction(socket_pipe.GetReadFileDescriptor()); -#endif - } else { - // No host and port given, so lets listen on our end and make the - // debugserver connect to us.. - if (llvm::Expected> expected_sock = - Socket::TcpListen("127.0.0.1:0")) - sock_up = std::move(*expected_sock); - else - return Status::FromError(expected_sock.takeError()); - - uint16_t port_ = sock_up->GetLocalPortNumber(); - // Send the host and port down that debugserver and specify an option - // so that it connects back to the port we are listening to in this - // process - debugserver_args.AppendArgument(llvm::StringRef("--reverse-connect")); - debugserver_args.AppendArgument( - llvm::formatv("127.0.0.1:{0}", port_).str()); - if (port) - *port = port_; + // Using an unnamed pipe as it's simpler. 
+ Status error = socket_pipe.CreateNew(true); + if (error.Fail()) { + LLDB_LOG(log, "unnamed pipe creation failed: {0}", error); + return error; } + pipe_t write = socket_pipe.GetWritePipe(); + debugserver_args.AppendArgument(llvm::StringRef("--pipe")); + debugserver_args.AppendArgument(llvm::to_string(write)); + launch_info.AppendCloseFileAction(socket_pipe.GetReadFileDescriptor()); +#endif } Environment host_env = Host::GetEnvironment(); @@ -1070,7 +1039,7 @@ Status GDBRemoteCommunication::StartDebugserverProcess( return error; } - if (pass_comm_fd != SharedSocket::kInvalidFD) + if (std::holds_alternative(comm)) return Status(); Status error; @@ -1084,55 +1053,30 @@ Status GDBRemoteCommunication::StartDebugserverProcess( if (socket_pipe.CanWrite()) socket_pipe.CloseWriteFileDescriptor(); - if (socket_pipe.CanRead()) { - // Read port from pipe with 10 second timeout. - std::string port_str; - while (error.Success()) { - char buf[10]; - if (llvm::Expected num_bytes = - socket_pipe.Read(buf, std::size(buf), std::chrono::seconds(10))) { - if (*num_bytes == 0) - break; - port_str.append(buf, *num_bytes); - } else { - error = Status::FromError(num_bytes.takeError()); - } - } - if (error.Success() && (port != nullptr)) { - // NB: Deliberately using .c_str() to stop at embedded '\0's - llvm::StringRef port_ref = port_str.c_str(); - uint16_t child_port = 0; - // FIXME: improve error handling - llvm::to_integer(port_ref, child_port); - if (*port == 0 || *port == child_port) { - *port = child_port; - LLDB_LOG(log, "debugserver listens on port {0}", *port); - } else { - LLDB_LOG(log, - "debugserver listening on port {0} but requested port was {1}", - child_port, (*port)); - } + assert(socket_pipe.CanRead()); + + // Read data from the pipe -- and ignore it (see comment above). 
+ while (error.Success()) { + char buf[10]; + if (llvm::Expected num_bytes = + socket_pipe.Read(buf, std::size(buf), std::chrono::seconds(10))) { + if (*num_bytes == 0) + break; } else { - LLDB_LOG(log, "failed to read a port value from pipe {0}: {1}", - named_pipe_path, error); + error = Status::FromError(num_bytes.takeError()); } - socket_pipe.Close(); } + if (error.Fail()) { + LLDB_LOG(log, "failed to synchronize on pipe {0}: {1}", named_pipe_path, + error); + } + socket_pipe.Close(); if (named_pipe_path.size() > 0) { if (Status err = socket_pipe.Delete(named_pipe_path); err.Fail()) LLDB_LOG(log, "failed to delete pipe {0}: {1}", named_pipe_path, err); } - if (error.Success() && sock_up) { - Socket *accepted_socket = nullptr; - error = sock_up->Accept(/*timeout=*/std::nullopt, accepted_socket); - if (accepted_socket) { - SetConnection(std::make_unique( - std::unique_ptr(accepted_socket))); - } - } - return error; } diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h index fc86f801f0d8a..31f8edf715a3a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h @@ -135,17 +135,15 @@ class GDBRemoteCommunication : public Communication { std::chrono::seconds GetPacketTimeout() const { return m_packet_timeout; } // Get the debugserver path and check that it exist. - FileSpec GetDebugserverPath(Platform *platform); + static FileSpec GetDebugserverPath(Platform *platform); // Start a debugserver instance on the current host using the // supplied connection URL. 
- Status StartDebugserverProcess( - const char *url, + static Status StartDebugserverProcess( + std::variant comm, Platform *platform, // If non nullptr, then check with the platform for // the GDB server binary if it can't be located - ProcessLaunchInfo &launch_info, uint16_t *port, const Args *inferior_args, - shared_fd_t pass_comm_fd); // Communication file descriptor to pass during - // fork/exec to avoid having to connect/accept + ProcessLaunchInfo &launch_info, const Args *inferior_args); void DumpHistory(Stream &strm); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index adbf06b9a19a0..7d2bd452acca9 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -406,7 +406,7 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qXfer_memory_map_read = eLazyBoolYes; else if (x == "qXfer:siginfo:read+") m_supports_qXfer_siginfo_read = eLazyBoolYes; - else if (x == "qEcho") + else if (x == "qEcho+") m_supports_qEcho = eLazyBoolYes; else if (x == "QPassSignals+") m_supports_QPassSignals = eLazyBoolYes; @@ -4358,7 +4358,9 @@ llvm::Expected GDBRemoteCommunicationClient::KillProcess(lldb::pid_t pid) { StringExtractorGDBRemote response; GDBRemoteCommunication::ScopedTimeout(*this, seconds(3)); - if (SendPacketAndWaitForResponse("k", response, GetPacketTimeout()) != + // LLDB server typically sends no response for "k", so we shouldn't try + // to sync on timeout. 
+ if (SendPacketAndWaitForResponse("k", response, GetPacketTimeout(), false) != PacketResult::Success) return llvm::createStringError(llvm::inconvertibleErrorCode(), "failed to send k packet"); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp index 89fdfa74bc025..7506cf64def38 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp @@ -94,7 +94,16 @@ GDBRemoteCommunicationServerPlatform::~GDBRemoteCommunicationServerPlatform() = Status GDBRemoteCommunicationServerPlatform::LaunchGDBServer( const lldb_private::Args &args, lldb::pid_t &pid, std::string &socket_name, shared_fd_t fd) { - std::ostringstream url; + Log *log = GetLog(LLDBLog::Platform); + + ProcessLaunchInfo debugserver_launch_info; + // Do not run in a new session so that it can not linger after the platform + // closes. + debugserver_launch_info.SetLaunchInSeparateProcessGroup(false); + debugserver_launch_info.SetMonitorProcessCallback( + [](lldb::pid_t, int, int) {}); + + Status error; if (fd == SharedSocket::kInvalidFD) { if (m_socket_protocol == Socket::ProtocolTcp) { // Just check that GDBServer exists. GDBServer must be launched after @@ -104,31 +113,22 @@ Status GDBRemoteCommunicationServerPlatform::LaunchGDBServer( return Status(); } + std::ostringstream url; // debugserver does not accept the URL scheme prefix. 
#if !defined(__APPLE__) url << Socket::FindSchemeByProtocol(m_socket_protocol) << "://"; #endif socket_name = GetDomainSocketPath("gdbserver").GetPath(); url << socket_name; + error = StartDebugserverProcess(url.str(), nullptr, debugserver_launch_info, + &args); } else { if (m_socket_protocol != Socket::ProtocolTcp) return Status::FromErrorString("protocol must be tcp"); + error = + StartDebugserverProcess(fd, nullptr, debugserver_launch_info, &args); } - // Spawn a debugserver and try to get the port it listens to. - ProcessLaunchInfo debugserver_launch_info; - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOG(log, "Launching debugserver url='{0}', fd={1}...", url.str(), fd); - - // Do not run in a new session so that it can not linger after the platform - // closes. - debugserver_launch_info.SetLaunchInSeparateProcessGroup(false); - debugserver_launch_info.SetMonitorProcessCallback( - [](lldb::pid_t, int, int) {}); - - Status error = StartDebugserverProcess( - url.str().c_str(), nullptr, debugserver_launch_info, nullptr, &args, fd); - if (error.Success()) { pid = debugserver_launch_info.GetProcessID(); AddSpawnedProcess(pid); diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 4e70fe8ac1595..3f9c4ddc60a25 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -3494,9 +3494,9 @@ Status ProcessGDBRemote::LaunchAndConnectToDebugserver( if (error.Fail()) return error; - error = m_gdb_comm.StartDebugserverProcess( - nullptr, GetTarget().GetPlatform().get(), debugserver_launch_info, - nullptr, nullptr, shared_socket.GetSendableFD()); + error = m_gdb_comm.StartDebugserverProcess(shared_socket.GetSendableFD(), + GetTarget().GetPlatform().get(), + debugserver_launch_info, nullptr); if (error.Fail()) { Log *log = GetLog(GDBRLog::Process); diff --git a/lldb/source/Plugins/Protocol/MCP/Protocol.h 
b/lldb/source/Plugins/Protocol/MCP/Protocol.h index e315899406573..cb790dc4e5596 100644 --- a/lldb/source/Plugins/Protocol/MCP/Protocol.h +++ b/lldb/source/Plugins/Protocol/MCP/Protocol.h @@ -123,6 +123,8 @@ using Message = std::variant; bool fromJSON(const llvm::json::Value &, Message &, llvm::json::Path); llvm::json::Value toJSON(const Message &); +using ToolArguments = std::variant; + } // namespace lldb_private::mcp::protocol #endif diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp index c3cd9a88c20bf..3180341b50b91 100644 --- a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp +++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp @@ -24,8 +24,7 @@ LLDB_PLUGIN_DEFINE(ProtocolServerMCP) static constexpr size_t kChunkSize = 1024; -ProtocolServerMCP::ProtocolServerMCP(Debugger &debugger) - : ProtocolServer(), m_debugger(debugger) { +ProtocolServerMCP::ProtocolServerMCP() : ProtocolServer() { AddRequestHandler("initialize", std::bind(&ProtocolServerMCP::InitializeHandler, this, std::placeholders::_1)); @@ -39,8 +38,10 @@ ProtocolServerMCP::ProtocolServerMCP(Debugger &debugger) "notifications/initialized", [](const protocol::Notification &) { LLDB_LOG(GetLog(LLDBLog::Host), "MCP initialization complete"); }); - AddTool(std::make_unique( - "lldb_command", "Run an lldb command.", m_debugger)); + AddTool( + std::make_unique("lldb_command", "Run an lldb command.")); + AddTool(std::make_unique( + "lldb_debugger_list", "List debugger instances with their debugger_id.")); } ProtocolServerMCP::~ProtocolServerMCP() { llvm::consumeError(Stop()); } @@ -54,8 +55,8 @@ void ProtocolServerMCP::Terminate() { PluginManager::UnregisterPlugin(CreateInstance); } -lldb::ProtocolServerSP ProtocolServerMCP::CreateInstance(Debugger &debugger) { - return std::make_shared(debugger); +lldb::ProtocolServerUP ProtocolServerMCP::CreateInstance() { + return std::make_unique(); } llvm::StringRef 
ProtocolServerMCP::GetPluginDescriptionStatic() { @@ -145,7 +146,7 @@ llvm::Error ProtocolServerMCP::Start(ProtocolServer::Connection connection) { std::lock_guard guard(m_server_mutex); if (m_running) - return llvm::createStringError("server already running"); + return llvm::createStringError("the MCP server is already running"); Status status; m_listener = Socket::Create(connection.protocol, status); @@ -162,10 +163,10 @@ llvm::Error ProtocolServerMCP::Start(ProtocolServer::Connection connection) { if (llvm::Error error = handles.takeError()) return error; + m_running = true; m_listen_handlers = std::move(*handles); m_loop_thread = std::thread([=] { - llvm::set_thread_name( - llvm::formatv("debugger-{0}.mcp.runloop", m_debugger.GetID())); + llvm::set_thread_name("protocol-server.mcp"); m_loop.Run(); }); @@ -175,6 +176,8 @@ llvm::Error ProtocolServerMCP::Start(ProtocolServer::Connection connection) { llvm::Error ProtocolServerMCP::Stop() { { std::lock_guard guard(m_server_mutex); + if (!m_running) + return createStringError("the MCP sever is not running"); m_running = false; } @@ -311,11 +314,12 @@ ProtocolServerMCP::ToolsCallHandler(const protocol::Request &request) { if (it == m_tools.end()) return llvm::createStringError(llvm::formatv("no tool \"{0}\"", tool_name)); - const json::Value *args = param_obj->get("arguments"); - if (!args) - return llvm::createStringError("no tool arguments"); + protocol::ToolArguments tool_args; + if (const json::Value *args = param_obj->get("arguments")) + tool_args = *args; - llvm::Expected text_result = it->second->Call(*args); + llvm::Expected text_result = + it->second->Call(tool_args); if (!text_result) return text_result.takeError(); diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h index 52bb92a04a802..d55882cc8ab09 100644 --- a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h +++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h @@ -21,7 +21,7 
@@ namespace lldb_private::mcp { class ProtocolServerMCP : public ProtocolServer { public: - ProtocolServerMCP(Debugger &debugger); + ProtocolServerMCP(); virtual ~ProtocolServerMCP() override; virtual llvm::Error Start(ProtocolServer::Connection connection) override; @@ -33,7 +33,7 @@ class ProtocolServerMCP : public ProtocolServer { static llvm::StringRef GetPluginNameStatic() { return "MCP"; } static llvm::StringRef GetPluginDescriptionStatic(); - static lldb::ProtocolServerSP CreateInstance(Debugger &debugger); + static lldb::ProtocolServerUP CreateInstance(); llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } @@ -71,8 +71,6 @@ class ProtocolServerMCP : public ProtocolServer { llvm::StringLiteral kName = "lldb-mcp"; llvm::StringLiteral kVersion = "0.1.0"; - Debugger &m_debugger; - bool m_running = false; MainLoop m_loop; diff --git a/lldb/source/Plugins/Protocol/MCP/Tool.cpp b/lldb/source/Plugins/Protocol/MCP/Tool.cpp index de8fcc8f3cb4c..5c4626cf66b32 100644 --- a/lldb/source/Plugins/Protocol/MCP/Tool.cpp +++ b/lldb/source/Plugins/Protocol/MCP/Tool.cpp @@ -7,22 +7,38 @@ //===----------------------------------------------------------------------===// #include "Tool.h" +#include "lldb/Core/Module.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandReturnObject.h" using namespace lldb_private::mcp; using namespace llvm; -struct LLDBCommandToolArguments { +namespace { +struct CommandToolArguments { + uint64_t debugger_id; std::string arguments; }; -bool fromJSON(const llvm::json::Value &V, LLDBCommandToolArguments &A, +bool fromJSON(const llvm::json::Value &V, CommandToolArguments &A, llvm::json::Path P) { llvm::json::ObjectMapper O(V, P); - return O && O.map("arguments", A.arguments); + return O && O.map("debugger_id", A.debugger_id) && + O.mapOptional("arguments", A.arguments); } +/// Helper function to create a TextResult from a string output. 
+static lldb_private::mcp::protocol::TextResult +createTextResult(std::string output, bool is_error = false) { + lldb_private::mcp::protocol::TextResult text_result; + text_result.content.emplace_back( + lldb_private::mcp::protocol::TextContent{{std::move(output)}}); + text_result.isError = is_error; + return text_result; +} + +} // namespace + Tool::Tool(std::string name, std::string description) : m_name(std::move(name)), m_description(std::move(description)) {} @@ -37,22 +53,27 @@ protocol::ToolDefinition Tool::GetDefinition() const { return definition; } -LLDBCommandTool::LLDBCommandTool(std::string name, std::string description, - Debugger &debugger) - : Tool(std::move(name), std::move(description)), m_debugger(debugger) {} - llvm::Expected -LLDBCommandTool::Call(const llvm::json::Value &args) { - llvm::json::Path::Root root; +CommandTool::Call(const protocol::ToolArguments &args) { + if (!std::holds_alternative(args)) + return createStringError("CommandTool requires arguments"); + + json::Path::Root root; - LLDBCommandToolArguments arguments; - if (!fromJSON(args, arguments, root)) + CommandToolArguments arguments; + if (!fromJSON(std::get(args), arguments, root)) return root.getError(); + lldb::DebuggerSP debugger_sp = + Debugger::GetDebuggerAtIndex(arguments.debugger_id); + if (!debugger_sp) + return createStringError( + llvm::formatv("no debugger with id {0}", arguments.debugger_id)); + // FIXME: Disallow certain commands and their aliases. 
CommandReturnObject result(/*colors=*/false); - m_debugger.GetCommandInterpreter().HandleCommand(arguments.arguments.c_str(), - eLazyBoolYes, result); + debugger_sp->GetCommandInterpreter().HandleCommand( + arguments.arguments.c_str(), eLazyBoolYes, result); std::string output; llvm::StringRef output_str = result.GetOutputString(); @@ -66,16 +87,64 @@ LLDBCommandTool::Call(const llvm::json::Value &args) { output += err_str; } - mcp::protocol::TextResult text_result; - text_result.content.emplace_back(mcp::protocol::TextContent{{output}}); - text_result.isError = !result.Succeeded(); - return text_result; + return createTextResult(output, !result.Succeeded()); } -std::optional LLDBCommandTool::GetSchema() const { +std::optional CommandTool::GetSchema() const { + llvm::json::Object id_type{{"type", "number"}}; llvm::json::Object str_type{{"type", "string"}}; - llvm::json::Object properties{{"arguments", std::move(str_type)}}; + llvm::json::Object properties{{"debugger_id", std::move(id_type)}, + {"arguments", std::move(str_type)}}; + llvm::json::Array required{"debugger_id"}; llvm::json::Object schema{{"type", "object"}, - {"properties", std::move(properties)}}; + {"properties", std::move(properties)}, + {"required", std::move(required)}}; return schema; } + +llvm::Expected +DebuggerListTool::Call(const protocol::ToolArguments &args) { + if (!std::holds_alternative(args)) + return createStringError("DebuggerListTool takes no arguments"); + + llvm::json::Path::Root root; + + // Return a nested Markdown list with debuggers and target. + // Example output: + // + // - debugger 0 + // - target 0 /path/to/foo + // - target 1 + // - debugger 1 + // - target 0 /path/to/bar + // + // FIXME: Use Structured Content when we adopt protocol version 2025-06-18. 
+ std::string output; + llvm::raw_string_ostream os(output); + + const size_t num_debuggers = Debugger::GetNumDebuggers(); + for (size_t i = 0; i < num_debuggers; ++i) { + lldb::DebuggerSP debugger_sp = Debugger::GetDebuggerAtIndex(i); + if (!debugger_sp) + continue; + + os << "- debugger " << i << '\n'; + + TargetList &target_list = debugger_sp->GetTargetList(); + const size_t num_targets = target_list.GetNumTargets(); + for (size_t j = 0; j < num_targets; ++j) { + lldb::TargetSP target_sp = target_list.GetTargetAtIndex(j); + if (!target_sp) + continue; + os << " - target " << j; + if (target_sp == target_list.GetSelectedTarget()) + os << " (selected)"; + // Append the module path if we have one. + if (Module *exe_module = target_sp->GetExecutableModulePointer()) + os << " " << exe_module->GetFileSpec().GetPath(); + os << '\n'; + } + } + + return createTextResult(output); +} diff --git a/lldb/source/Plugins/Protocol/MCP/Tool.h b/lldb/source/Plugins/Protocol/MCP/Tool.h index 57a5125813b76..74ab04b472522 100644 --- a/lldb/source/Plugins/Protocol/MCP/Tool.h +++ b/lldb/source/Plugins/Protocol/MCP/Tool.h @@ -22,10 +22,10 @@ class Tool { virtual ~Tool() = default; virtual llvm::Expected - Call(const llvm::json::Value &args) = 0; + Call(const protocol::ToolArguments &args) = 0; virtual std::optional GetSchema() const { - return std::nullopt; + return llvm::json::Object{{"type", "object"}}; } protocol::ToolDefinition GetDefinition() const; @@ -37,20 +37,26 @@ class Tool { std::string m_description; }; -class LLDBCommandTool : public mcp::Tool { +class CommandTool : public mcp::Tool { public: - LLDBCommandTool(std::string name, std::string description, - Debugger &debugger); - ~LLDBCommandTool() = default; + using mcp::Tool::Tool; + ~CommandTool() = default; virtual llvm::Expected - Call(const llvm::json::Value &args) override; + Call(const protocol::ToolArguments &args) override; virtual std::optional GetSchema() const override; +}; -private: - Debugger &m_debugger; 
+class DebuggerListTool : public mcp::Tool { +public: + using mcp::Tool::Tool; + ~DebuggerListTool() = default; + + virtual llvm::Expected + Call(const protocol::ToolArguments &args) override; }; + } // namespace lldb_private::mcp #endif diff --git a/lldb/source/Utility/XcodeSDK.cpp b/lldb/source/Utility/XcodeSDK.cpp index 004b4717e315b..eb2047e67c326 100644 --- a/lldb/source/Utility/XcodeSDK.cpp +++ b/lldb/source/Utility/XcodeSDK.cpp @@ -266,27 +266,6 @@ bool XcodeSDK::SupportsSwift() const { } } -bool XcodeSDK::SDKSupportsBuiltinModules(const llvm::Triple &target_triple, - llvm::VersionTuple sdk_version) { - using namespace llvm; - - switch (target_triple.getOS()) { - case Triple::OSType::MacOSX: - return sdk_version >= VersionTuple(15U); - case Triple::OSType::IOS: - return sdk_version >= VersionTuple(18U); - case Triple::OSType::TvOS: - return sdk_version >= VersionTuple(18U); - case Triple::OSType::WatchOS: - return sdk_version >= VersionTuple(11U); - case Triple::OSType::XROS: - return sdk_version >= VersionTuple(2U); - default: - // New SDKs support builtin modules from the start. 
- return true; - } -} - bool XcodeSDK::SDKSupportsModules(XcodeSDK::Type desired_type, const FileSpec &sdk_path) { ConstString last_path_component = sdk_path.GetFilename(); diff --git a/lldb/test/API/commands/command/script_alias/TestCommandScriptAlias.py b/lldb/test/API/commands/command/script_alias/TestCommandScriptAlias.py index 2696f703f0e1c..09886baf5406c 100644 --- a/lldb/test/API/commands/command/script_alias/TestCommandScriptAlias.py +++ b/lldb/test/API/commands/command/script_alias/TestCommandScriptAlias.py @@ -11,6 +11,7 @@ class CommandScriptAliasTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True def test_pycmd(self): + self.runCmd("log enable -f /tmp/gdb.log gdb-remote all") self.runCmd("command script import tcsacmd.py") self.runCmd("command script add -f tcsacmd.some_command_here attach") diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py index 08ac9290ee85a..12b464d3397eb 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py @@ -356,6 +356,78 @@ def A(self, packet): ["vRun;%s;61726731;61726732;61726733" % (exe_hex,)] ) + def test_launch_lengthy_vRun(self): + class MyResponder(MockGDBServerResponder): + def __init__(self, *args, **kwargs): + self.started = False + return super().__init__(*args, **kwargs) + + def qC(self): + if self.started: + return "QCp10.10" + else: + return "E42" + + def qfThreadInfo(self): + if self.started: + return "mp10.10" + else: + return "E42" + + def qsThreadInfo(self): + return "l" + + def qEcho(self, num): + resp = "qEcho:" + str(num) + if num >= 2: + # We have launched our program + self.started = True + return [resp, "T13"] + + return resp + + def qSupported(self, client_supported): + return "PacketSize=3fff;QStartNoAckMode+;qEcho+;" + + def qHostInfo(self): + return "default_packet_timeout:1;" + + def 
vRun(self, packet): + return [self.RESPONSE_NONE] + + def A(self, packet): + return "E28" + + self.server.responder = MyResponder() + + target = self.createTarget("a.yaml") + # NB: apparently GDB packets are using "/" on Windows too + exe_path = self.getBuildArtifact("a").replace(os.path.sep, "/") + exe_hex = binascii.b2a_hex(exe_path.encode()).decode() + process = self.connect(target) + lldbutil.expect_state_changes( + self, self.dbg.GetListener(), process, [lldb.eStateConnected] + ) + + process = target.Launch( + lldb.SBListener(), + ["arg1", "arg2", "arg3"], # argv + [], # envp + None, # stdin_path + None, # stdout_path + None, # stderr_path + None, # working_directory + 0, # launch_flags + True, # stop_at_entry + lldb.SBError(), + ) # error + self.assertTrue(process, PROCESS_IS_VALID) + self.assertEqual(process.GetProcessID(), 16) + + self.assertPacketLogContains( + ["vRun;%s;61726731;61726732;61726733" % (exe_hex,)] + ) + def test_launch_QEnvironment(self): class MyResponder(MockGDBServerResponder): def qC(self): diff --git a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py index a9879f67d8b8f..bfdc8229094f0 100644 --- a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py +++ b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py @@ -248,3 +248,26 @@ def test_mte_ctrl_register(self): "TCF: 0 = TCF_NONE, 1 = TCF_SYNC, 2 = TCF_ASYNC, 3 = TCF_ASYMM" ], ) + + @skipIfLLVMTargetMissing("AArch64") + def test_mte_no_tags(self): + """Test that we handle there being a tag segment but that segment does + not contain any tag data. This can happen when the core is dumped + with a restrictive limit or filter.""" + self.runCmd("target create --core core.mte.notags") + + mte_buf_addr = 0xFFFFA4AF3000 + + # We can see which memory was tagged. 
+ self.expect( + f"memory region {mte_buf_addr}", substrs=["memory tagging: enabled"] + ) + + # We cannot read those tags. + self.expect( + f"memory tag read {mte_buf_addr}", + substrs=[ + "Could not read tags from core file segment. Segment is missing some or all tag data." + ], + error=True, + ) diff --git a/lldb/test/API/linux/aarch64/mte_core_file/core.mte.notags b/lldb/test/API/linux/aarch64/mte_core_file/core.mte.notags new file mode 100644 index 0000000000000000000000000000000000000000..8f9d60668a84d99b425ab7bfe20c02b67e153542 GIT binary patch literal 32768 zcmeI5dr(x@8Nk2Yy9-Oix;|2gA>r~cqGMK2V|gaT2FvTCwdw%%dizQ-#`T)6N9VLo=ogU>>02?f2b#4(u+F zv?iTN+jH>V^Z3qp&Ub$2JLlf(A7?k_7cMp$48Vg4eg(rh0EG)8Vg(k$6qYx!xQQns z>QOQyF}i?Pqdv?elIcd_FCVtH>Zz!^mE~AWdRiMYul4r7tQSWrT+e!gdL_?Z8Qo5! zZcv1s(c^2_uc~^LsQb>O5qesI%xdinkFM9lL!IFaexNY255?IUDv3!upzwSu0 z3E|PY<2I%8PMUNdn8)&~C+P{c%wenA@G)`M)+~vbV4_|Pyey`o{CK@sn{!HXALsI^OwOZQSYlzC?t9Z% ze$zQu(3Awh+gP5DD~07hnxog}OL{5Ozm>_8V`7=*VxOu*gJ zwW4VwF)cq3{Z2x4EVv&SBkEZBrj4@y=fK%Wg3SmwAU5=_V2WU)V&Fa(7g4S&Z)9z3 z)ZR(xz2T;aI{dk;F-KWKgzT!Szjbigga{dbA0Zii8CxS{m=!?~5CjAPK|l}?1Ox#= zKoAfF1OY)n5D)|e0YN|z5CjAPK|l}?1Ox#=KoAfF1OY)n5D)|e0YN|z5CjAPK|l}? 
z1Ox#=KoAfF1OY)n5D)|e0YN|z5CjAPK|l}?1Ox#=KoAfF1OY)n5D)|e0YN|z5CjAP zK|l}?1Ox#=KoAfF1OY)n5csbourcQRe`$>K|NqPn^2Jfo`Tr8$$K%1Fbo4Bpp*vPc zVtPE5Sw%YM6p!|Ue(mhhk@3(W)9h?pel9WT(6y5q7$tV<#lT~#OM%WurLzUeR%(*Y z7mN(l96v*q>qN?V`Uy--gx(P+`P;C3682EY4lt}_3X3qdV)@@#IUfg|ZAkT-7`(%5 zoDZ>bU=_D9qex8kxsg9-;|G_kqBP(ut#o-R{hkWfs?wU8@~YC|VpiZ^>2+P!#M@9g$-W?@+zV?in!%X{!D5efQ?Xs{nC87O zXmx<}s1gUxCqZd@1<~JOhP&Hyq_dk%O4|w(%ukV}vnRU0b2cB0Z@%xD9%=_=pnka= zdg`zeZppNVw`MqIQvGeH{|;CxULI7P3M8JoAjRYH%j0&O&)wB&uRhoZneop-#v8j+ zz<8;^0M7SvcQx*R>R2DOPZy3fKw6{ZY{VDTX!A>0RA1S>B zZ8O4=NgKgf(Jg^97xho8>QtBM)<83WGsOyPHeFYpG`=^n95yJ=#9@821sdwD=Fn!e zb+bVq=Rq9L41j@yCX7umKx5g8au>u`yo_y7|J~RI@o+(01+k+3KE^WY$KkzhO?%|J z)dQP%m!h6IL2q{j@w=i9@mq}iKJeTTdI$TnY}nAX5-g!gWcWHVT*Vkxp?*7z*BI8J zJd)$P$Z@mP9wv?s;)yJY+3R2o8PUcRkS=|QW9M8+X4iO18qX6-S}2Y2JWNcGMf^t_ zx`I{+)x+|(dP%oC4weoWO-nTn+ps*vV6HIWd;s!!A8n{BTh*@sg!0jb6WB)zm`C)h z&!buE*Makx&iZ{C`z3b2Wbt`ohhy5sW9frDd8=g#B-nKP)dE2AY z^TMUg_AvQx2=zA_B+ZZPaMP&|q_|mjNZ3I>M80E}Sl@pT#${nF!)11*>Vg#r6wL#pCj!dFGMasW!-HHroUn!lmsaVCe;z1TJ zFh81#tZ8j37=yg2vnjr*FAv~oZp8X7IhognYkAfcY=3FQS|{J2{pX9~vWYssvyz-L z)|HzsogG|%EIcSd!{z%7;ZGz(>!%MJ!=Fmf`k7>$MeSd$Hiz-t{oZSm)T-vfK+uwR zT%HKdlG#clD8_~qlhV5l^LrFyuidib@k5yY+#`h-{`R^R(v_T)WC(Oy!I^q994*PP z=iu79`||f|evu24FUdQf+5MxSp<$rH6wX6l-Sc2T#`v5c+DbBvIUp$mn;gnO3K%ZA zCxf#--Ja7uPZ=;I20PQheCn!nbE^yB(iODVzB9P{d>p()HuLLkBOJZ_uAyPq7DIze zgEjZ-L^hFaI9HeQaBO>K7&W^-_c+1>6^p`GtMkK`@5>8+`mjAbxc=m!cH~TZ zG(P=Wi6t>?!acJa?LQ|?X+1A}d)CP`eZEH1J1(xIUR3?Z~3lxI;I_kwl7 zKHOJQt#C$K1jpL($~8eRK3tqwzAn{*N>Qpur7>25C~T(`d%$J9PAIkzio+IA}}3v64@{ zjC`mc+>1K*GFz@;9CYZ2CDtLQL<$wj?+2YHfBaUT#df&wP~uPe61;!!-2TE_ zeFd{$>?=4>-)FkbQj~F9QqlB7O5u{VN@A7?CRnHKED9#?FKSM1DLRpSpy=J?gGF`5 zU@_D+7kQFppT|<-v)An_%1%!6r9LowS?c21ev++0S^5K+%hDHT`iJE+P(H(-DjR%u z#pv^tNWQPsP6=yAmV9*nQk{s$RoQaWYseM?Y89ewOa(SJh8g`n>^F?P2MK zS5@`->#90UJK=ozcS-m=DgT`n{;J5|FE6qJ_L>eIxBw{d?}sFo=Re4>Ll+PIMiIT| zQ$U~cQ~aQj?g02hKU$t;p@-$a#|!jo|FJv&?i#CuB@V|ua%NtjO2)Tv+3ELJddl48 
zJ~?~7ZLY1jv@%=HwAYqbdS!Q@>P{@YYwpaMx-eUwn>BZSc6K&?64%Y?ix4PJpQVxQeVwj6_5GnJeSAD z^gT;H%FDUm@^~TZiQCBS=LUQi6ZFZ;^<%%+@dF|%aBjTNfe9KDUuPN&5>kg4jwR?Z zzso$q;sT6GpZ9-^{XYYb?~SP+%a6Xpsc+WZi2tY18jq0k@$-GegEDH9efC2?`(Y#= z%P5NsT%X4kC?tLEXS^=|f0TS)z+!HHWXx@ktojI>_}GQQ^&;zvv>+e|2m*qDARq_` S0)l`bAP5Kog24X{fqw$@NM;oP literal 0 HcmV?d00001 diff --git a/lldb/test/API/linux/aarch64/mte_core_file/main.c b/lldb/test/API/linux/aarch64/mte_core_file/main.c index 89027e0ea75d2..6537edd7bdb95 100644 --- a/lldb/test/API/linux/aarch64/mte_core_file/main.c +++ b/lldb/test/API/linux/aarch64/mte_core_file/main.c @@ -5,9 +5,14 @@ // // Compile with: // -march=armv8.5-a+memtag -g main.c -o a.out.mte +// (use a.out.mte to generate core.mte.notags and core.mte) // -march=armv8.5-a+memtag -g main.c -DNO_MTE -o a.out.nomte // -// /proc/self/coredump_filter was set to 2 when the core files were made. +// Set /proc/self/coredump_filter to the following values when generating the +// core files: +// * core.mte - 3 +// * core.mte.notags - 2 +// * core.nomte - 3 #include #include diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py index 3ba7deb285de9..35810feb48366 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py @@ -5,7 +5,7 @@ from typing import Dict, Any, List import lldbdap_testcase -from lldbsuite.test.decorators import skipIfWindows, skipIf, skipIfBuildType +from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import line_number @@ -32,6 +32,7 @@ def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.") + @skipIfAsan @skipIfWindows @skipIf(oslist=["linux"], archs=["arm$"]) # Always times out on buildbot def test_basic_functionality(self): @@ -80,6 +81,7 @@ def 
test_basic_functionality(self): ) self.continue_to_exit() + @skipIfAsan @skipIfWindows @skipIf(oslist=["linux"], archs=["arm$"]) # Always times out on buildbot def test_stopOnEntry(self): diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py index 3d07cd8b20e28..af8b6b140da47 100644 --- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py +++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py @@ -2,7 +2,7 @@ Test lldb-dap runInTerminal reverse request """ -from lldbsuite.test.decorators import skipIfBuildType, skipIfWindows, skipIf, no_match +from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import line_number import lldbdap_testcase import os @@ -26,6 +26,7 @@ def read_error_message(fifo_file): with open(fifo_file, "r") as file: return file.readline() + @skipIfAsan @skipIfWindows @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_runInTerminal(self): @@ -73,6 +74,8 @@ def test_runInTerminal(self): self.continue_to_exit() + @skipIfAsan + @skipIfWindows @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_runInTerminalWithObjectEnv(self): """ diff --git a/lldb/unittests/Host/CMakeLists.txt b/lldb/unittests/Host/CMakeLists.txt index 3b20f1d723d18..5591edda38aca 100644 --- a/lldb/unittests/Host/CMakeLists.txt +++ b/lldb/unittests/Host/CMakeLists.txt @@ -37,7 +37,9 @@ add_lldb_unittest(HostTests lldbUtilityHelpers lldbHostHelpers LLVMTestingSupport - LLVMTargetParser + + LINK_COMPONENTS + TargetParser ) add_subdirectory(common) diff --git a/lldb/unittests/Process/Utility/MemoryTagManagerAArch64MTETest.cpp b/lldb/unittests/Process/Utility/MemoryTagManagerAArch64MTETest.cpp index 40d7c3601ccfd..30199bfe5c254 100644 --- a/lldb/unittests/Process/Utility/MemoryTagManagerAArch64MTETest.cpp +++ b/lldb/unittests/Process/Utility/MemoryTagManagerAArch64MTETest.cpp @@ -87,31 +87,38 @@ 
TEST(MemoryTagManagerAArch64MTETest, UnpackTagsFromCoreFileSegment) { std::vector tags_data; MemoryTagManager::CoreReaderFn reader = [&tags_data](lldb::offset_t offset, size_t length, void *dst) { + if ((offset + length) >= tags_data.size()) + length = tags_data.size() - offset; + std::memcpy(dst, tags_data.data() + offset, length); return length; }; // Zero length is ok. - std::vector tags = + llvm::Expected> tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 0, 0); - ASSERT_EQ(tags.size(), (size_t)0); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_EQ(tags->size(), (size_t)0); // In the simplest case we read 2 tags which are in the same byte. tags_data.push_back(0x21); // The least significant bits are the first tag in memory. std::vector expected{1, 2}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 0, 32); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); // If we read just one then it will have to trim off the second one. expected = std::vector{1}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 0, 16); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); // If we read the second tag only then the first one must be trimmed. expected = std::vector{2}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 16, 16); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); // This trimming logic applies if you read a larger set of tags. tags_data = std::vector{0x21, 0x43, 0x65, 0x87}; @@ -119,31 +126,55 @@ TEST(MemoryTagManagerAArch64MTETest, UnpackTagsFromCoreFileSegment) { // Trailing tag should be trimmed. 
expected = std::vector{1, 2, 3}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 0, 48); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); // Leading tag should be trimmed. expected = std::vector{2, 3, 4}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 16, 48); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); // Leading and trailing trimmmed. expected = std::vector{2, 3, 4, 5}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 16, 64); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); // The address given is an offset into the whole file so the address requested // from the reader should be beyond that. tags_data = std::vector{0xFF, 0xFF, 0x21, 0x43, 0x65, 0x87}; expected = std::vector{1, 2}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 2, 0, 32); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); // addr is a virtual address that we expect to be >= the tag segment's // starting virtual address. So again an offset must be made from the // difference. expected = std::vector{3, 4}; tags = manager.UnpackTagsFromCoreFileSegment(reader, 32, 2, 64, 32); - ASSERT_THAT(expected, testing::ContainerEq(tags)); + ASSERT_THAT_EXPECTED(tags, llvm::Succeeded()); + ASSERT_THAT(expected, testing::ContainerEq(*tags)); + + // Error when there is not enough data to decode tags. + + // Read 1 tag from an offset just outside the segment's data. + tags_data = {0xAB}; + tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 32, 16); + const char *expected_err = "Could not read tags from core file segment. 
" + "Segment is missing some or all tag data."; + EXPECT_THAT_EXPECTED(tags, llvm::FailedWithMessage(expected_err)); + + // First 2 tags come from the segment, second 2 cannot be read. + tags_data.push_back(0xCD); + tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 32, 64); + EXPECT_THAT_EXPECTED(tags, llvm::FailedWithMessage(expected_err)); + + // Segment is completely empty. + tags_data.clear(); + tags = manager.UnpackTagsFromCoreFileSegment(reader, 0, 0, 0, 16); + EXPECT_THAT_EXPECTED(tags, llvm::FailedWithMessage(expected_err)); } TEST(MemoryTagManagerAArch64MTETest, GetLogicalTag) { diff --git a/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp b/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp index 72b8c7b1fd825..8e61379b5c731 100644 --- a/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp +++ b/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp @@ -46,9 +46,10 @@ class TestTool : public mcp::Tool { using mcp::Tool::Tool; virtual llvm::Expected - Call(const llvm::json::Value &args) override { + Call(const ToolArguments &args) override { std::string argument; - if (const json::Object *args_obj = args.getAsObject()) { + if (const json::Object *args_obj = + std::get(args).getAsObject()) { if (const json::Value *s = args_obj->get("arguments")) { argument = s->getAsString().value_or(""); } @@ -66,7 +67,7 @@ class ErrorTool : public mcp::Tool { using mcp::Tool::Tool; virtual llvm::Expected - Call(const llvm::json::Value &args) override { + Call(const ToolArguments &args) override { return llvm::createStringError("error"); } }; @@ -77,7 +78,7 @@ class FailTool : public mcp::Tool { using mcp::Tool::Tool; virtual llvm::Expected - Call(const llvm::json::Value &args) override { + Call(const ToolArguments &args) override { mcp::protocol::TextResult text_result; text_result.content.emplace_back(mcp::protocol::TextContent{{"failed"}}); text_result.isError = true; @@ -115,7 +116,7 @@ class ProtocolServerMCPTest : public ::testing::Test { ProtocolServer::Connection 
connection; connection.protocol = Socket::SocketProtocol::ProtocolTcp; connection.name = llvm::formatv("{0}:0", k_localhost).str(); - m_server_up = std::make_unique(*m_debugger_sp); + m_server_up = std::make_unique(); m_server_up->AddTool(std::make_unique("test", "test tool")); ASSERT_THAT_ERROR(m_server_up->Start(connection), llvm::Succeeded()); @@ -145,7 +146,7 @@ class ProtocolServerMCPTest : public ::testing::Test { TEST_F(ProtocolServerMCPTest, Intialization) { llvm::StringLiteral request = - R"json({"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"claude-ai","version":"0.1.0"}},"jsonrpc":"2.0","id":0})json"; + R"json({"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"lldb-unit","version":"0.1.0"}},"jsonrpc":"2.0","id":0})json"; llvm::StringLiteral response = R"json({"jsonrpc":"2.0","id":0,"result":{"capabilities":{"tools":{"listChanged":true}},"protocolVersion":"2024-11-05","serverInfo":{"name":"lldb-mcp","version":"0.1.0"}}})json"; @@ -167,7 +168,7 @@ TEST_F(ProtocolServerMCPTest, ToolsList) { llvm::StringLiteral request = R"json({"method":"tools/list","params":{},"jsonrpc":"2.0","id":1})json"; llvm::StringLiteral response = - R"json({"id":1,"jsonrpc":"2.0","result":{"tools":[{"description":"test tool","name":"test"},{"description":"Run an lldb command.","inputSchema":{"properties":{"arguments":{"type":"string"}},"type":"object"},"name":"lldb_command"}]}})json"; + R"json( {"id":1,"jsonrpc":"2.0","result":{"tools":[{"description":"test tool","inputSchema":{"type":"object"},"name":"test"},{"description":"List debugger instances with their debugger_id.","inputSchema":{"type":"object"},"name":"lldb_debugger_list"},{"description":"Run an lldb command.","inputSchema":{"properties":{"arguments":{"type":"string"},"debugger_id":{"type":"number"}},"required":["debugger_id"],"type":"object"},"name":"lldb_command"}]}})json"; ASSERT_THAT_ERROR(Write(request), 
llvm::Succeeded()); @@ -205,7 +206,7 @@ TEST_F(ProtocolServerMCPTest, ResourcesList) { TEST_F(ProtocolServerMCPTest, ToolsCall) { llvm::StringLiteral request = - R"json({"method":"tools/call","params":{"name":"test","arguments":{"arguments":"foo"}},"jsonrpc":"2.0","id":11})json"; + R"json({"method":"tools/call","params":{"name":"test","arguments":{"arguments":"foo","debugger_id":0}},"jsonrpc":"2.0","id":11})json"; llvm::StringLiteral response = R"json({"id":11,"jsonrpc":"2.0","result":{"content":[{"text":"foo","type":"text"}],"isError":false}})json"; @@ -227,7 +228,7 @@ TEST_F(ProtocolServerMCPTest, ToolsCallError) { m_server_up->AddTool(std::make_unique("error", "error tool")); llvm::StringLiteral request = - R"json({"method":"tools/call","params":{"name":"error","arguments":{"arguments":"foo"}},"jsonrpc":"2.0","id":11})json"; + R"json({"method":"tools/call","params":{"name":"error","arguments":{"arguments":"foo","debugger_id":0}},"jsonrpc":"2.0","id":11})json"; llvm::StringLiteral response = R"json({"error":{"code":-1,"message":"error"},"id":11,"jsonrpc":"2.0"})json"; @@ -249,7 +250,7 @@ TEST_F(ProtocolServerMCPTest, ToolsCallFail) { m_server_up->AddTool(std::make_unique("fail", "fail tool")); llvm::StringLiteral request = - R"json({"method":"tools/call","params":{"name":"fail","arguments":{"arguments":"foo"}},"jsonrpc":"2.0","id":11})json"; + R"json({"method":"tools/call","params":{"name":"fail","arguments":{"arguments":"foo","debugger_id":0}},"jsonrpc":"2.0","id":11})json"; llvm::StringLiteral response = R"json({"id":11,"jsonrpc":"2.0","result":{"content":[{"text":"failed","type":"text"}],"isError":true}})json"; diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst index 5e5eaccecd2b7..c9f0379694287 100644 --- a/llvm/docs/CommandGuide/llvm-objdump.rst +++ b/llvm/docs/CommandGuide/llvm-objdump.rst @@ -278,7 +278,7 @@ OPTIONS any analysis with a special representation (i.e. 
BlockFrequency, BranchProbability, etc) are printed as raw hex values. - Only works with PowerPC objects or X86 linked images. + Only supported for AArch64, BPF, PowerPC, and X86. Example: A non-symbolized branch instruction with a local target and pc-relative memory access like diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst index ca55ee573c002..dd4bb08b81d1c 100644 --- a/llvm/docs/HowToReleaseLLVM.rst +++ b/llvm/docs/HowToReleaseLLVM.rst @@ -38,8 +38,8 @@ Releases should be tagged on Tuesdays. =============================== ========================= Release Approx. Date =============================== ========================= -*release branch: even releases* *4th Tue in January* -*release branch: odd releases* *4th Tue in July* +*release branch: even releases* *2nd Tue in January* +*release branch: odd releases* *2nd Tue in July* X.1.0-rc1 3 days after branch. X.1.0-rc2 2 weeks after branch. X.1.0-rc3 4 weeks after branch @@ -49,7 +49,11 @@ X.1.0-rc3 4 weeks after branch **X.1.3** **12 weeks after branch** **X.1.4** **14 weeks after branch** **X.1.5** **16 weeks after branch** -**X.1.6 (if necessary)** **18 weeks after branch** +**X.1.6** **18 weeks after branch** +**X.1.7** **20 weeks after branch** +**X.1.8** **22 weeks after branch** +**X.1.9** (If necessary) **24 weeks after branch** +**Next release branches** **~25 weeks after branch** =============================== ========================= Release Process Summary @@ -341,10 +345,10 @@ Below are the rules regarding patching the release branch: was created. As with all phases, release managers and maintainers can reject patches that are deemed too invasive. -#. *Before RC2* Patches should be limited to bug fixes or backend specific +#. *Before RC2/RC3* Patches should be limited to bug fixes or backend specific improvements that are determined to be very safe. -#. *Before RC3/Final Major Release* Patches should be limited to critical +#. 
*Before Final Major Release* Patches should be limited to critical bugs or regressions. #. *Bug fix releases* Patches should be limited to bug fixes or very safe diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 4a1005667692d..73ae2ee599640 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -212,6 +212,7 @@ Changes to the RISC-V Backend * `-mtune=andes-45-series` was added. * Adds assembler support for the Andes `XAndesvbfhcvt` (Andes Vector BFLOAT16 Conversion extension). * `-mcpu=andes-ax45mpv` was added. +* Removed -mattr=+no-rvc-hints that could be used to disable parsing and generation of RVC hints. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst index 8a11dcf5254a9..dfc8c53edbb8e 100644 --- a/llvm/docs/SourceLevelDebugging.rst +++ b/llvm/docs/SourceLevelDebugging.rst @@ -143,6 +143,42 @@ debugging information influences optimization passes then it will be reported as a failure. See :doc:`TestingGuide` for more information on LLVM test infrastructure and how to run various tests. +.. _variables_and_variable_fragments: + +Variables and Variable Fragments +================================ + +In this document "variable" refers generally to any source language object +which can have a value, including at least: + +- Variables +- Constants +- Formal parameters + +.. note:: + + There is no special provision for "true" constants in LLVM today, and + they are instead treated as local or global variables. + +A variable is represented by a `local variable `_ +or `global variable `_ metadata node. + +A "variable fragment" (or just "fragment") is a contiguous span of bits of a +variable. + +A :ref:`debug record ` which refers to a ``DIExpression`` ending +with a ``DW_OP_LLVM_fragment`` operation describes a fragment of the variable +it refers to. 
+ +The operands of the ``DW_OP_LLVM_fragment`` operation encode the bit offset of +the fragment relative to the start of the variable, and the size of the +fragment in bits, respectively. + +.. note:: + + The ``DW_OP_LLVM_fragment`` operation acts only to encode the fragment + information, and does not have an effect on the semantics of the expression. + .. _format: Debugging information format @@ -510,10 +546,23 @@ values through compilation, when objects are promoted to SSA values a ``#dbg_value`` record is created for each assignment, recording the variable's new location. Compared with the ``#dbg_declare`` record: -* A #dbg_value terminates the effect of any preceding #dbg_values for (any - overlapping fragments of) the specified variable. -* The #dbg_value's position in the IR defines where in the instruction stream - the variable's value changes. +* A ``#dbg_value`` terminates the effects that any preceding records have on + any common bits of a common variable. + + .. note:: + + The current implementation generally terminates the effect of every + record in its entirety if any of its effects would be terminated, rather + than carrying forward the effect of previous records for non-overlapping + bits as it would be permitted to do by this definition. This is allowed + just as dropping any debug information at any point in the compilation is + allowed. + + One exception to this is :doc:`AssignmentTracking` where certain + memory-based locations are carried forward partially in some situations. + +* The ``#dbg_value``'s position in the IR defines where in the instruction + stream the variable's value changes. * Operands can be constants, indicating the variable is assigned a constant value. 
diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index 4819c88471345..ddd2c7ce68c83 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -84,18 +84,19 @@ namespace llvm { assert(begin <= end); } - /// Construct an ArrayRef from a SmallVector. This is templated in order to - /// avoid instantiating SmallVectorTemplateCommon whenever we - /// copy-construct an ArrayRef. - template - /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) - : Data(Vec.data()), Length(Vec.size()) { - } - - /// Construct an ArrayRef from a std::vector. - template - /*implicit*/ ArrayRef(const std::vector &Vec) - : Data(Vec.data()), Length(Vec.size()) {} + /// Construct an ArrayRef from a type that has a data() method that returns + /// a pointer convertible to const T *. + template < + typename C, + typename = std::enable_if_t< + std::conjunction_v< + std::is_convertible< + decltype(std::declval().data()) *, + const T *const *>, + std::is_integral().size())>>, + void>> + /*implicit*/ constexpr ArrayRef(const C &V) + : Data(V.data()), Length(V.size()) {} /// Construct an ArrayRef from a std::array template @@ -123,32 +124,6 @@ namespace llvm { #pragma GCC diagnostic pop #endif - /// Construct an ArrayRef from ArrayRef. This uses SFINAE to - /// ensure that only ArrayRefs of pointers can be converted. - template - ArrayRef(const ArrayRef &A, - std::enable_if_t::value> - * = nullptr) - : Data(A.data()), Length(A.size()) {} - - /// Construct an ArrayRef from a SmallVector. This is - /// templated in order to avoid instantiating SmallVectorTemplateCommon - /// whenever we copy-construct an ArrayRef. - template - /*implicit*/ ArrayRef( - const SmallVectorTemplateCommon &Vec, - std::enable_if_t::value> * = - nullptr) - : Data(Vec.data()), Length(Vec.size()) {} - - /// Construct an ArrayRef from std::vector. This uses SFINAE - /// to ensure that only vectors of pointers can be converted. 
- template - ArrayRef(const std::vector &Vec, - std::enable_if_t::value> - * = nullptr) - : Data(Vec.data()), Length(Vec.size()) {} - /// Construct an ArrayRef from iterator_range. This uses SFINAE /// to ensure that this is only used for iterator ranges over plain pointer /// iterators. diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h index cfc21b3ec202b..9e2dc1ad771cf 100644 --- a/llvm/include/llvm/Analysis/DXILResource.h +++ b/llvm/include/llvm/Analysis/DXILResource.h @@ -34,7 +34,7 @@ namespace dxil { // Returns the resource name from dx_resource_handlefrombinding or // dx_resource_handlefromimplicitbinding call -StringRef getResourceNameFromBindingCall(CallInst *CI); +LLVM_ABI StringRef getResourceNameFromBindingCall(CallInst *CI); /// The dx.RawBuffer target extension type /// @@ -387,7 +387,7 @@ class ResourceInfo { const ResourceBinding &getBinding() const { return Binding; } TargetExtType *getHandleTy() const { return HandleTy; } - const StringRef getName() const { return Name; } + StringRef getName() const { return Name; } bool hasSymbol() const { return Symbol; } LLVM_ABI GlobalVariable *createSymbol(Module &M, StructType *Ty); diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index de67955d85d7c..1eb4a9b8aaf9e 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -32,6 +32,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/JSON.h" #include @@ -57,9 +58,9 @@ enum class IR2VecKind { Symbolic }; namespace ir2vec { -extern cl::opt OpcWeight; -extern cl::opt TypeWeight; -extern cl::opt ArgWeight; +LLVM_ABI extern cl::opt OpcWeight; +LLVM_ABI extern cl::opt TypeWeight; +LLVM_ABI extern cl::opt ArgWeight; /// Embedding is a datatype that wraps std::vector. 
It provides /// additional functionality for arithmetic and comparison operations. @@ -106,16 +107,17 @@ struct Embedding { const std::vector &getData() const { return Data; } /// Arithmetic operators - Embedding &operator+=(const Embedding &RHS); - Embedding &operator-=(const Embedding &RHS); + LLVM_ABI Embedding &operator+=(const Embedding &RHS); + LLVM_ABI Embedding &operator-=(const Embedding &RHS); /// Adds Src Embedding scaled by Factor with the called Embedding. /// Called_Embedding += Src * Factor - Embedding &scaleAndAdd(const Embedding &Src, float Factor); + LLVM_ABI Embedding &scaleAndAdd(const Embedding &Src, float Factor); /// Returns true if the embedding is approximately equal to the RHS embedding /// within the specified tolerance. - bool approximatelyEquals(const Embedding &RHS, double Tolerance = 1e-6) const; + LLVM_ABI bool approximatelyEquals(const Embedding &RHS, + double Tolerance = 1e-6) const; }; using InstEmbeddingsMap = DenseMap; @@ -148,7 +150,7 @@ class Embedder { mutable BBEmbeddingsMap BBVecMap; mutable InstEmbeddingsMap InstVecMap; - Embedder(const Function &F, const Vocab &Vocabulary); + LLVM_ABI Embedder(const Function &F, const Vocab &Vocabulary); /// Helper function to compute embeddings. It generates embeddings for all /// the instructions and basic blocks in the function F. Logic of computing @@ -161,38 +163,38 @@ class Embedder { /// Lookup vocabulary for a given Key. If the key is not found, it returns a /// zero vector. - Embedding lookupVocab(const std::string &Key) const; + LLVM_ABI Embedding lookupVocab(const std::string &Key) const; public: virtual ~Embedder() = default; /// Factory method to create an Embedder object. - static Expected> + LLVM_ABI static Expected> create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary); /// Returns a map containing instructions and the corresponding embeddings for /// the function F if it has been computed. 
If not, it computes the embeddings /// for the function and returns the map. - const InstEmbeddingsMap &getInstVecMap() const; + LLVM_ABI const InstEmbeddingsMap &getInstVecMap() const; /// Returns a map containing basic block and the corresponding embeddings for /// the function F if it has been computed. If not, it computes the embeddings /// for the function and returns the map. - const BBEmbeddingsMap &getBBVecMap() const; + LLVM_ABI const BBEmbeddingsMap &getBBVecMap() const; /// Returns the embedding for a given basic block in the function F if it has /// been computed. If not, it computes the embedding for the basic block and /// returns it. - const Embedding &getBBVector(const BasicBlock &BB) const; + LLVM_ABI const Embedding &getBBVector(const BasicBlock &BB) const; /// Computes and returns the embedding for the current function. - const Embedding &getFunctionVector() const; + LLVM_ABI const Embedding &getFunctionVector() const; }; /// Class for computing the Symbolic embeddings of IR2Vec. /// Symbolic embeddings are constructed based on the entity-level /// representations obtained from the Vocabulary. -class SymbolicEmbedder : public Embedder { +class LLVM_ABI SymbolicEmbedder : public Embedder { private: /// Utility function to compute the embedding for a given type. 
Embedding getTypeEmbedding(const Type *Ty) const; @@ -219,13 +221,13 @@ class IR2VecVocabResult { public: IR2VecVocabResult() = default; - IR2VecVocabResult(ir2vec::Vocab &&Vocabulary); + LLVM_ABI IR2VecVocabResult(ir2vec::Vocab &&Vocabulary); bool isValid() const { return Valid; } - const ir2vec::Vocab &getVocabulary() const; - unsigned getDimension() const; - bool invalidate(Module &M, const PreservedAnalyses &PA, - ModuleAnalysisManager::Invalidator &Inv) const; + LLVM_ABI const ir2vec::Vocab &getVocabulary() const; + LLVM_ABI unsigned getDimension() const; + LLVM_ABI bool invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv) const; }; /// This analysis provides the vocabulary for IR2Vec. The vocabulary provides a @@ -237,12 +239,12 @@ class IR2VecVocabAnalysis : public AnalysisInfoMixin { void emitError(Error Err, LLVMContext &Ctx); public: - static AnalysisKey Key; + LLVM_ABI static AnalysisKey Key; IR2VecVocabAnalysis() = default; - explicit IR2VecVocabAnalysis(const ir2vec::Vocab &Vocab); - explicit IR2VecVocabAnalysis(ir2vec::Vocab &&Vocab); + LLVM_ABI explicit IR2VecVocabAnalysis(const ir2vec::Vocab &Vocab); + LLVM_ABI explicit IR2VecVocabAnalysis(ir2vec::Vocab &&Vocab); using Result = IR2VecVocabResult; - Result run(Module &M, ModuleAnalysisManager &MAM); + LLVM_ABI Result run(Module &M, ModuleAnalysisManager &MAM); }; /// This pass prints the IR2Vec embeddings for instructions, basic blocks, and @@ -253,7 +255,7 @@ class IR2VecPrinterPass : public PassInfoMixin { public: explicit IR2VecPrinterPass(raw_ostream &OS) : OS(OS) {} - PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); static bool isRequired() { return true; } }; diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 4596b2563c1d8..c804f551f5a75 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ 
b/llvm/include/llvm/Analysis/ValueTracking.h @@ -311,11 +311,11 @@ LLVM_ABI std::optional computeKnownFPSignBit(const Value *V, /// Return true if the sign bit of the FP value can be ignored by the user when /// the value is zero. -bool canIgnoreSignBitOfZero(const Use &U); +LLVM_ABI bool canIgnoreSignBitOfZero(const Use &U); /// Return true if the sign bit of the FP value can be ignored by the user when /// the value is NaN. -bool canIgnoreSignBitOfNaN(const Use &U); +LLVM_ABI bool canIgnoreSignBitOfNaN(const Use &U); /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. This is true for all i8 diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index cabecbec175b3..bf958e100f2ac 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1105,6 +1105,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { VectorType *&SubTy) const { if (Mask.empty()) return Kind; + int NumDstElts = Mask.size(); int NumSrcElts = SrcTy->getElementCount().getKnownMinValue(); switch (Kind) { case TTI::SK_PermuteSingleSrc: { @@ -1115,8 +1116,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (isSplatMask(Mask, NumSrcElts, Index)) return TTI::SK_Broadcast; if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && - (Index + Mask.size()) <= (size_t)NumSrcElts) { - SubTy = FixedVectorType::get(SrcTy->getElementType(), Mask.size()); + (Index + NumDstElts) <= NumSrcElts) { + SubTy = FixedVectorType::get(SrcTy->getElementType(), NumDstElts); return TTI::SK_ExtractSubvector; } break; @@ -1126,8 +1127,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return improveShuffleKindFromMask(TTI::SK_PermuteSingleSrc, Mask, SrcTy, Index, SubTy); int NumSubElts; - if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( - Mask, NumSrcElts, NumSubElts, Index)) 
{ + if (NumDstElts > 2 && ShuffleVectorInst::isInsertSubvectorMask( + Mask, NumSrcElts, NumSubElts, Index)) { if (Index + NumSubElts > NumSrcElts) return Kind; SubTy = FixedVectorType::get(SrcTy->getElementType(), NumSubElts); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h index 109dc8812c24d..75c051712ae43 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -609,6 +609,15 @@ class LLVM_ABI CallLowering { virtual bool isTypeIsValidForThisReturn(EVT Ty) const { return false; } }; +extern template LLVM_ABI void +CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, + const DataLayout &DL, + const Function &FuncInfo) const; + +extern template LLVM_ABI void +CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, + const DataLayout &DL, + const CallBase &FuncInfo) const; } // end namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_CALLLOWERING_H diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 4106be4c81cea..ea0873f41ebba 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -295,6 +295,11 @@ class LegalizerHelper { getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty); + LegalizeResult emitSincosLibcall(MachineInstr &MI, + MachineIRBuilder &MIRBuilder, unsigned Size, + Type *OpType, + LostDebugLocObserver &LocObserver); + public: /// Return the alignment to use for a stack temporary object with the given /// type. 
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h index 06879e1f8d15b..d8e0848aff84d 100644 --- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h +++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h @@ -28,6 +28,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/CodeGenTypes/MachineValueType.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include @@ -140,7 +141,7 @@ class LLT { explicit constexpr LLT() : IsScalar(false), IsPointer(false), IsVector(false), RawData(0) {} - explicit LLT(MVT VT); + LLVM_ABI explicit LLT(MVT VT); constexpr bool isValid() const { return IsScalar || RawData != 0; } constexpr bool isScalar() const { return IsScalar; } @@ -282,7 +283,7 @@ class LLT { return scalar(getScalarSizeInBits()); } - void print(raw_ostream &OS) const; + LLVM_ABI void print(raw_ostream &OS) const; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const; diff --git a/llvm/include/llvm/CodeGenTypes/MachineValueType.h b/llvm/include/llvm/CodeGenTypes/MachineValueType.h index c14abca027350..b8e91a022ec5e 100644 --- a/llvm/include/llvm/CodeGenTypes/MachineValueType.h +++ b/llvm/include/llvm/CodeGenTypes/MachineValueType.h @@ -17,6 +17,7 @@ #define LLVM_CODEGEN_MACHINEVALUETYPE_H #include "llvm/ADT/Sequence.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TypeSize.h" @@ -65,10 +66,10 @@ namespace llvm { bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; } /// Support for debugging, callable in GDB: VT.dump() - void dump() const; + LLVM_ABI void dump() const; /// Implement operator<<. - void print(raw_ostream &OS) const; + LLVM_ABI void print(raw_ostream &OS) const; /// Return true if this is a valid simple valuetype. bool isValid() const { @@ -509,11 +510,11 @@ namespace llvm { /// otherwise they are invalid. 
/// NB: This includes pointer types, which require a DataLayout to convert /// to a concrete value type. - static MVT getVT(Type *Ty, bool HandleUnknown = false); + LLVM_ABI static MVT getVT(Type *Ty, bool HandleUnknown = false); /// Returns an APFloat semantics tag appropriate for the value type. If this /// is a vector type, the element semantics are returned. - const fltSemantics &getFltSemantics() const; + LLVM_ABI const fltSemantics &getFltSemantics() const; public: /// SimpleValueType Iteration diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index f4569850b093c..5533652736dc8 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -4468,7 +4468,9 @@ Node *AbstractManglingParser::parseType() { return nullptr; if (!consumeIf('_')) return nullptr; - return make(Size, Signed); + // The front end expects this to be available for Substitution + Result = make(Size, Signed); + break; } // ::= Di # char32_t case 'i': diff --git a/llvm/include/llvm/ExecutionEngine/Orc/COFF.h b/llvm/include/llvm/ExecutionEngine/Orc/COFF.h index adc9e9e171165..42a6c85a577fa 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/COFF.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/COFF.h @@ -13,6 +13,7 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_COFF_H #define LLVM_EXECUTIONENGINE_ORC_COFF_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" @@ -31,8 +32,8 @@ class COFFImportFileScanner { public: COFFImportFileScanner(std::set &ImportedDynamicLibraries) : ImportedDynamicLibraries(ImportedDynamicLibraries) {} - Expected operator()(object::Archive &A, MemoryBufferRef MemberBuf, - size_t Index) const; + LLVM_ABI Expected + operator()(object::Archive &A, MemoryBufferRef MemberBuf, size_t Index) const; private: std::set &ImportedDynamicLibraries; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h 
b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h index e63f5f7842520..6fce74ddf72fe 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h @@ -23,7 +23,6 @@ #include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h" #include "llvm/ExecutionEngine/Orc/TaskDispatch.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/MSVCErrorWorkarounds.h" #include "llvm/TargetParser/Triple.h" @@ -425,47 +424,6 @@ class LLVM_ABI InProcessMemoryAccess bool IsArch64Bit; }; -/// A ExecutorProcessControl instance that asserts if any of its methods are -/// used. Suitable for use is unit tests, and by ORC clients who haven't moved -/// to ExecutorProcessControl-based APIs yet. -class UnsupportedExecutorProcessControl : public ExecutorProcessControl, - private InProcessMemoryAccess { -public: - UnsupportedExecutorProcessControl( - std::shared_ptr SSP = nullptr, - std::unique_ptr D = nullptr, const std::string &TT = "", - unsigned PageSize = 0) - : ExecutorProcessControl( - SSP ? std::move(SSP) : std::make_shared(), - D ? std::move(D) : std::make_unique()), - InProcessMemoryAccess(Triple(TT).isArch64Bit()) { - this->TargetTriple = Triple(TT); - this->PageSize = PageSize; - this->MemAccess = this; - } - - Expected runAsMain(ExecutorAddr MainFnAddr, - ArrayRef Args) override { - llvm_unreachable("Unsupported"); - } - - Expected runAsVoidFunction(ExecutorAddr VoidFnAddr) override { - llvm_unreachable("Unsupported"); - } - - Expected runAsIntFunction(ExecutorAddr IntFnAddr, int Arg) override { - llvm_unreachable("Unsupported"); - } - - void callWrapperAsync(ExecutorAddr WrapperFnAddr, - IncomingWFRHandler OnComplete, - ArrayRef ArgBuffer) override { - llvm_unreachable("Unsupported"); - } - - Error disconnect() override { return Error::success(); } -}; - /// A ExecutorProcessControl implementation targeting the current process. 
class LLVM_ABI SelfExecutorProcessControl : public ExecutorProcessControl, private InProcessMemoryAccess, diff --git a/llvm/include/llvm/Frontend/Directive/Spelling.h b/llvm/include/llvm/Frontend/Directive/Spelling.h index a101489603254..a13e26e88823d 100644 --- a/llvm/include/llvm/Frontend/Directive/Spelling.h +++ b/llvm/include/llvm/Frontend/Directive/Spelling.h @@ -10,6 +10,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Compiler.h" #include #include @@ -33,7 +34,8 @@ struct Spelling { VersionRange Versions; }; -StringRef FindName(llvm::iterator_range, unsigned Version); +LLVM_ABI StringRef FindName(llvm::iterator_range, + unsigned Version); } // namespace llvm::directive diff --git a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h index f0168c0407884..17ba28b6de443 100644 --- a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h +++ b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h @@ -63,7 +63,7 @@ enum ProfileInstrKind { }; // Default filename used for profile generation. 
-std::string getDefaultProfileGenName(); +LLVM_ABI std::string getDefaultProfileGenName(); } // end namespace llvm::driver #endif diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h index b871d3839fe03..43c8805ce06b6 100644 --- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h @@ -99,13 +99,14 @@ class ResourceRange { // Returns a reference to the first RangeInfo that overlaps with // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap - std::optional getOverlapping(const RangeInfo &Info) const; + LLVM_ABI std::optional + getOverlapping(const RangeInfo &Info) const; // Return the mapped RangeInfo at X or nullptr if no mapping exists - const RangeInfo *lookup(uint32_t X) const; + LLVM_ABI const RangeInfo *lookup(uint32_t X) const; // Removes all entries of the ResourceRange - void clear(); + LLVM_ABI void clear(); // Insert the required (sub-)intervals such that the interval of [a;b] = // [Info.LowerBound, Info.UpperBound] is covered and points to a valid @@ -133,7 +134,7 @@ class ResourceRange { // Returns a reference to the first RangeInfo that overlaps with // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap // (equivalent to getOverlapping) - std::optional insert(const RangeInfo &Info); + LLVM_ABI std::optional insert(const RangeInfo &Info); }; } // namespace rootsig diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index b37c28477fb34..f2610011a7e04 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -651,7 +651,8 @@ def OMP_EndAssumes : Directive<[Spelling<"end assumes">]> { let category = OMP_BeginAssumes.category; let languages = OMP_BeginAssumes.languages; } -def OMP_BeginDeclareTarget : Directive<[Spelling<"begin declare target">]> { +def OMP_BeginDeclareTarget : 
Directive<[Spelling<"begin declare target", 1, 52>, + Spelling<"begin declare_target", 60>]> { let allowedClauses = [ VersionedClause, VersionedClause, @@ -662,17 +663,21 @@ def OMP_BeginDeclareTarget : Directive<[Spelling<"begin declare target">]> { let category = CA_Declarative; let languages = [L_C]; } -def OMP_EndDeclareTarget : Directive<[Spelling<"end declare target">]> { +def OMP_EndDeclareTarget : Directive<[Spelling<"end declare target", 1, 52>, + Spelling<"end declare_target", 60>]> { let association = AS_Delimited; let category = OMP_BeginDeclareTarget.category; let languages = OMP_BeginDeclareTarget.languages; } -def OMP_BeginDeclareVariant : Directive<[Spelling<"begin declare variant">]> { +def OMP_BeginDeclareVariant + : Directive<[Spelling<"begin declare variant", 1, 52>, + Spelling<"begin declare_variant", 60>]> { let association = AS_Delimited; let category = CA_Declarative; let languages = [L_C]; } -def OMP_EndDeclareVariant : Directive<[Spelling<"end declare variant">]> { +def OMP_EndDeclareVariant : Directive<[Spelling<"end declare variant", 1, 52>, + Spelling<"end declare_variant", 60>]> { let association = AS_Delimited; let category = OMP_BeginDeclareVariant.category; let languages = OMP_BeginDeclareVariant.languages; @@ -685,7 +690,8 @@ def OMP_Cancel : Directive<[Spelling<"cancel">]> { let association = AS_None; let category = CA_Executable; } -def OMP_CancellationPoint : Directive<[Spelling<"cancellation point">]> { +def OMP_CancellationPoint : Directive<[Spelling<"cancellation point", 1, 52>, + Spelling<"cancellation_point", 60>]> { let allowedOnceClauses = [ VersionedClause, ]; @@ -699,21 +705,24 @@ def OMP_Critical : Directive<[Spelling<"critical">]> { let association = AS_Block; let category = CA_Executable; } -def OMP_DeclareMapper : Directive<[Spelling<"declare mapper">]> { +def OMP_DeclareMapper : Directive<[Spelling<"declare mapper", 1, 52>, + Spelling<"declare_mapper", 60>]> { let requiredClauses = [ VersionedClause, ]; let 
association = AS_None; let category = CA_Declarative; } -def OMP_DeclareReduction : Directive<[Spelling<"declare reduction">]> { +def OMP_DeclareReduction : Directive<[Spelling<"declare reduction", 1, 52>, + Spelling<"declare_reduction", 60>]> { let allowedOnceClauses = [ VersionedClause, ]; let association = AS_None; let category = CA_Declarative; } -def OMP_DeclareSimd : Directive<[Spelling<"declare simd">]> { +def OMP_DeclareSimd : Directive<[Spelling<"declare simd", 1, 52>, + Spelling<"declare_simd", 60>]> { let allowedClauses = [ VersionedClause, VersionedClause, @@ -729,7 +738,8 @@ def OMP_DeclareSimd : Directive<[Spelling<"declare simd">]> { let association = AS_Declaration; let category = CA_Declarative; } -def OMP_DeclareTarget : Directive<[Spelling<"declare target">]> { +def OMP_DeclareTarget : Directive<[Spelling<"declare target", 1, 52>, + Spelling<"declare_target", 60>]> { let allowedClauses = [ VersionedClause, VersionedClause, @@ -742,7 +752,8 @@ def OMP_DeclareTarget : Directive<[Spelling<"declare target">]> { let association = AS_None; let category = CA_Declarative; } -def OMP_DeclareVariant : Directive<[Spelling<"declare variant">]> { +def OMP_DeclareVariant : Directive<[Spelling<"declare variant", 1, 52>, + Spelling<"declare_variant", 60>]> { let allowedClauses = [ VersionedClause, ]; @@ -1101,7 +1112,8 @@ def OMP_Target : Directive<[Spelling<"target">]> { let association = AS_Block; let category = CA_Executable; } -def OMP_TargetData : Directive<[Spelling<"target data">]> { +def OMP_TargetData : Directive<[Spelling<"target data", 1, 52>, + Spelling<"target_data", 60>]> { let allowedOnceClauses = [ VersionedClause, VersionedClause, @@ -1114,7 +1126,8 @@ def OMP_TargetData : Directive<[Spelling<"target data">]> { let association = AS_Block; let category = CA_Executable; } -def OMP_TargetEnterData : Directive<[Spelling<"target enter data">]> { +def OMP_TargetEnterData : Directive<[Spelling<"target enter data", 1, 52>, + 
Spelling<"target_enter_data", 60>]> { let allowedClauses = [ VersionedClause, ]; @@ -1129,7 +1142,8 @@ def OMP_TargetEnterData : Directive<[Spelling<"target enter data">]> { let association = AS_None; let category = CA_Executable; } -def OMP_TargetExitData : Directive<[Spelling<"target exit data">]> { +def OMP_TargetExitData : Directive<[Spelling<"target exit data", 1, 52>, + Spelling<"target_exit_data", 60>]> { let allowedClauses = [ VersionedClause, ]; @@ -1144,7 +1158,8 @@ def OMP_TargetExitData : Directive<[Spelling<"target exit data">]> { let association = AS_None; let category = CA_Executable; } -def OMP_TargetUpdate : Directive<[Spelling<"target update">]> { +def OMP_TargetUpdate : Directive<[Spelling<"target update", 1, 52>, + Spelling<"target_update", 60>]> { let allowedClauses = [ VersionedClause, VersionedClause, diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 93fb0d8e8d078..19a4058b64382 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -484,7 +484,7 @@ class OpenMPIRBuilder { /// not have an effect on \p M (see initialize) OpenMPIRBuilder(Module &M) : M(M), Builder(M.getContext()), OffloadInfoManager(this), - T(M.getTargetTriple()) {} + T(M.getTargetTriple()), IsFinalized(false) {} LLVM_ABI ~OpenMPIRBuilder(); class AtomicInfo : public llvm::AtomicInfo { @@ -521,6 +521,10 @@ class OpenMPIRBuilder { /// all functions are finalized. LLVM_ABI void finalize(Function *Fn = nullptr); + /// Check whether the finalize function has already run + /// \return true if the finalize function has already run + LLVM_ABI bool isFinalized(); + /// Add attributes known for \p FnID to \p Fn. 
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn); @@ -3286,6 +3290,8 @@ class OpenMPIRBuilder { Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2, AtomicRMWInst::BinOp RMWOp); + bool IsFinalized; + public: /// a struct to pack relevant information while generating atomic Ops struct AtomicOpValue { diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index 43fca571ee6d5..99f7491b1b9b5 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -367,6 +367,22 @@ namespace llvm { uint32_t VBPtrOffset, DINode::DIFlags Flags); + /// Create debugging information entry for a member. + /// \param Scope Member scope. + /// \param Name Member name. + /// \param File File where this member is defined. + /// \param LineNo Line number. + /// \param SizeInBits Member size. + /// \param AlignInBits Member alignment. + /// \param OffsetInBits Member offset. + /// \param Flags Flags to encode member attribute, e.g. private + /// \param Ty Parent type. + /// \param Annotations Member annotations. + LLVM_ABI DIDerivedType *createMemberType( + DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNo, + Metadata *SizeInBits, uint32_t AlignInBits, Metadata *OffsetInBits, + DINode::DIFlags Flags, DIType *Ty, DINodeArray Annotations = nullptr); + /// Create debugging information entry for a member. /// \param Scope Member scope. /// \param Name Member name. @@ -419,6 +435,23 @@ namespace llvm { Constant *Discriminant, DIType *Ty); + /// Create debugging information entry for a bit field member. + /// \param Scope Member scope. + /// \param Name Member name. + /// \param File File where this member is defined. + /// \param LineNo Line number. + /// \param SizeInBits Member size. + /// \param OffsetInBits Member offset. + /// \param StorageOffsetInBits Member storage offset. + /// \param Flags Flags to encode member attribute. + /// \param Ty Parent type. + /// \param Annotations Member annotations. 
+ LLVM_ABI DIDerivedType *createBitFieldMemberType( + DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNo, + Metadata *SizeInBits, Metadata *OffsetInBits, + uint64_t StorageOffsetInBits, DINode::DIFlags Flags, DIType *Ty, + DINodeArray Annotations = nullptr); + /// Create debugging information entry for a bit field member. /// \param Scope Member scope. /// \param Name Member name. @@ -510,6 +543,29 @@ namespace llvm { unsigned RunTimeLang = 0, DIType *VTableHolder = nullptr, MDNode *TemplateParms = nullptr, StringRef UniqueIdentifier = ""); + /// Create debugging information entry for a struct. + /// \param Scope Scope in which this struct is defined. + /// \param Name Struct name. + /// \param File File where this member is defined. + /// \param LineNumber Line number. + /// \param SizeInBits Member size. + /// \param AlignInBits Member alignment. + /// \param Flags Flags to encode member attribute, e.g. private + /// \param Elements Struct elements. + /// \param RunTimeLang Optional parameter, Objective-C runtime version. + /// \param UniqueIdentifier A unique identifier for the struct. + /// \param Specification The type that this type completes. This is used by + /// Swift to represent generic types. + /// \param NumExtraInhabitants The number of extra inhabitants of the type. + /// An extra inhabitant is a bit pattern that does not represent a valid + /// value for instances of a given type. This is used by the Swift language. + LLVM_ABI DICompositeType *createStructType( + DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber, + Metadata *SizeInBits, uint32_t AlignInBits, DINode::DIFlags Flags, + DIType *DerivedFrom, DINodeArray Elements, unsigned RunTimeLang = 0, + DIType *VTableHolder = nullptr, StringRef UniqueIdentifier = "", + DIType *Specification = nullptr, uint32_t NumExtraInhabitants = 0); + /// Create debugging information entry for a struct. /// \param Scope Scope in which this struct is defined. /// \param Name Struct name. 
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 18228b7757897..f80e44ce3abbc 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -717,40 +717,33 @@ std::optional DIScope::getSource() const { class DIType : public DIScope { unsigned Line; DIFlags Flags; - uint64_t SizeInBits; - uint64_t OffsetInBits; uint32_t NumExtraInhabitants; protected: + static constexpr unsigned N_OPERANDS = 5; + DIType(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag, - unsigned Line, uint64_t SizeInBits, uint32_t AlignInBits, - uint64_t OffsetInBits, uint32_t NumExtraInhabitants, DIFlags Flags, - ArrayRef Ops) + unsigned Line, uint32_t AlignInBits, uint32_t NumExtraInhabitants, + DIFlags Flags, ArrayRef Ops) : DIScope(C, ID, Storage, Tag, Ops) { - init(Line, SizeInBits, AlignInBits, OffsetInBits, NumExtraInhabitants, - Flags); + init(Line, AlignInBits, NumExtraInhabitants, Flags); } ~DIType() = default; - void init(unsigned Line, uint64_t SizeInBits, uint32_t AlignInBits, - uint64_t OffsetInBits, uint32_t NumExtraInhabitants, + void init(unsigned Line, uint32_t AlignInBits, uint32_t NumExtraInhabitants, DIFlags Flags) { this->Line = Line; this->Flags = Flags; - this->SizeInBits = SizeInBits; this->SubclassData32 = AlignInBits; - this->OffsetInBits = OffsetInBits; this->NumExtraInhabitants = NumExtraInhabitants; } /// Change fields in place. 
- void mutate(unsigned Tag, unsigned Line, uint64_t SizeInBits, - uint32_t AlignInBits, uint64_t OffsetInBits, + void mutate(unsigned Tag, unsigned Line, uint32_t AlignInBits, uint32_t NumExtraInhabitants, DIFlags Flags) { assert(isDistinct() && "Only distinct nodes can mutate"); setTag(Tag); - init(Line, SizeInBits, AlignInBits, OffsetInBits, NumExtraInhabitants, - Flags); + init(Line, AlignInBits, NumExtraInhabitants, Flags); } public: @@ -759,10 +752,8 @@ class DIType : public DIScope { } unsigned getLine() const { return Line; } - uint64_t getSizeInBits() const { return SizeInBits; } LLVM_ABI uint32_t getAlignInBits() const; uint32_t getAlignInBytes() const { return getAlignInBits() / CHAR_BIT; } - uint64_t getOffsetInBits() const { return OffsetInBits; } uint32_t getNumExtraInhabitants() const { return NumExtraInhabitants; } DIFlags getFlags() const { return Flags; } @@ -772,6 +763,24 @@ class DIType : public DIScope { Metadata *getRawScope() const { return getOperand(1); } MDString *getRawName() const { return getOperandAs(2); } + Metadata *getRawSizeInBits() const { return getOperand(3); } + uint64_t getSizeInBits() const { + if (auto *MD = dyn_cast_or_null(getRawSizeInBits())) { + if (ConstantInt *CI = dyn_cast_or_null(MD->getValue())) + return CI->getZExtValue(); + } + return 0; + } + + Metadata *getRawOffsetInBits() const { return getOperand(4); } + uint64_t getOffsetInBits() const { + if (auto *MD = dyn_cast_or_null(getRawOffsetInBits())) { + if (ConstantInt *CI = dyn_cast_or_null(MD->getValue())) + return CI->getZExtValue(); + } + return 0; + } + /// Returns a new temporary DIType with updated Flags TempDIType cloneWithFlags(DIFlags NewFlags) const { auto NewTy = clone(); @@ -837,18 +846,18 @@ class DIBasicType : public DIType { protected: DIBasicType(LLVMContext &C, StorageType Storage, unsigned Tag, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + uint32_t AlignInBits, unsigned Encoding, uint32_t NumExtraInhabitants, DIFlags Flags, 
ArrayRef Ops) - : DIType(C, DIBasicTypeKind, Storage, Tag, 0, SizeInBits, AlignInBits, 0, + : DIType(C, DIBasicTypeKind, Storage, Tag, 0, AlignInBits, NumExtraInhabitants, Flags, Ops), Encoding(Encoding) {} DIBasicType(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + uint32_t AlignInBits, unsigned Encoding, uint32_t NumExtraInhabitants, DIFlags Flags, ArrayRef Ops) - : DIType(C, ID, Storage, Tag, 0, SizeInBits, AlignInBits, 0, - NumExtraInhabitants, Flags, Ops), + : DIType(C, ID, Storage, Tag, 0, AlignInBits, NumExtraInhabitants, Flags, + Ops), Encoding(Encoding) {} ~DIBasicType() = default; @@ -866,11 +875,21 @@ class DIBasicType : public DIType { uint32_t AlignInBits, unsigned Encoding, uint32_t NumExtraInhabitants, DIFlags Flags, StorageType Storage, - bool ShouldCreate = true); + bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + return getImpl(Context, Tag, Name, SizeInBitsNode, AlignInBits, Encoding, + NumExtraInhabitants, Flags, Storage, ShouldCreate); + } + static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag, + MDString *Name, Metadata *SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + uint32_t NumExtraInhabitants, DIFlags Flags, + StorageType Storage, bool ShouldCreate = true); TempDIBasicType cloneImpl() const { - return getTemporary(getContext(), getTag(), getName(), getSizeInBits(), - getAlignInBits(), getEncoding(), + return getTemporary(getContext(), getTag(), getRawName(), + getRawSizeInBits(), getAlignInBits(), getEncoding(), getNumExtraInhabitants(), getFlags()); } @@ -903,6 +922,12 @@ class DIBasicType : public DIType { uint32_t NumExtraInhabitants, DIFlags Flags), (Tag, Name, SizeInBits, AlignInBits, Encoding, NumExtraInhabitants, Flags)) + DEFINE_MDNODE_GET(DIBasicType, + (unsigned Tag, MDString *Name, Metadata *SizeInBits, + uint32_t AlignInBits, unsigned 
Encoding, + uint32_t NumExtraInhabitants, DIFlags Flags), + (Tag, Name, SizeInBits, AlignInBits, Encoding, + NumExtraInhabitants, Flags)) TempDIBasicType clone() const { return cloneImpl(); } @@ -934,29 +959,28 @@ class DIFixedPointType : public DIBasicType { APInt Denominator; DIFixedPointType(LLVMContext &C, StorageType Storage, unsigned Tag, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, - DIFlags Flags, unsigned Kind, int Factor, - ArrayRef Ops) - : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, SizeInBits, - AlignInBits, Encoding, 0, Flags, Ops), + uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, + unsigned Kind, int Factor, ArrayRef Ops) + : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits, + Encoding, 0, Flags, Ops), Kind(Kind), Factor(Factor) { assert(Kind == FixedPointBinary || Kind == FixedPointDecimal); } DIFixedPointType(LLVMContext &C, StorageType Storage, unsigned Tag, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, - DIFlags Flags, unsigned Kind, APInt Numerator, - APInt Denominator, ArrayRef Ops) - : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, SizeInBits, - AlignInBits, Encoding, 0, Flags, Ops), + uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, + unsigned Kind, APInt Numerator, APInt Denominator, + ArrayRef Ops) + : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits, + Encoding, 0, Flags, Ops), Kind(Kind), Factor(0), Numerator(Numerator), Denominator(Denominator) { assert(Kind == FixedPointRational); } DIFixedPointType(LLVMContext &C, StorageType Storage, unsigned Tag, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, - DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, + uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, + unsigned Kind, int Factor, APInt Numerator, APInt Denominator, ArrayRef Ops) - : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, SizeInBits, - AlignInBits, Encoding, 0, Flags, Ops), + : DIBasicType(C, 
DIFixedPointTypeKind, Storage, Tag, AlignInBits, + Encoding, 0, Flags, Ops), Kind(Kind), Factor(Factor), Numerator(Numerator), Denominator(Denominator) {} ~DIFixedPointType() = default; @@ -966,6 +990,17 @@ class DIFixedPointType : public DIBasicType { uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, APInt Denominator, StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + return getImpl(Context, Tag, getCanonicalMDString(Context, Name), + SizeInBitsNode, AlignInBits, Encoding, Flags, Kind, Factor, + Numerator, Denominator, Storage, ShouldCreate); + } + static DIFixedPointType * + getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, + Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, + DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, + APInt Denominator, StorageType Storage, bool ShouldCreate = true) { return getImpl(Context, Tag, getCanonicalMDString(Context, Name), SizeInBits, AlignInBits, Encoding, Flags, Kind, Factor, Numerator, Denominator, Storage, ShouldCreate); @@ -974,12 +1009,23 @@ class DIFixedPointType : public DIBasicType { getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, + APInt Denominator, StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + return getImpl(Context, Tag, Name, SizeInBitsNode, AlignInBits, Encoding, + Flags, Kind, Factor, Numerator, Denominator, Storage, + ShouldCreate); + } + static DIFixedPointType * + getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, + Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, + DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, APInt 
Denominator, StorageType Storage, bool ShouldCreate = true); TempDIFixedPointType cloneImpl() const { - return getTemporary(getContext(), getTag(), getName(), getSizeInBits(), - getAlignInBits(), getEncoding(), getFlags(), Kind, - Factor, Numerator, Denominator); + return getTemporary(getContext(), getTag(), getRawName(), + getRawSizeInBits(), getAlignInBits(), getEncoding(), + getFlags(), Kind, Factor, Numerator, Denominator); } public: @@ -1011,6 +1057,13 @@ class DIFixedPointType : public DIBasicType { APInt Denominator), (Tag, Name, SizeInBits, AlignInBits, Encoding, Flags, Kind, Factor, Numerator, Denominator)) + DEFINE_MDNODE_GET(DIFixedPointType, + (unsigned Tag, MDString *Name, Metadata *SizeInBits, + uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, + unsigned Kind, int Factor, APInt Numerator, + APInt Denominator), + (Tag, Name, SizeInBits, AlignInBits, Encoding, Flags, Kind, + Factor, Numerator, Denominator)) TempDIFixedPointType clone() const { return cloneImpl(); } @@ -1050,13 +1103,15 @@ class DIStringType : public DIType { friend class LLVMContextImpl; friend class MDNode; + static constexpr unsigned MY_FIRST_OPERAND = DIType::N_OPERANDS; + unsigned Encoding; DIStringType(LLVMContext &C, StorageType Storage, unsigned Tag, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + uint32_t AlignInBits, unsigned Encoding, ArrayRef Ops) - : DIType(C, DIStringTypeKind, Storage, Tag, 0, SizeInBits, AlignInBits, 0, - 0, FlagZero, Ops), + : DIType(C, DIStringTypeKind, Storage, Tag, 0, AlignInBits, 0, FlagZero, + Ops), Encoding(Encoding) {} ~DIStringType() = default; @@ -1066,20 +1121,34 @@ class DIStringType : public DIType { uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); return getImpl(Context, Tag, getCanonicalMDString(Context, Name), - StringLength, StrLenExp, 
StrLocationExp, SizeInBits, + StringLength, StrLenExp, StrLocationExp, SizeInBitsNode, AlignInBits, Encoding, Storage, ShouldCreate); } LLVM_ABI static DIStringType * getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *StringLength, Metadata *StrLenExp, Metadata *StrLocationExp, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, - StorageType Storage, bool ShouldCreate = true); + StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + return getImpl(Context, Tag, Name, StringLength, StrLenExp, StrLocationExp, + SizeInBitsNode, AlignInBits, Encoding, Storage, + ShouldCreate); + } + static DIStringType *getImpl(LLVMContext &Context, unsigned Tag, + MDString *Name, Metadata *StringLength, + Metadata *StrLenExp, Metadata *StrLocationExp, + Metadata *SizeInBits, uint32_t AlignInBits, + unsigned Encoding, StorageType Storage, + bool ShouldCreate = true); TempDIStringType cloneImpl() const { return getTemporary(getContext(), getTag(), getRawName(), getRawStringLength(), getRawStringLengthExp(), - getRawStringLocationExp(), getSizeInBits(), + getRawStringLocationExp(), getRawSizeInBits(), getAlignInBits(), getEncoding()); } @@ -1103,6 +1172,13 @@ class DIStringType : public DIType { unsigned Encoding), (Tag, Name, StringLength, StringLengthExp, StringLocationExp, SizeInBits, AlignInBits, Encoding)) + DEFINE_MDNODE_GET(DIStringType, + (unsigned Tag, MDString *Name, Metadata *StringLength, + Metadata *StringLengthExp, Metadata *StringLocationExp, + Metadata *SizeInBits, uint32_t AlignInBits, + unsigned Encoding), + (Tag, Name, StringLength, StringLengthExp, + StringLocationExp, SizeInBits, AlignInBits, Encoding)) TempDIStringType clone() const { return cloneImpl(); } @@ -1124,11 +1200,15 @@ class DIStringType : public DIType { unsigned getEncoding() const { return Encoding; } - Metadata *getRawStringLength() const { return getOperand(3); } + 
Metadata *getRawStringLength() const { return getOperand(MY_FIRST_OPERAND); } - Metadata *getRawStringLengthExp() const { return getOperand(4); } + Metadata *getRawStringLengthExp() const { + return getOperand(MY_FIRST_OPERAND + 1); + } - Metadata *getRawStringLocationExp() const { return getOperand(5); } + Metadata *getRawStringLocationExp() const { + return getOperand(MY_FIRST_OPERAND + 2); + } }; /// Derived types. @@ -1170,18 +1250,19 @@ class DIDerivedType : public DIType { friend class LLVMContextImpl; friend class MDNode; + static constexpr unsigned MY_FIRST_OPERAND = DIType::N_OPERANDS; + /// The DWARF address space of the memory pointed to or referenced by a /// pointer or reference type respectively. std::optional DWARFAddressSpace; DIDerivedType(LLVMContext &C, StorageType Storage, unsigned Tag, - unsigned Line, uint64_t SizeInBits, uint32_t AlignInBits, - uint64_t OffsetInBits, + unsigned Line, uint32_t AlignInBits, std::optional DWARFAddressSpace, std::optional PtrAuthData, DIFlags Flags, ArrayRef Ops) - : DIType(C, DIDerivedTypeKind, Storage, Tag, Line, SizeInBits, - AlignInBits, OffsetInBits, 0, Flags, Ops), + : DIType(C, DIDerivedTypeKind, Storage, Tag, Line, AlignInBits, 0, Flags, + Ops), DWARFAddressSpace(DWARFAddressSpace) { if (PtrAuthData) SubclassData32 = PtrAuthData->RawData; @@ -1195,6 +1276,40 @@ class DIDerivedType : public DIType { std::optional PtrAuthData, DIFlags Flags, Metadata *ExtraData, DINodeArray Annotations, StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + auto *OffsetInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), OffsetInBits)); + return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File, + Line, Scope, BaseType, SizeInBitsNode, AlignInBits, + OffsetInBitsNode, DWARFAddressSpace, PtrAuthData, Flags, + ExtraData, Annotations.get(), Storage, ShouldCreate); + } + static 
DIDerivedType * + getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, DIFile *File, + unsigned Line, DIScope *Scope, DIType *BaseType, uint64_t SizeInBits, + uint32_t AlignInBits, uint64_t OffsetInBits, + std::optional DWARFAddressSpace, + std::optional PtrAuthData, DIFlags Flags, + Metadata *ExtraData, DINodeArray Annotations, StorageType Storage, + bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + auto *OffsetInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), OffsetInBits)); + return getImpl(Context, Tag, Name, File, Line, Scope, BaseType, + SizeInBitsNode, AlignInBits, OffsetInBitsNode, + DWARFAddressSpace, PtrAuthData, Flags, ExtraData, + Annotations.get(), Storage, ShouldCreate); + } + static DIDerivedType * + getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, DIFile *File, + unsigned Line, DIScope *Scope, DIType *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, + std::optional DWARFAddressSpace, + std::optional PtrAuthData, DIFlags Flags, + Metadata *ExtraData, DINodeArray Annotations, StorageType Storage, + bool ShouldCreate = true) { return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File, Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData, Flags, ExtraData, @@ -1203,26 +1318,26 @@ class DIDerivedType : public DIType { LLVM_ABI static DIDerivedType * getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType, - uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits, + Metadata *SizeInBits, uint32_t AlignInBits, Metadata *OffsetInBits, std::optional DWARFAddressSpace, std::optional PtrAuthData, DIFlags Flags, Metadata *ExtraData, Metadata *Annotations, StorageType Storage, bool ShouldCreate = true); TempDIDerivedType cloneImpl() const { - return 
getTemporary(getContext(), getTag(), getName(), getFile(), getLine(), - getScope(), getBaseType(), getSizeInBits(), - getAlignInBits(), getOffsetInBits(), - getDWARFAddressSpace(), getPtrAuthData(), getFlags(), - getExtraData(), getAnnotations()); + return getTemporary( + getContext(), getTag(), getRawName(), getFile(), getLine(), getScope(), + getBaseType(), getRawSizeInBits(), getAlignInBits(), + getRawOffsetInBits(), getDWARFAddressSpace(), getPtrAuthData(), + getFlags(), getExtraData(), getRawAnnotations()); } public: DEFINE_MDNODE_GET(DIDerivedType, (unsigned Tag, MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType, - uint64_t SizeInBits, uint32_t AlignInBits, - uint64_t OffsetInBits, + Metadata *SizeInBits, uint32_t AlignInBits, + Metadata *OffsetInBits, std::optional DWARFAddressSpace, std::optional PtrAuthData, DIFlags Flags, Metadata *ExtraData = nullptr, @@ -1230,6 +1345,28 @@ class DIDerivedType : public DIType { (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData, Flags, ExtraData, Annotations)) + DEFINE_MDNODE_GET(DIDerivedType, + (unsigned Tag, StringRef Name, DIFile *File, unsigned Line, + DIScope *Scope, DIType *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, + std::optional DWARFAddressSpace, + std::optional PtrAuthData, DIFlags Flags, + Metadata *ExtraData = nullptr, + DINodeArray Annotations = nullptr), + (Tag, Name, File, Line, Scope, BaseType, SizeInBits, + AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData, + Flags, ExtraData, Annotations)) + DEFINE_MDNODE_GET(DIDerivedType, + (unsigned Tag, MDString *Name, DIFile *File, unsigned Line, + DIScope *Scope, DIType *BaseType, uint64_t SizeInBits, + uint32_t AlignInBits, uint64_t OffsetInBits, + std::optional DWARFAddressSpace, + std::optional PtrAuthData, DIFlags Flags, + Metadata *ExtraData = nullptr, + DINodeArray Annotations = nullptr), + (Tag, Name, File, Line, Scope, 
BaseType, SizeInBits, + AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData, + Flags, ExtraData, Annotations)) DEFINE_MDNODE_GET(DIDerivedType, (unsigned Tag, StringRef Name, DIFile *File, unsigned Line, DIScope *Scope, DIType *BaseType, uint64_t SizeInBits, @@ -1246,7 +1383,7 @@ class DIDerivedType : public DIType { /// Get the base type this is derived from. DIType *getBaseType() const { return cast_or_null(getRawBaseType()); } - Metadata *getRawBaseType() const { return getOperand(3); } + Metadata *getRawBaseType() const { return getOperand(MY_FIRST_OPERAND); } /// \returns The DWARF address space of the memory pointed to or referenced by /// a pointer or reference type respectively. @@ -1266,7 +1403,7 @@ class DIDerivedType : public DIType { /// TODO: Separate out types that need this extra operand: pointer-to-member /// types and member fields (static members and ivars). Metadata *getExtraData() const { return getRawExtraData(); } - Metadata *getRawExtraData() const { return getOperand(4); } + Metadata *getRawExtraData() const { return getOperand(MY_FIRST_OPERAND + 1); } /// Get the template parameters from a template alias. DITemplateParameterArray getTemplateParams() const { @@ -1277,7 +1414,9 @@ class DIDerivedType : public DIType { DINodeArray getAnnotations() const { return cast_or_null(getRawAnnotations()); } - Metadata *getRawAnnotations() const { return getOperand(5); } + Metadata *getRawAnnotations() const { + return getOperand(MY_FIRST_OPERAND + 2); + } /// Get casted version of extra data. 
/// @{ @@ -1321,9 +1460,10 @@ class DISubrangeType : public DIType { friend class LLVMContextImpl; friend class MDNode; + static constexpr unsigned MY_FIRST_OPERAND = DIType::N_OPERANDS; + DISubrangeType(LLVMContext &C, StorageType Storage, unsigned Line, - uint64_t SizeInBits, uint32_t AlignInBits, DIFlags Flags, - ArrayRef Ops); + uint32_t AlignInBits, DIFlags Flags, ArrayRef Ops); ~DISubrangeType() = default; @@ -1333,21 +1473,23 @@ class DISubrangeType : public DIType { DIFlags Flags, DIType *BaseType, Metadata *LowerBound, Metadata *UpperBound, Metadata *Stride, Metadata *Bias, StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); return getImpl(Context, getCanonicalMDString(Context, Name), File, Line, - Scope, SizeInBits, AlignInBits, Flags, BaseType, LowerBound, - UpperBound, Stride, Bias, Storage, ShouldCreate); + Scope, SizeInBitsNode, AlignInBits, Flags, BaseType, + LowerBound, UpperBound, Stride, Bias, Storage, ShouldCreate); } LLVM_ABI static DISubrangeType * getImpl(LLVMContext &Context, MDString *Name, Metadata *File, unsigned Line, - Metadata *Scope, uint64_t SizeInBits, uint32_t AlignInBits, + Metadata *Scope, Metadata *SizeInBits, uint32_t AlignInBits, DIFlags Flags, Metadata *BaseType, Metadata *LowerBound, Metadata *UpperBound, Metadata *Stride, Metadata *Bias, StorageType Storage, bool ShouldCreate = true); TempDISubrangeType cloneImpl() const { - return getTemporary(getContext(), getName(), getFile(), getLine(), - getScope(), getSizeInBits(), getAlignInBits(), + return getTemporary(getContext(), getRawName(), getFile(), getLine(), + getScope(), getRawSizeInBits(), getAlignInBits(), getFlags(), getBaseType(), getRawLowerBound(), getRawUpperBound(), getRawStride(), getRawBias()); } @@ -1357,9 +1499,10 @@ class DISubrangeType : public DIType { public: DEFINE_MDNODE_GET(DISubrangeType, (MDString * Name, Metadata *File, unsigned Line, - Metadata 
*Scope, uint64_t SizeInBits, uint32_t AlignInBits, - DIFlags Flags, Metadata *BaseType, Metadata *LowerBound, - Metadata *UpperBound, Metadata *Stride, Metadata *Bias), + Metadata *Scope, Metadata *SizeInBits, + uint32_t AlignInBits, DIFlags Flags, Metadata *BaseType, + Metadata *LowerBound, Metadata *UpperBound, + Metadata *Stride, Metadata *Bias), (Name, File, Line, Scope, SizeInBits, AlignInBits, Flags, BaseType, LowerBound, UpperBound, Stride, Bias)) DEFINE_MDNODE_GET(DISubrangeType, @@ -1374,15 +1517,23 @@ class DISubrangeType : public DIType { /// Get the base type this is derived from. DIType *getBaseType() const { return cast_or_null(getRawBaseType()); } - Metadata *getRawBaseType() const { return getOperand(3); } + Metadata *getRawBaseType() const { return getOperand(MY_FIRST_OPERAND); } - Metadata *getRawLowerBound() const { return getOperand(4).get(); } + Metadata *getRawLowerBound() const { + return getOperand(MY_FIRST_OPERAND + 1).get(); + } - Metadata *getRawUpperBound() const { return getOperand(5).get(); } + Metadata *getRawUpperBound() const { + return getOperand(MY_FIRST_OPERAND + 2).get(); + } - Metadata *getRawStride() const { return getOperand(6).get(); } + Metadata *getRawStride() const { + return getOperand(MY_FIRST_OPERAND + 3).get(); + } - Metadata *getRawBias() const { return getOperand(7).get(); } + Metadata *getRawBias() const { + return getOperand(MY_FIRST_OPERAND + 4).get(); + } BoundType getLowerBound() const { return convertRawToBound(getRawLowerBound()); @@ -1409,31 +1560,30 @@ class DICompositeType : public DIType { friend class LLVMContextImpl; friend class MDNode; + static constexpr unsigned MY_FIRST_OPERAND = DIType::N_OPERANDS; + unsigned RuntimeLang; std::optional EnumKind; DICompositeType(LLVMContext &C, StorageType Storage, unsigned Tag, - unsigned Line, unsigned RuntimeLang, uint64_t SizeInBits, - uint32_t AlignInBits, uint64_t OffsetInBits, + unsigned Line, unsigned RuntimeLang, uint32_t AlignInBits, uint32_t 
NumExtraInhabitants, std::optional EnumKind, DIFlags Flags, ArrayRef Ops) - : DIType(C, DICompositeTypeKind, Storage, Tag, Line, SizeInBits, - AlignInBits, OffsetInBits, NumExtraInhabitants, Flags, Ops), + : DIType(C, DICompositeTypeKind, Storage, Tag, Line, AlignInBits, + NumExtraInhabitants, Flags, Ops), RuntimeLang(RuntimeLang), EnumKind(EnumKind) {} ~DICompositeType() = default; /// Change fields in place. void mutate(unsigned Tag, unsigned Line, unsigned RuntimeLang, - uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits, - uint32_t NumExtraInhabitants, std::optional EnumKind, - DIFlags Flags) { + uint32_t AlignInBits, uint32_t NumExtraInhabitants, + std::optional EnumKind, DIFlags Flags) { assert(isDistinct() && "Only distinct nodes can mutate"); assert(getRawIdentifier() && "Only ODR-uniqued nodes should mutate"); this->RuntimeLang = RuntimeLang; this->EnumKind = EnumKind; - DIType::mutate(Tag, Line, SizeInBits, AlignInBits, OffsetInBits, - NumExtraInhabitants, Flags); + DIType::mutate(Tag, Line, AlignInBits, NumExtraInhabitants, Flags); } static DICompositeType * @@ -1447,6 +1597,52 @@ class DICompositeType : public DIType { Metadata *DataLocation, Metadata *Associated, Metadata *Allocated, Metadata *Rank, DINodeArray Annotations, Metadata *BitStride, StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + auto *OffsetInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), OffsetInBits)); + return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File, + Line, Scope, BaseType, SizeInBitsNode, AlignInBits, + OffsetInBitsNode, Flags, Elements.get(), RuntimeLang, + EnumKind, VTableHolder, TemplateParams.get(), + getCanonicalMDString(Context, Identifier), Discriminator, + DataLocation, Associated, Allocated, Rank, Annotations.get(), + Specification, NumExtraInhabitants, BitStride, Storage, + ShouldCreate); + } 
+ static DICompositeType * + getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File, + unsigned Line, Metadata *Scope, Metadata *BaseType, + uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits, + DIFlags Flags, Metadata *Elements, unsigned RuntimeLang, + std::optional EnumKind, Metadata *VTableHolder, + Metadata *TemplateParams, MDString *Identifier, + Metadata *Discriminator, Metadata *DataLocation, Metadata *Associated, + Metadata *Allocated, Metadata *Rank, Metadata *Annotations, + Metadata *Specification, uint32_t NumExtraInhabitants, + Metadata *BitStride, StorageType Storage, bool ShouldCreate = true) { + auto *SizeInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); + auto *OffsetInBitsNode = ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), OffsetInBits)); + return getImpl(Context, Tag, Name, File, Line, Scope, BaseType, + SizeInBitsNode, AlignInBits, OffsetInBitsNode, Flags, + Elements, RuntimeLang, EnumKind, VTableHolder, + TemplateParams, Identifier, Discriminator, DataLocation, + Associated, Allocated, Rank, Annotations, Specification, + NumExtraInhabitants, BitStride, Storage, ShouldCreate); + } + static DICompositeType * + getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, Metadata *File, + unsigned Line, DIScope *Scope, DIType *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, DIType *Specification, + uint32_t NumExtraInhabitants, DIFlags Flags, DINodeArray Elements, + unsigned RuntimeLang, std::optional EnumKind, + DIType *VTableHolder, DITemplateParameterArray TemplateParams, + StringRef Identifier, DIDerivedType *Discriminator, + Metadata *DataLocation, Metadata *Associated, Metadata *Allocated, + Metadata *Rank, DINodeArray Annotations, Metadata *BitStride, + StorageType Storage, bool ShouldCreate = true) { return getImpl( Context, Tag, getCanonicalMDString(Context, Name), File, Line, Scope, BaseType, 
SizeInBits, AlignInBits, OffsetInBits, Flags, Elements.get(), @@ -1458,7 +1654,7 @@ class DICompositeType : public DIType { LLVM_ABI static DICompositeType * getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType, - uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits, + Metadata *SizeInBits, uint32_t AlignInBits, Metadata *OffsetInBits, DIFlags Flags, Metadata *Elements, unsigned RuntimeLang, std::optional EnumKind, Metadata *VTableHolder, Metadata *TemplateParams, MDString *Identifier, @@ -1469,13 +1665,14 @@ class DICompositeType : public DIType { TempDICompositeType cloneImpl() const { return getTemporary( - getContext(), getTag(), getName(), getFile(), getLine(), getScope(), - getBaseType(), getSizeInBits(), getAlignInBits(), getOffsetInBits(), - getFlags(), getElements(), getRuntimeLang(), getEnumKind(), - getVTableHolder(), getTemplateParams(), getIdentifier(), - getDiscriminator(), getRawDataLocation(), getRawAssociated(), - getRawAllocated(), getRawRank(), getAnnotations(), getSpecification(), - getNumExtraInhabitants(), getRawBitStride()); + getContext(), getTag(), getRawName(), getFile(), getLine(), getScope(), + getBaseType(), getRawSizeInBits(), getAlignInBits(), + getRawOffsetInBits(), getFlags(), getRawElements(), getRuntimeLang(), + getEnumKind(), getVTableHolder(), getRawTemplateParams(), + getRawIdentifier(), getDiscriminator(), getRawDataLocation(), + getRawAssociated(), getRawAllocated(), getRawRank(), + getRawAnnotations(), getSpecification(), getNumExtraInhabitants(), + getRawBitStride()); } public: @@ -1515,6 +1712,42 @@ class DICompositeType : public DIType { TemplateParams, Identifier, Discriminator, DataLocation, Associated, Allocated, Rank, Annotations, Specification, NumExtraInhabitants, BitStride)) + DEFINE_MDNODE_GET( + DICompositeType, + (unsigned Tag, StringRef Name, DIFile *File, unsigned Line, + DIScope *Scope, DIType *BaseType, Metadata 
*SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, DIFlags Flags, + DINodeArray Elements, unsigned RuntimeLang, + std::optional EnumKind, DIType *VTableHolder, + DITemplateParameterArray TemplateParams = nullptr, + StringRef Identifier = "", DIDerivedType *Discriminator = nullptr, + Metadata *DataLocation = nullptr, Metadata *Associated = nullptr, + Metadata *Allocated = nullptr, Metadata *Rank = nullptr, + DINodeArray Annotations = nullptr, DIType *Specification = nullptr, + uint32_t NumExtraInhabitants = 0, Metadata *BitStride = nullptr), + (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits, + OffsetInBits, Specification, NumExtraInhabitants, Flags, Elements, + RuntimeLang, EnumKind, VTableHolder, TemplateParams, Identifier, + Discriminator, DataLocation, Associated, Allocated, Rank, Annotations, + BitStride)) + DEFINE_MDNODE_GET( + DICompositeType, + (unsigned Tag, MDString *Name, Metadata *File, unsigned Line, + Metadata *Scope, Metadata *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, DIFlags Flags, + Metadata *Elements, unsigned RuntimeLang, + std::optional EnumKind, Metadata *VTableHolder, + Metadata *TemplateParams = nullptr, MDString *Identifier = nullptr, + Metadata *Discriminator = nullptr, Metadata *DataLocation = nullptr, + Metadata *Associated = nullptr, Metadata *Allocated = nullptr, + Metadata *Rank = nullptr, Metadata *Annotations = nullptr, + Metadata *Specification = nullptr, uint32_t NumExtraInhabitants = 0, + Metadata *BitStride = nullptr), + (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits, + OffsetInBits, Flags, Elements, RuntimeLang, EnumKind, VTableHolder, + TemplateParams, Identifier, Discriminator, DataLocation, Associated, + Allocated, Rank, Annotations, Specification, NumExtraInhabitants, + BitStride)) TempDICompositeType clone() const { return cloneImpl(); } @@ -1528,8 +1761,8 @@ class DICompositeType : public DIType { LLVM_ABI static DICompositeType * 
getODRType(LLVMContext &Context, MDString &Identifier, unsigned Tag, MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, - Metadata *BaseType, uint64_t SizeInBits, uint32_t AlignInBits, - uint64_t OffsetInBits, Metadata *Specification, + Metadata *BaseType, Metadata *SizeInBits, uint32_t AlignInBits, + Metadata *OffsetInBits, Metadata *Specification, uint32_t NumExtraInhabitants, DIFlags Flags, Metadata *Elements, unsigned RuntimeLang, std::optional EnumKind, Metadata *VTableHolder, Metadata *TemplateParams, @@ -1551,8 +1784,8 @@ class DICompositeType : public DIType { LLVM_ABI static DICompositeType * buildODRType(LLVMContext &Context, MDString &Identifier, unsigned Tag, MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, - Metadata *BaseType, uint64_t SizeInBits, uint32_t AlignInBits, - uint64_t OffsetInBits, Metadata *Specification, + Metadata *BaseType, Metadata *SizeInBits, uint32_t AlignInBits, + Metadata *OffsetInBits, Metadata *Specification, uint32_t NumExtraInhabitants, DIFlags Flags, Metadata *Elements, unsigned RuntimeLang, std::optional EnumKind, Metadata *VTableHolder, Metadata *TemplateParams, @@ -1570,41 +1803,55 @@ class DICompositeType : public DIType { DITemplateParameterArray getTemplateParams() const { return cast_or_null(getRawTemplateParams()); } - StringRef getIdentifier() const { return getStringOperand(7); } + StringRef getIdentifier() const { + return getStringOperand(MY_FIRST_OPERAND + 4); + } unsigned getRuntimeLang() const { return RuntimeLang; } std::optional getEnumKind() const { return EnumKind; } - Metadata *getRawBaseType() const { return getOperand(3); } - Metadata *getRawElements() const { return getOperand(4); } - Metadata *getRawVTableHolder() const { return getOperand(5); } - Metadata *getRawTemplateParams() const { return getOperand(6); } - MDString *getRawIdentifier() const { return getOperandAs(7); } - Metadata *getRawDiscriminator() const { return getOperand(8); } + Metadata *getRawBaseType() const 
{ return getOperand(MY_FIRST_OPERAND); } + Metadata *getRawElements() const { return getOperand(MY_FIRST_OPERAND + 1); } + Metadata *getRawVTableHolder() const { + return getOperand(MY_FIRST_OPERAND + 2); + } + Metadata *getRawTemplateParams() const { + return getOperand(MY_FIRST_OPERAND + 3); + } + MDString *getRawIdentifier() const { + return getOperandAs(MY_FIRST_OPERAND + 4); + } + Metadata *getRawDiscriminator() const { + return getOperand(MY_FIRST_OPERAND + 5); + } DIDerivedType *getDiscriminator() const { - return getOperandAs(8); + return getOperandAs(MY_FIRST_OPERAND + 5); + } + Metadata *getRawDataLocation() const { + return getOperand(MY_FIRST_OPERAND + 6); } - Metadata *getRawDataLocation() const { return getOperand(9); } DIVariable *getDataLocation() const { return dyn_cast_or_null(getRawDataLocation()); } DIExpression *getDataLocationExp() const { return dyn_cast_or_null(getRawDataLocation()); } - Metadata *getRawAssociated() const { return getOperand(10); } + Metadata *getRawAssociated() const { + return getOperand(MY_FIRST_OPERAND + 7); + } DIVariable *getAssociated() const { return dyn_cast_or_null(getRawAssociated()); } DIExpression *getAssociatedExp() const { return dyn_cast_or_null(getRawAssociated()); } - Metadata *getRawAllocated() const { return getOperand(11); } + Metadata *getRawAllocated() const { return getOperand(MY_FIRST_OPERAND + 8); } DIVariable *getAllocated() const { return dyn_cast_or_null(getRawAllocated()); } DIExpression *getAllocatedExp() const { return dyn_cast_or_null(getRawAllocated()); } - Metadata *getRawRank() const { return getOperand(12); } + Metadata *getRawRank() const { return getOperand(MY_FIRST_OPERAND + 9); } ConstantInt *getRankConst() const { if (auto *MD = dyn_cast_or_null(getRawRank())) return dyn_cast_or_null(MD->getValue()); @@ -1614,17 +1861,23 @@ class DICompositeType : public DIType { return dyn_cast_or_null(getRawRank()); } - Metadata *getRawAnnotations() const { return getOperand(13); } + Metadata 
*getRawAnnotations() const { + return getOperand(MY_FIRST_OPERAND + 10); + } DINodeArray getAnnotations() const { return cast_or_null(getRawAnnotations()); } - Metadata *getRawSpecification() const { return getOperand(14); } + Metadata *getRawSpecification() const { + return getOperand(MY_FIRST_OPERAND + 11); + } DIType *getSpecification() const { return cast_or_null(getRawSpecification()); } - Metadata *getRawBitStride() const { return getOperand(15); } + Metadata *getRawBitStride() const { + return getOperand(MY_FIRST_OPERAND + 12); + } ConstantInt *getBitStrideConst() const { if (auto *MD = dyn_cast_or_null(getRawBitStride())) return dyn_cast_or_null(MD->getValue()); @@ -1643,15 +1896,15 @@ class DICompositeType : public DIType { assert(is_contained(Elements->operands(), Op) && "Lost a member during member list replacement"); #endif - replaceOperandWith(4, Elements.get()); + replaceOperandWith(MY_FIRST_OPERAND + 1, Elements.get()); } void replaceVTableHolder(DIType *VTableHolder) { - replaceOperandWith(5, VTableHolder); + replaceOperandWith(MY_FIRST_OPERAND + 2, VTableHolder); } void replaceTemplateParams(DITemplateParameterArray TemplateParams) { - replaceOperandWith(6, TemplateParams.get()); + replaceOperandWith(MY_FIRST_OPERAND + 3, TemplateParams.get()); } /// @} @@ -1667,6 +1920,8 @@ class DISubroutineType : public DIType { friend class LLVMContextImpl; friend class MDNode; + static constexpr unsigned MY_FIRST_OPERAND = DIType::N_OPERANDS; + /// The calling convention used with DW_AT_calling_convention. Actually of /// type dwarf::CallingConvention. 
uint8_t CC; @@ -1712,7 +1967,7 @@ class DISubroutineType : public DIType { return cast_or_null(getRawTypeArray()); } - Metadata *getRawTypeArray() const { return getOperand(3); } + Metadata *getRawTypeArray() const { return getOperand(MY_FIRST_OPERAND); } static bool classof(const Metadata *MD) { return MD->getMetadataID() == DISubroutineTypeKind; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 7add4a27ce9e9..bd6f94ac1286c 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1842,11 +1842,11 @@ def int_ubsantrap : Intrinsic<[], [llvm_i8_ty], // Return true if ubsan check is allowed. def int_allow_ubsan_check : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i8_ty], - [IntrInaccessibleMemOnly, IntrWriteMem, ImmArg>, NoUndef]>; + [IntrInaccessibleMemOnly, ImmArg>, NoUndef]>; // Return true if runtime check is allowed. def int_allow_runtime_check : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_metadata_ty], - [IntrInaccessibleMemOnly, IntrWriteMem, NoUndef]>, + [IntrInaccessibleMemOnly, NoUndef]>, ClangBuiltin<"__builtin_allow_runtime_check">; // Support for dynamic deoptimization (or de-specialization) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index e6f0bf6276086..6f974c97361de 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -588,6 +588,14 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; +def int_amdgcn_cvt_pk_f16_fp8 : DefaultAttrsIntrinsic< + [llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable] +>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">; + +def int_amdgcn_cvt_pk_f16_bf8 : DefaultAttrsIntrinsic< + [llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable] +>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_bf8">; + class 
AMDGPUCvtScaleF32Intrinsic : DefaultAttrsIntrinsic< [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] >, ClangBuiltin<"__builtin_amdgcn_"#name>; diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 84c26599b5b70..7dd9ff7f08b8b 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -655,6 +655,14 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". DefaultAttrsIntrinsic<[llvm_v1i128_ty],[llvm_v1i128_ty],[IntrNoMem]>; // BCD intrinsics. + def int_ppc_national2packed: ClangBuiltin<"__builtin_ppc_national2packed">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty],[llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; + def int_ppc_packed2national: ClangBuiltin<"__builtin_ppc_packed2national">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty],[llvm_v16i8_ty], [IntrNoMem]>; + def int_ppc_packed2zoned: ClangBuiltin<"__builtin_ppc_packed2zoned">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty],[llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; + def int_ppc_zoned2packed: ClangBuiltin<"__builtin_ppc_zoned2packed">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty],[llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_ppc_cdtbcdd : ClangBuiltin<"__builtin_ppc_cdtbcd">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>; def int_ppc_cbcdtdd: ClangBuiltin<"__builtin_ppc_cbcdtd">, diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 2a095be58a49e..5bd5fd1ce8d3f 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -149,6 +149,8 @@ struct RuntimeLibcallsInfo { return true; } + static bool darwinHasExp10(const Triple &TT); + /// Return true if the target has sincosf/sincos/sincosl functions static bool hasSinCos(const Triple &TT) { return TT.isGNUEnvironment() || TT.isOSFuchsia() || diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h 
index eb71f4581bd61..d41a7412a9830 100644 --- a/llvm/include/llvm/MC/MCSchedule.h +++ b/llvm/include/llvm/MC/MCSchedule.h @@ -15,6 +15,7 @@ #define LLVM_MC_MCSCHEDULE_H #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringTable.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" @@ -124,7 +125,7 @@ struct MCSchedClassDesc { static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - const char* Name; + uint32_t NameOff; #endif uint16_t NumMicroOps : 13; uint16_t BeginGroup : 1; @@ -324,6 +325,7 @@ struct MCSchedModel { const MCSchedClassDesc *SchedClassTable; unsigned NumProcResourceKinds; unsigned NumSchedClasses; + const StringTable *SchedClassNames; // Instruction itinerary tables used by InstrItineraryData. friend class InstrItineraryData; const InstrItinerary *InstrItineraries; @@ -368,6 +370,14 @@ struct MCSchedModel { return &SchedClassTable[SchedClassIdx]; } + StringRef getSchedClassName(unsigned SchedClassIdx) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + return (*SchedClassNames)[SchedClassTable[SchedClassIdx].NameOff]; +#else + return ""; +#endif + } + /// Returns the latency value for the scheduling class. 
LLVM_ABI static int computeInstrLatency(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc); diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index 2ea7829d668a4..9b8d1f3c31121 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -214,7 +214,7 @@ struct BasicSubtargetSubTypeKV { } }; -std::optional> +LLVM_ABI std::optional> getCPUDefaultTargetFeatures(StringRef CPU, ArrayRef ProcDesc, ArrayRef ProcFeatures); diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index d6fa4537ee3b4..1865be6e95dea 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -581,6 +581,9 @@ class Triple { /// Is this an Apple XROS triple. bool isXROS() const { return getOS() == Triple::XROS; } + /// Is this an Apple BridgeOS triple. + bool isBridgeOS() const { return getOS() == Triple::BridgeOS; } + /// Is this an Apple DriverKit triple. bool isDriverKit() const { return getOS() == Triple::DriverKit; } @@ -591,9 +594,11 @@ class Triple { return (getVendor() == Triple::Apple) && isOSBinFormatMachO(); } - /// Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit). + /// Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, DriverKit, XROS, or + /// bridgeOS). 
bool isOSDarwin() const { - return isMacOSX() || isiOS() || isWatchOS() || isDriverKit() || isXROS(); + return isMacOSX() || isiOS() || isWatchOS() || isDriverKit() || isXROS() || + isBridgeOS(); } bool isSimulatorEnvironment() const { diff --git a/llvm/include/llvm/Testing/Demangle/DemangleTestCases.inc b/llvm/include/llvm/Testing/Demangle/DemangleTestCases.inc index 1e3f7459deaa2..2721d2aa5504e 100644 --- a/llvm/include/llvm/Testing/Demangle/DemangleTestCases.inc +++ b/llvm/include/llvm/Testing/Demangle/DemangleTestCases.inc @@ -6,6 +6,7 @@ {"_Z1fDU10_", "f(unsigned _BitInt(10))"}, {"_Z1fIfEvDUstPT__", "void f(unsigned _BitInt(sizeof (float*)))"}, {"_Z1fIiEvDBstPT__", "void f(_BitInt(sizeof (int*)))"}, +{"_Z6myfuncRDB8_S0_", "myfunc(_BitInt(8)&, _BitInt(8)&)"}, {"_Z4testI1A1BE1Cv", "C test()"}, {"_Z4testI1A1BET0_T_S3_", "B test(A, A)"}, {"_ZN1SgtEi", "S::operator>(int)"}, diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 31611dfe4fd2f..86a2edbd8bd41 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -839,6 +839,10 @@ MemoryEffects BasicAAResult::getMemoryEffects(const CallBase *Call, FuncME |= MemoryEffects::readOnly(); if (Call->hasClobberingOperandBundles()) FuncME |= MemoryEffects::writeOnly(); + if (Call->isVolatile()) { + // Volatile operations also access inaccessible memory. + FuncME |= MemoryEffects::inaccessibleMemOnly(); + } Min &= FuncME; } diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index b58f9b26a8651..9e3c271f7d93f 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2553,6 +2553,9 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::cosh: return ConstantFoldFP(cosh, APF, Ty); case Intrinsic::atan: + // Implement optional behavior from C's Annex F for +/-0.0. 
+ if (U.isZero()) + return ConstantFP::get(Ty->getContext(), U); return ConstantFoldFP(atan, APF, Ty); case Intrinsic::sqrt: return ConstantFoldFP(sqrt, APF, Ty); @@ -2606,6 +2609,9 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, break; case LibFunc_atan: case LibFunc_atanf: + // Implement optional behavior from C's Annex F for +/-0.0. + if (U.isZero()) + return ConstantFP::get(Ty->getContext(), U); if (TLI->has(Func)) return ConstantFoldFP(atan, APF, Ty); break; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 926dc6211eb8d..792a194aeb499 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4818,6 +4818,34 @@ struct MDSignedOrMDField : MDEitherFieldImpl { } }; +struct MDUnsignedOrMDField : MDEitherFieldImpl { + MDUnsignedOrMDField(uint64_t Default = 0, bool AllowNull = true) + : ImplTy(MDUnsignedField(Default), MDField(AllowNull)) {} + + MDUnsignedOrMDField(uint64_t Default, uint64_t Max, bool AllowNull = true) + : ImplTy(MDUnsignedField(Default, Max), MDField(AllowNull)) {} + + bool isMDUnsignedField() const { return WhatIs == IsTypeA; } + bool isMDField() const { return WhatIs == IsTypeB; } + uint64_t getMDUnsignedValue() const { + assert(isMDUnsignedField() && "Wrong field type"); + return A.Val; + } + Metadata *getMDFieldValue() const { + assert(isMDField() && "Wrong field type"); + return B.Val; + } + + Metadata *getValueAsMetadata(LLVMContext &Context) const { + if (isMDUnsignedField()) + return ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), getMDUnsignedValue())); + if (isMDField()) + return getMDFieldValue(); + return nullptr; + } +}; + } // end anonymous namespace namespace llvm { @@ -5201,6 +5229,29 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, return true; } +template <> +bool LLParser::parseMDField(LocTy Loc, StringRef Name, + MDUnsignedOrMDField &Result) { + // Try to parse an unsigned int. 
+ if (Lex.getKind() == lltok::APSInt) { + MDUnsignedField Res = Result.A; + if (!parseMDField(Loc, Name, Res)) { + Result.assign(Res); + return false; + } + return true; + } + + // Otherwise, try to parse as an MDField. + MDField Res = Result.B; + if (!parseMDField(Loc, Name, Res)) { + Result.assign(Res); + return false; + } + + return true; +} + template <> bool LLParser::parseMDField(LocTy Loc, StringRef Name, MDStringField &Result) { LocTy ValueLoc = Lex.getLoc(); @@ -5382,7 +5433,7 @@ bool LLParser::parseDISubrangeType(MDNode *&Result, bool IsDistinct) { OPTIONAL(line, LineField, ); \ OPTIONAL(scope, MDField, ); \ OPTIONAL(baseType, MDField, ); \ - OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(flags, DIFlagField, ); \ OPTIONAL(lowerBound, MDSignedOrMDField, ); \ @@ -5406,10 +5457,10 @@ bool LLParser::parseDISubrangeType(MDNode *&Result, bool IsDistinct) { Metadata *Stride = convToMetadata(stride); Metadata *Bias = convToMetadata(bias); - Result = GET_OR_DISTINCT(DISubrangeType, - (Context, name.Val, file.Val, line.Val, scope.Val, - size.Val, align.Val, flags.Val, baseType.Val, - LowerBound, UpperBound, Stride, Bias)); + Result = GET_OR_DISTINCT( + DISubrangeType, (Context, name.Val, file.Val, line.Val, scope.Val, + size.getValueAsMetadata(Context), align.Val, flags.Val, + baseType.Val, LowerBound, UpperBound, Stride, Bias)); return false; } @@ -5517,7 +5568,7 @@ bool LLParser::parseDIBasicType(MDNode *&Result, bool IsDistinct) { #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ OPTIONAL(tag, DwarfTagField, (dwarf::DW_TAG_base_type)); \ OPTIONAL(name, MDStringField, ); \ - OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(encoding, DwarfAttEncodingField, ); \ OPTIONAL(num_extra_inhabitants, MDUnsignedField, (0, 
UINT32_MAX)); \ @@ -5525,7 +5576,8 @@ bool LLParser::parseDIBasicType(MDNode *&Result, bool IsDistinct) { PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIBasicType, (Context, tag.Val, name.Val, size.Val, + Result = GET_OR_DISTINCT(DIBasicType, (Context, tag.Val, name.Val, + size.getValueAsMetadata(Context), align.Val, encoding.Val, num_extra_inhabitants.Val, flags.Val)); return false; @@ -5540,7 +5592,7 @@ bool LLParser::parseDIFixedPointType(MDNode *&Result, bool IsDistinct) { #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ OPTIONAL(tag, DwarfTagField, (dwarf::DW_TAG_base_type)); \ OPTIONAL(name, MDStringField, ); \ - OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(encoding, DwarfAttEncodingField, ); \ OPTIONAL(flags, DIFlagField, ); \ @@ -5552,7 +5604,8 @@ bool LLParser::parseDIFixedPointType(MDNode *&Result, bool IsDistinct) { #undef VISIT_MD_FIELDS Result = GET_OR_DISTINCT(DIFixedPointType, - (Context, tag.Val, name.Val, size.Val, align.Val, + (Context, tag.Val, name.Val, + size.getValueAsMetadata(Context), align.Val, encoding.Val, flags.Val, kind.Val, factor.Val, numerator.Val, denominator.Val)); return false; @@ -5567,7 +5620,7 @@ bool LLParser::parseDIStringType(MDNode *&Result, bool IsDistinct) { OPTIONAL(stringLength, MDField, ); \ OPTIONAL(stringLengthExpression, MDField, ); \ OPTIONAL(stringLocationExpression, MDField, ); \ - OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(encoding, DwarfAttEncodingField, ); PARSE_MD_FIELDS(); @@ -5576,7 +5629,8 @@ bool LLParser::parseDIStringType(MDNode *&Result, bool IsDistinct) { Result = GET_OR_DISTINCT( DIStringType, (Context, tag.Val, name.Val, stringLength.Val, stringLengthExpression.Val, - stringLocationExpression.Val, size.Val, align.Val, 
encoding.Val)); + stringLocationExpression.Val, size.getValueAsMetadata(Context), + align.Val, encoding.Val)); return false; } @@ -5597,9 +5651,9 @@ bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) { OPTIONAL(line, LineField, ); \ OPTIONAL(scope, MDField, ); \ REQUIRED(baseType, MDField, ); \ - OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ - OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(offset, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(flags, DIFlagField, ); \ OPTIONAL(extraData, MDField, ); \ OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX)); \ @@ -5622,11 +5676,11 @@ bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) { (unsigned)ptrAuthExtraDiscriminator.Val, ptrAuthIsaPointer.Val, ptrAuthAuthenticatesNullValues.Val); - Result = GET_OR_DISTINCT(DIDerivedType, - (Context, tag.Val, name.Val, file.Val, line.Val, - scope.Val, baseType.Val, size.Val, align.Val, - offset.Val, DWARFAddressSpace, PtrAuthData, - flags.Val, extraData.Val, annotations.Val)); + Result = GET_OR_DISTINCT( + DIDerivedType, (Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, + baseType.Val, size.getValueAsMetadata(Context), align.Val, + offset.getValueAsMetadata(Context), DWARFAddressSpace, + PtrAuthData, flags.Val, extraData.Val, annotations.Val)); return false; } @@ -5638,9 +5692,9 @@ bool LLParser::parseDICompositeType(MDNode *&Result, bool IsDistinct) { OPTIONAL(line, LineField, ); \ OPTIONAL(scope, MDField, ); \ OPTIONAL(baseType, MDField, ); \ - OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ - OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(offset, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(flags, DIFlagField, ); \ OPTIONAL(elements, MDField, ); \ 
OPTIONAL(runtimeLang, DwarfLangField, ); \ @@ -5675,12 +5729,12 @@ bool LLParser::parseDICompositeType(MDNode *&Result, bool IsDistinct) { if (identifier.Val) if (auto *CT = DICompositeType::buildODRType( Context, *identifier.Val, tag.Val, name.Val, file.Val, line.Val, - scope.Val, baseType.Val, size.Val, align.Val, offset.Val, - specification.Val, num_extra_inhabitants.Val, flags.Val, - elements.Val, runtimeLang.Val, EnumKind, vtableHolder.Val, - templateParams.Val, discriminator.Val, dataLocation.Val, - associated.Val, allocated.Val, Rank, annotations.Val, - bitStride.Val)) { + scope.Val, baseType.Val, size.getValueAsMetadata(Context), + align.Val, offset.getValueAsMetadata(Context), specification.Val, + num_extra_inhabitants.Val, flags.Val, elements.Val, runtimeLang.Val, + EnumKind, vtableHolder.Val, templateParams.Val, discriminator.Val, + dataLocation.Val, associated.Val, allocated.Val, Rank, + annotations.Val, bitStride.Val)) { Result = CT; return false; } @@ -5690,7 +5744,8 @@ bool LLParser::parseDICompositeType(MDNode *&Result, bool IsDistinct) { Result = GET_OR_DISTINCT( DICompositeType, (Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, baseType.Val, - size.Val, align.Val, offset.Val, flags.Val, elements.Val, + size.getValueAsMetadata(Context), align.Val, + offset.getValueAsMetadata(Context), flags.Val, elements.Val, runtimeLang.Val, EnumKind, vtableHolder.Val, templateParams.Val, identifier.Val, discriminator.Val, dataLocation.Val, associated.Val, allocated.Val, Rank, annotations.Val, specification.Val, diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 1cd1797c1092d..a9467d16c9a14 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1287,6 +1287,14 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( return MetadataList.upgradeTypeRef(getMDOrNull(ID)); }; + auto getMetadataOrConstant = [&](bool IsMetadata, + uint64_t Entry) 
-> Metadata * { + if (IsMetadata) + return getMDOrNull(Entry); + return ConstantAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Context), Entry)); + }; + #define GET_OR_DISTINCT(CLASS, ARGS) \ (IsDistinct ? CLASS::getDistinct ARGS : CLASS::get ARGS) @@ -1525,15 +1533,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() < 6 || Record.size() > 8) return error("Invalid record"); - IsDistinct = Record[0]; + IsDistinct = Record[0] & 1; + bool SizeIsMetadata = Record[0] & 2; DINode::DIFlags Flags = (Record.size() > 6) ? static_cast(Record[6]) : DINode::FlagZero; uint32_t NumExtraInhabitants = (Record.size() > 7) ? Record[7] : 0; + Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[3]); + MetadataList.assignValue( GET_OR_DISTINCT(DIBasicType, - (Context, Record[1], getMDString(Record[2]), Record[3], + (Context, Record[1], getMDString(Record[2]), SizeInBits, Record[4], Record[5], NumExtraInhabitants, Flags)), NextMetadataNo); NextMetadataNo++; @@ -1543,9 +1554,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() < 11) return error("Invalid record"); - IsDistinct = Record[0]; + IsDistinct = Record[0] & 1; + bool SizeIsMetadata = Record[0] & 2; DINode::DIFlags Flags = static_cast(Record[6]); + Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[3]); + size_t Offset = 9; auto ReadWideInt = [&]() { @@ -1565,7 +1579,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( MetadataList.assignValue( GET_OR_DISTINCT(DIFixedPointType, - (Context, Record[1], getMDString(Record[2]), Record[3], + (Context, Record[1], getMDString(Record[2]), SizeInBits, Record[4], Record[5], Flags, Record[7], Record[8], Numerator, Denominator)), NextMetadataNo); @@ -1576,17 +1590,21 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() > 9 || Record.size() < 8) return error("Invalid record"); - IsDistinct = Record[0]; + IsDistinct = Record[0] & 1; + bool SizeIsMetadata 
= Record[0] & 2; bool SizeIs8 = Record.size() == 8; // StringLocationExp (i.e. Record[5]) is added at a later time // than the other fields. The code here enables backward compatibility. Metadata *StringLocationExp = SizeIs8 ? nullptr : getMDOrNull(Record[5]); unsigned Offset = SizeIs8 ? 5 : 6; + Metadata *SizeInBits = + getMetadataOrConstant(SizeIsMetadata, Record[Offset]); + MetadataList.assignValue( GET_OR_DISTINCT(DIStringType, (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), getMDOrNull(Record[4]), - StringLocationExp, Record[Offset], Record[Offset + 1], + StringLocationExp, SizeInBits, Record[Offset + 1], Record[Offset + 2])), NextMetadataNo); NextMetadataNo++; @@ -1615,15 +1633,20 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( PtrAuthData.emplace(Record[14]); } - IsDistinct = Record[0]; + IsDistinct = Record[0] & 1; + bool SizeIsMetadata = Record[0] & 2; DINode::DIFlags Flags = static_cast(Record[10]); + + Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[7]); + Metadata *OffsetInBits = getMetadataOrConstant(SizeIsMetadata, Record[9]); + MetadataList.assignValue( GET_OR_DISTINCT(DIDerivedType, (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), Record[4], getDITypeRefOrNull(Record[5]), - getDITypeRefOrNull(Record[6]), Record[7], Record[8], - Record[9], DWARFAddressSpace, PtrAuthData, Flags, + getDITypeRefOrNull(Record[6]), SizeInBits, Record[8], + OffsetInBits, DWARFAddressSpace, PtrAuthData, Flags, getDITypeRefOrNull(Record[11]), Annotations)), NextMetadataNo); NextMetadataNo++; @@ -1633,13 +1656,17 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() != 13) return error("Invalid record"); - IsDistinct = Record[0]; + IsDistinct = Record[0] & 1; + bool SizeIsMetadata = Record[0] & 2; DINode::DIFlags Flags = static_cast(Record[7]); + + Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[5]); + MetadataList.assignValue( 
GET_OR_DISTINCT(DISubrangeType, (Context, getMDString(Record[1]), getMDOrNull(Record[2]), Record[3], - getMDOrNull(Record[4]), Record[5], Record[6], Flags, + getMDOrNull(Record[4]), SizeInBits, Record[6], Flags, getDITypeRefOrNull(Record[8]), getMDOrNull(Record[9]), getMDOrNull(Record[10]), getMDOrNull(Record[11]), getMDOrNull(Record[12]))), @@ -1654,18 +1681,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( // If we have a UUID and this is not a forward declaration, lookup the // mapping. IsDistinct = Record[0] & 0x1; - bool IsNotUsedInTypeRef = Record[0] >= 2; + bool IsNotUsedInTypeRef = Record[0] & 2; + bool SizeIsMetadata = Record[0] & 4; unsigned Tag = Record[1]; MDString *Name = getMDString(Record[2]); Metadata *File = getMDOrNull(Record[3]); unsigned Line = Record[4]; Metadata *Scope = getDITypeRefOrNull(Record[5]); Metadata *BaseType = nullptr; - uint64_t SizeInBits = Record[7]; if (Record[8] > (uint64_t)std::numeric_limits::max()) return error("Alignment value is too large"); uint32_t AlignInBits = Record[8]; - uint64_t OffsetInBits = 0; + Metadata *OffsetInBits = nullptr; uint32_t NumExtraInhabitants = (Record.size() > 22) ? 
Record[22] : 0; DINode::DIFlags Flags = static_cast(Record[10]); Metadata *Elements = nullptr; @@ -1712,7 +1739,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( TemplateParams = getMDOrNull(Record[14]); } else { BaseType = getDITypeRefOrNull(Record[6]); - OffsetInBits = Record[9]; + + OffsetInBits = getMetadataOrConstant(SizeIsMetadata, Record[9]); + Elements = getMDOrNull(Record[11]); VTableHolder = getDITypeRefOrNull(Record[13]); TemplateParams = getMDOrNull(Record[14]); @@ -1740,6 +1769,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() > 24 && Record[24] != dwarf::DW_APPLE_ENUM_KIND_invalid) EnumKind = Record[24]; + Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[7]); + DICompositeType *CT = nullptr; if (Identifier) CT = DICompositeType::buildODRType( diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 628b939af19ce..2a2dd085a9461 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1899,10 +1899,11 @@ void ModuleBitcodeWriter::writeDIEnumerator(const DIEnumerator *N, void ModuleBitcodeWriter::writeDIBasicType(const DIBasicType *N, SmallVectorImpl &Record, unsigned Abbrev) { - Record.push_back(N->isDistinct()); + const unsigned SizeIsMetadata = 0x2; + Record.push_back(SizeIsMetadata | (unsigned)N->isDistinct()); Record.push_back(N->getTag()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); - Record.push_back(N->getSizeInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getRawSizeInBits())); Record.push_back(N->getAlignInBits()); Record.push_back(N->getEncoding()); Record.push_back(N->getFlags()); @@ -1915,10 +1916,11 @@ void ModuleBitcodeWriter::writeDIBasicType(const DIBasicType *N, void ModuleBitcodeWriter::writeDIFixedPointType( const DIFixedPointType *N, SmallVectorImpl &Record, unsigned Abbrev) { - Record.push_back(N->isDistinct()); + const unsigned SizeIsMetadata = 
0x2; + Record.push_back(SizeIsMetadata | (unsigned)N->isDistinct()); Record.push_back(N->getTag()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); - Record.push_back(N->getSizeInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getRawSizeInBits())); Record.push_back(N->getAlignInBits()); Record.push_back(N->getEncoding()); Record.push_back(N->getFlags()); @@ -1944,13 +1946,14 @@ void ModuleBitcodeWriter::writeDIFixedPointType( void ModuleBitcodeWriter::writeDIStringType(const DIStringType *N, SmallVectorImpl &Record, unsigned Abbrev) { - Record.push_back(N->isDistinct()); + const unsigned SizeIsMetadata = 0x2; + Record.push_back(SizeIsMetadata | (unsigned)N->isDistinct()); Record.push_back(N->getTag()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getStringLength())); Record.push_back(VE.getMetadataOrNullID(N->getStringLengthExp())); Record.push_back(VE.getMetadataOrNullID(N->getStringLocationExp())); - Record.push_back(N->getSizeInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getRawSizeInBits())); Record.push_back(N->getAlignInBits()); Record.push_back(N->getEncoding()); @@ -1961,16 +1964,17 @@ void ModuleBitcodeWriter::writeDIStringType(const DIStringType *N, void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N, SmallVectorImpl &Record, unsigned Abbrev) { - Record.push_back(N->isDistinct()); + const unsigned SizeIsMetadata = 0x2; + Record.push_back(SizeIsMetadata | (unsigned)N->isDistinct()); Record.push_back(N->getTag()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(N->getLine()); Record.push_back(VE.getMetadataOrNullID(N->getScope())); Record.push_back(VE.getMetadataOrNullID(N->getBaseType())); - Record.push_back(N->getSizeInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getRawSizeInBits())); Record.push_back(N->getAlignInBits()); - Record.push_back(N->getOffsetInBits()); + 
Record.push_back(VE.getMetadataOrNullID(N->getRawOffsetInBits())); Record.push_back(N->getFlags()); Record.push_back(VE.getMetadataOrNullID(N->getExtraData())); @@ -1995,12 +1999,13 @@ void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N, void ModuleBitcodeWriter::writeDISubrangeType(const DISubrangeType *N, SmallVectorImpl &Record, unsigned Abbrev) { - Record.push_back(N->isDistinct()); + const unsigned SizeIsMetadata = 0x2; + Record.push_back(SizeIsMetadata | (unsigned)N->isDistinct()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(N->getLine()); Record.push_back(VE.getMetadataOrNullID(N->getScope())); - Record.push_back(N->getSizeInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getRawSizeInBits())); Record.push_back(N->getAlignInBits()); Record.push_back(N->getFlags()); Record.push_back(VE.getMetadataOrNullID(N->getBaseType())); @@ -2017,16 +2022,18 @@ void ModuleBitcodeWriter::writeDICompositeType( const DICompositeType *N, SmallVectorImpl &Record, unsigned Abbrev) { const unsigned IsNotUsedInOldTypeRef = 0x2; - Record.push_back(IsNotUsedInOldTypeRef | (unsigned)N->isDistinct()); + const unsigned SizeIsMetadata = 0x4; + Record.push_back(SizeIsMetadata | IsNotUsedInOldTypeRef | + (unsigned)N->isDistinct()); Record.push_back(N->getTag()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(N->getLine()); Record.push_back(VE.getMetadataOrNullID(N->getScope())); Record.push_back(VE.getMetadataOrNullID(N->getBaseType())); - Record.push_back(N->getSizeInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getRawSizeInBits())); Record.push_back(N->getAlignInBits()); - Record.push_back(N->getOffsetInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getRawOffsetInBits())); Record.push_back(N->getFlags()); Record.push_back(VE.getMetadataOrNullID(N->getElements().get())); 
Record.push_back(N->getRuntimeLang()); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 3b96225236cd6..754dba73673c2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2142,16 +2142,20 @@ void AsmPrinter::emitFunctionBody() { } /// Compute the number of Global Variables that uses a Constant. -static unsigned getNumGlobalVariableUses(const Constant *C) { - if (!C) +static unsigned getNumGlobalVariableUses(const Constant *C, + bool &HasNonGlobalUsers) { + if (!C) { + HasNonGlobalUsers = true; return 0; + } if (isa(C)) return 1; unsigned NumUses = 0; for (const auto *CU : C->users()) - NumUses += getNumGlobalVariableUses(dyn_cast(CU)); + NumUses += + getNumGlobalVariableUses(dyn_cast(CU), HasNonGlobalUsers); return NumUses; } @@ -2162,7 +2166,8 @@ static unsigned getNumGlobalVariableUses(const Constant *C) { /// candidates are skipped and are emitted later in case at least one cstexpr /// isn't replaced by a PC relative GOT entry access. static bool isGOTEquivalentCandidate(const GlobalVariable *GV, - unsigned &NumGOTEquivUsers) { + unsigned &NumGOTEquivUsers, + bool &HasNonGlobalUsers) { // Global GOT equivalents are unnamed private globals with a constant // pointer initializer to another global symbol. They must point to a // GlobalVariable or Function, i.e., as GlobalValue. @@ -2174,7 +2179,8 @@ static bool isGOTEquivalentCandidate(const GlobalVariable *GV, // To be a got equivalent, at least one of its users need to be a constant // expression used by another global variable. 
for (const auto *U : GV->users()) - NumGOTEquivUsers += getNumGlobalVariableUses(dyn_cast(U)); + NumGOTEquivUsers += + getNumGlobalVariableUses(dyn_cast(U), HasNonGlobalUsers); return NumGOTEquivUsers > 0; } @@ -2192,9 +2198,13 @@ void AsmPrinter::computeGlobalGOTEquivs(Module &M) { for (const auto &G : M.globals()) { unsigned NumGOTEquivUsers = 0; - if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers)) + bool HasNonGlobalUsers = false; + if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers, HasNonGlobalUsers)) continue; - + // If non-global variables use it, we still need to emit it. + // Add 1 here, then emit it in `emitGlobalGOTEquivs`. + if (HasNonGlobalUsers) + NumGOTEquivUsers += 1; const MCSymbol *GOTEquivSym = getSymbol(&G); GlobalGOTEquivs[GOTEquivSym] = std::make_pair(&G, NumGOTEquivUsers); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index 5442fb15202ea..171fb8394990d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -374,6 +374,18 @@ static void handleNewDebugValue(InlinedEntity Var, const MachineInstr &DV, DbgValueHistoryMap &HistMap) { EntryIndex NewIndex; if (HistMap.startDbgValue(Var, DV, NewIndex)) { + // As we already need to iterate all LiveEntries when handling a DbgValue, + // we use this map to avoid a more expensive check against RegVars. There + // is an assert that we handle this correctly in addRegDescribedVar. + // + // In other terms, the presence in this map indicates the presence of a + // corresponding entry in RegVars. + // + // The bool value then tracks whether an entry is to be retained (true) or + // removed (false); as we end previous entries we speculatively assume they + // can be dropped from RegVars, but we then also visit the new entry whose + // set of debug register operands may overlap and "save" a reg from being + // dropped. 
SmallDenseMap TrackedRegs; // If we have created a new debug value entry, close all preceding @@ -467,9 +479,6 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF, for (const auto &MI : MBB) { if (MI.isDebugValue()) { assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!"); - // Use the base variable (without any DW_OP_piece expressions) - // as index into History. The full variables including the - // piece expressions are attached to the MI. const DILocalVariable *RawVar = MI.getDebugVariable(); assert(RawVar->isValidLocationForIntrinsic(MI.getDebugLoc()) && "Expected inlined-at fields to agree"); @@ -493,8 +502,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF, if (MI.isMetaInstruction()) continue; - // Not a DBG_VALUE instruction. It may clobber registers which describe - // some variables. + // Other instructions may clobber registers which describe some variables. for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef() && MO.getReg()) { // Ignore call instructions that claim to clobber SP. The AArch64 diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index bfe6e7d6a802a..9bd337a962b86 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1013,7 +1013,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { // Add name if not anonymous or intermediate type. StringRef Name = CTy->getName(); - uint64_t Size = CTy->getSizeInBits() >> 3; uint16_t Tag = Buffer.getTag(); switch (Tag) { @@ -1176,15 +1175,28 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) { - // Add size if non-zero (derived types might be zero-sized.) - // Ignore the size if it's a non-enum forward decl. 
- // TODO: Do we care about size for enum forward declarations? - if (Size && - (!CTy->isForwardDecl() || Tag == dwarf::DW_TAG_enumeration_type)) - addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); - else if (!CTy->isForwardDecl()) - // Add zero size if it is not a forward declaration. - addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, 0); + if (auto *Var = dyn_cast_or_null(CTy->getRawSizeInBits())) { + if (auto *VarDIE = getDIE(Var)) + addDIEEntry(Buffer, dwarf::DW_AT_bit_size, *VarDIE); + } else if (auto *Exp = + dyn_cast_or_null(CTy->getRawSizeInBits())) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Exp); + addBlock(Buffer, dwarf::DW_AT_bit_size, DwarfExpr.finalize()); + } else { + uint64_t Size = CTy->getSizeInBits() >> 3; + // Add size if non-zero (derived types might be zero-sized.) + // Ignore the size if it's a non-enum forward decl. + // TODO: Do we care about size for enum forward declarations? + if (Size && + (!CTy->isForwardDecl() || Tag == dwarf::DW_TAG_enumeration_type)) + addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); + else if (!CTy->isForwardDecl()) + // Add zero size if it is not a forward declaration. + addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, 0); + } // If we're a forward decl, say so. if (CTy->isForwardDecl()) @@ -1864,74 +1876,117 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie); } else { - uint64_t Size = DT->getSizeInBits(); - uint64_t FieldSize = DD->getBaseTypeSize(DT); - uint32_t AlignInBytes = DT->getAlignInBytes(); - uint64_t OffsetInBytes; + uint64_t Size = 0; + uint64_t FieldSize = 0; bool IsBitfield = DT->isBitField(); - if (IsBitfield) { - // Handle bitfield, assume bytes are 8 bits. 
- if (DD->useDWARF2Bitfields()) - addUInt(MemberDie, dwarf::DW_AT_byte_size, std::nullopt, FieldSize / 8); - addUInt(MemberDie, dwarf::DW_AT_bit_size, std::nullopt, Size); - - assert(DT->getOffsetInBits() <= - (uint64_t)std::numeric_limits::max()); - int64_t Offset = DT->getOffsetInBits(); - // We can't use DT->getAlignInBits() here: AlignInBits for member type - // is non-zero if and only if alignment was forced (e.g. _Alignas()), - // which can't be done with bitfields. Thus we use FieldSize here. - uint32_t AlignInBits = FieldSize; - uint32_t AlignMask = ~(AlignInBits - 1); - // The bits from the start of the storage unit to the start of the field. - uint64_t StartBitOffset = Offset - (Offset & AlignMask); - // The byte offset of the field's aligned storage unit inside the struct. - OffsetInBytes = (Offset - StartBitOffset) / 8; - - if (DD->useDWARF2Bitfields()) { - uint64_t HiMark = (Offset + FieldSize) & AlignMask; - uint64_t FieldOffset = (HiMark - FieldSize); - Offset -= FieldOffset; - - // Maybe we need to work from the other end. - if (Asm->getDataLayout().isLittleEndian()) - Offset = FieldSize - (Offset + Size); - - if (Offset < 0) - addSInt(MemberDie, dwarf::DW_AT_bit_offset, dwarf::DW_FORM_sdata, + + // Handle the size. + if (auto *Var = dyn_cast_or_null(DT->getRawSizeInBits())) { + if (auto *VarDIE = getDIE(Var)) + addDIEEntry(MemberDie, dwarf::DW_AT_bit_size, *VarDIE); + } else if (auto *Exp = + dyn_cast_or_null(DT->getRawSizeInBits())) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Exp); + addBlock(MemberDie, dwarf::DW_AT_bit_size, DwarfExpr.finalize()); + } else { + Size = DT->getSizeInBits(); + FieldSize = DD->getBaseTypeSize(DT); + if (IsBitfield) { + // Handle bitfield, assume bytes are 8 bits. 
+ if (DD->useDWARF2Bitfields()) + addUInt(MemberDie, dwarf::DW_AT_byte_size, std::nullopt, + FieldSize / 8); + addUInt(MemberDie, dwarf::DW_AT_bit_size, std::nullopt, Size); + } + } + + // Handle the location. DW_AT_data_bit_offset won't allow an + // expression until DWARF 6, but it can be used as an extension. + // See https://dwarfstd.org/issues/250501.1.html + if (auto *Var = dyn_cast_or_null(DT->getRawOffsetInBits())) { + if (!Asm->TM.Options.DebugStrictDwarf || DD->getDwarfVersion() >= 6) { + if (auto *VarDIE = getDIE(Var)) + addDIEEntry(MemberDie, dwarf::DW_AT_data_bit_offset, *VarDIE); + } + } else if (auto *Expr = + dyn_cast_or_null(DT->getRawOffsetInBits())) { + if (!Asm->TM.Options.DebugStrictDwarf || DD->getDwarfVersion() >= 6) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Expr); + addBlock(MemberDie, dwarf::DW_AT_data_bit_offset, DwarfExpr.finalize()); + } + } else { + uint32_t AlignInBytes = DT->getAlignInBytes(); + uint64_t OffsetInBytes; + + if (IsBitfield) { + assert(DT->getOffsetInBits() <= + (uint64_t)std::numeric_limits::max()); + int64_t Offset = DT->getOffsetInBits(); + // We can't use DT->getAlignInBits() here: AlignInBits for member type + // is non-zero if and only if alignment was forced (e.g. _Alignas()), + // which can't be done with bitfields. Thus we use FieldSize here. + uint32_t AlignInBits = FieldSize; + uint32_t AlignMask = ~(AlignInBits - 1); + // The bits from the start of the storage unit to the start of the + // field. + uint64_t StartBitOffset = Offset - (Offset & AlignMask); + // The byte offset of the field's aligned storage unit inside the + // struct. 
+ OffsetInBytes = (Offset - StartBitOffset) / 8; + + if (DD->useDWARF2Bitfields()) { + uint64_t HiMark = (Offset + FieldSize) & AlignMask; + uint64_t FieldOffset = (HiMark - FieldSize); + Offset -= FieldOffset; + + // Maybe we need to work from the other end. + if (Asm->getDataLayout().isLittleEndian()) + Offset = FieldSize - (Offset + Size); + + if (Offset < 0) + addSInt(MemberDie, dwarf::DW_AT_bit_offset, dwarf::DW_FORM_sdata, + Offset); + else + addUInt(MemberDie, dwarf::DW_AT_bit_offset, std::nullopt, + (uint64_t)Offset); + OffsetInBytes = FieldOffset >> 3; + } else { + addUInt(MemberDie, dwarf::DW_AT_data_bit_offset, std::nullopt, Offset); - else - addUInt(MemberDie, dwarf::DW_AT_bit_offset, std::nullopt, - (uint64_t)Offset); - OffsetInBytes = FieldOffset >> 3; + } } else { - addUInt(MemberDie, dwarf::DW_AT_data_bit_offset, std::nullopt, Offset); + // This is not a bitfield. + OffsetInBytes = DT->getOffsetInBits() / 8; + if (AlignInBytes) + addUInt(MemberDie, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata, + AlignInBytes); } - } else { - // This is not a bitfield. - OffsetInBytes = DT->getOffsetInBits() / 8; - if (AlignInBytes) - addUInt(MemberDie, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata, - AlignInBytes); - } - if (DD->getDwarfVersion() <= 2) { - DIELoc *MemLocationDie = new (DIEValueAllocator) DIELoc; - addUInt(*MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - addUInt(*MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes); - addBlock(MemberDie, dwarf::DW_AT_data_member_location, MemLocationDie); - } else if (!IsBitfield || DD->useDWARF2Bitfields()) { - // In DWARF v3, DW_FORM_data4/8 in DW_AT_data_member_location are - // interpreted as location-list pointers. Interpreting constants as - // pointers is not expected, so we use DW_FORM_udata to encode the - // constants here. 
- if (DD->getDwarfVersion() == 3) - addUInt(MemberDie, dwarf::DW_AT_data_member_location, - dwarf::DW_FORM_udata, OffsetInBytes); - else - addUInt(MemberDie, dwarf::DW_AT_data_member_location, std::nullopt, - OffsetInBytes); + if (DD->getDwarfVersion() <= 2) { + DIELoc *MemLocationDie = new (DIEValueAllocator) DIELoc; + addUInt(*MemLocationDie, dwarf::DW_FORM_data1, + dwarf::DW_OP_plus_uconst); + addUInt(*MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes); + addBlock(MemberDie, dwarf::DW_AT_data_member_location, MemLocationDie); + } else if (!IsBitfield || DD->useDWARF2Bitfields()) { + // In DWARF v3, DW_FORM_data4/8 in DW_AT_data_member_location are + // interpreted as location-list pointers. Interpreting constants as + // pointers is not expected, so we use DW_FORM_udata to encode the + // constants here. + if (DD->getDwarfVersion() == 3) + addUInt(MemberDie, dwarf::DW_AT_data_member_location, + dwarf::DW_FORM_udata, OffsetInBytes); + else + addUInt(MemberDie, dwarf::DW_AT_data_member_location, std::nullopt, + OffsetInBytes); + } } } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5f5af5cad778c..461fc35337eac 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -469,6 +469,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(COSH_F); case TargetOpcode::G_FTANH: RTLIBCASE(TANH_F); + case TargetOpcode::G_FSINCOS: + RTLIBCASE(SINCOS_F); case TargetOpcode::G_FLOG10: RTLIBCASE(LOG10_F); case TargetOpcode::G_FLOG: @@ -648,6 +650,54 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, LocObserver, &MI); } +LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall( + MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType, + LostDebugLocObserver &LocObserver) { + MachineFunction &MF = *MI.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + 
Register DstSin = MI.getOperand(0).getReg(); + Register DstCos = MI.getOperand(1).getReg(); + Register Src = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstSin); + + int MemSize = DstTy.getSizeInBytes(); + Align Alignment = getStackTemporaryAlignment(DstTy); + const DataLayout &DL = MIRBuilder.getDataLayout(); + unsigned AddrSpace = DL.getAllocaAddrSpace(); + MachinePointerInfo PtrInfo; + + Register StackPtrSin = + createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo) + .getReg(0); + Register StackPtrCos = + createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo) + .getReg(0); + + auto &Ctx = MF.getFunction().getContext(); + auto LibcallResult = + createLibcall(MIRBuilder, getRTLibDesc(MI.getOpcode(), Size), + {{0}, Type::getVoidTy(Ctx), 0}, + {{Src, OpType, 0}, + {StackPtrSin, PointerType::get(Ctx, AddrSpace), 1}, + {StackPtrCos, PointerType::get(Ctx, AddrSpace), 2}}, + LocObserver, &MI); + + if (LibcallResult != LegalizeResult::Legalized) + return LegalizerHelper::UnableToLegalize; + + MachineMemOperand *LoadMMOSin = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment); + MachineMemOperand *LoadMMOCos = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment); + + MIRBuilder.buildLoad(DstSin, StackPtrSin, *LoadMMOSin); + MIRBuilder.buildLoad(DstCos, StackPtrCos, *LoadMMOCos); + MI.eraseFromParent(); + + return LegalizerHelper::Legalized; +} + LegalizerHelper::LegalizeResult llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstr &MI, LostDebugLocObserver &LocObserver) { @@ -1275,6 +1325,16 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { return Status; break; } + case TargetOpcode::G_FSINCOS: { + LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = LLTy.getSizeInBits(); + Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); + if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) 
{ + LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); + return UnableToLegalize; + } + return emitSincosLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); + } case TargetOpcode::G_LROUND: case TargetOpcode::G_LLROUND: case TargetOpcode::G_INTRINSIC_LRINT: diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp index 84dc4ab0a5522..92ecfadf97c99 100644 --- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp +++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/CGData/CodeGenData.h" #include "llvm/CGData/CodeGenDataWriter.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/StructuralHash.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp index 5265bd74d2dbf..f80e1e8b683b3 100644 --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -117,7 +117,11 @@ namespace { /// IsAnalyzed - True if BB has been analyzed (info is still valid). /// IsEnqueued - True if BB has been enqueued to be ifcvt'ed. /// IsBrAnalyzable - True if analyzeBranch() returns false. - /// HasFallThrough - True if BB may fallthrough to the following BB. + /// HasFallThrough - True if BB has fallthrough to the following BB. + /// Note that BB may have a fallthrough if both + /// !HasFallThrough and !IsBrAnalyzable is true. Also note + /// that blockNeverFallThrough() can be used to prove that + /// there is no fall through. /// IsUnpredicable - True if BB is known to be unpredicable. /// ClobbersPred - True if BB could modify predicates (e.g. has /// cmp, call, etc.) @@ -125,7 +129,10 @@ namespace { /// ExtraCost - Extra cost for multi-cycle instructions. /// ExtraCost2 - Some instructions are slower when predicated /// BB - Corresponding MachineBasicBlock. - /// TrueBB / FalseBB- See analyzeBranch(). 
+ /// TrueBB / FalseBB- See analyzeBranch(), but note that FalseBB can be set + by AnalyzeBranches even if there is a fallthrough. So + it doesn't correspond exactly to the result from + TII::analyzeBranch. /// BrCond - Conditions for end of block conditional branches. /// Predicate - Predicate used in the BB. struct BBInfo { @@ -397,6 +404,21 @@ namespace { return BBI.IsBrAnalyzable && BBI.TrueBB == nullptr; } + /// Returns true if Block is known not to fallthrough to the following BB. + bool blockNeverFallThrough(BBInfo &BBI) const { + // Trust "HasFallThrough" if we could analyze branches. + if (BBI.IsBrAnalyzable) + return !BBI.HasFallThrough; + // If this is the last MBB in the function, or if the textual successor + // isn't in the successor list, then there is no fallthrough. + MachineFunction::iterator PI = BBI.BB->getIterator(); + MachineFunction::iterator I = std::next(PI); + if (I == BBI.BB->getParent()->end() || !PI->isSuccessor(&*I)) + return true; + // Could not prove that there is no fallthrough. + return false; + } + /// Used to sort if-conversion candidates. static bool IfcvtTokenCmp(const std::unique_ptr &C1, const std::unique_ptr &C2) { @@ -1715,9 +1737,8 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // Only merge them if the true block does not fallthrough to the false // block. By not merging them, we make it possible to iteratively // ifcvt the blocks.
- if (!HasEarlyExit && - NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough && - !NextMBB.hasAddressTaken()) { + if (!HasEarlyExit && NextMBB.pred_size() == 1 && + blockNeverFallThrough(*NextBBI) && !NextMBB.hasAddressTaken()) { MergeBlocks(BBI, *NextBBI); FalseBBDead = true; } else { @@ -2052,8 +2073,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, BBI.BB->removeSuccessor(FalseBBI.BB, true); BBInfo &TailBBI = BBAnalysis[TailBB->getNumber()]; - bool CanMergeTail = !TailBBI.HasFallThrough && - !TailBBI.BB->hasAddressTaken(); + bool CanMergeTail = + blockNeverFallThrough(TailBBI) && !TailBBI.BB->hasAddressTaken(); // The if-converted block can still have a predicated terminator // (e.g. a predicated return). If that is the case, we cannot merge // it with the tail block. diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 49f1504d244ed..9c4c86cebe7e5 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -629,173 +629,12 @@ static unsigned getIntrinsicFactor(const IntrinsicInst *II) { } } -// For an (de)interleave tree like this: -// -// A C B D -// |___| |___| -// |_____| -// | -// A B C D -// -// We will get ABCD at the end while the leaf operands/results -// are ACBD, which are also what we initially collected in -// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI -// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need -// to reorder them by interleaving these values. -static void interleaveLeafValues(MutableArrayRef SubLeaves) { - unsigned NumLeaves = SubLeaves.size(); - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); - if (NumLeaves == 2) - return; - - const unsigned HalfLeaves = NumLeaves / 2; - // Visit the sub-trees. 
- interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); - interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); - - SmallVector Buffer; - // a0 a1 a2 a3 b0 b1 b2 b3 - // -> a0 b0 a1 b1 a2 b2 a3 b3 - for (unsigned i = 0U; i < NumLeaves; ++i) - Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); - - llvm::copy(Buffer, SubLeaves.begin()); -} - -static bool -getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, - SmallVectorImpl &DeadInsts) { - assert(isInterleaveIntrinsic(II->getIntrinsicID())); - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - - // All the intermediate intrinsics will be deleted. - DeadInsts.push_back(Current); - - for (unsigned I = 0; I < getIntrinsicFactor(Current); ++I) { - Value *Op = Current->getOperand(I); - if (auto *OpII = dyn_cast(Op)) - if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { - Queue.push_back(OpII); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Operands.empty() && Op->getType() != Operands.back()->getType()) - return false; - - Operands.push_back(Op); - } - } - - const unsigned Factor = Operands.size(); - // Currently we only recognize factors 2...8 and other powers of 2. - // FIXME: should we assert here instead? 
- if (Factor <= 1 || - (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) - return false; - - // Recursively interleaved factors need to have their values reordered - // TODO: Remove once the loop vectorizer no longer recursively interleaves - // factors 4 + 8 - if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) - interleaveLeafValues(Operands); - return true; -} - -static bool -getVectorDeinterleaveFactor(IntrinsicInst *II, - SmallVectorImpl &Results, - SmallVectorImpl &DeadInsts) { - assert(isDeinterleaveIntrinsic(II->getIntrinsicID())); - using namespace PatternMatch; - if (!II->hasNUses(getIntrinsicFactor(II))) - return false; - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - assert(Current->hasNUses(getIntrinsicFactor(Current))); - - // All the intermediate intrinsics will be deleted from the bottom-up. - DeadInsts.insert(DeadInsts.begin(), Current); - - SmallVector EVs(getIntrinsicFactor(Current), nullptr); - for (User *Usr : Current->users()) { - if (!isa(Usr)) - return 0; - - auto *EV = cast(Usr); - // Intermediate ExtractValue instructions will also be deleted. - DeadInsts.insert(DeadInsts.begin(), EV); - ArrayRef Indices = EV->getIndices(); - if (Indices.size() != 1) - return false; - - if (!EVs[Indices[0]]) - EVs[Indices[0]] = EV; - else - return false; - } - - // We have legal indices. At this point we're either going - // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : EVs) { - // Continue the traversal. We're playing safe here and matching only the - // expression consisting of a perfectly balanced binary tree in which all - // intermediate values are only used once. 
- if (EV->hasOneUse() && - match(EV->user_back(), - m_Intrinsic()) && - EV->user_back()->hasNUses(2)) { - auto *EVUsr = cast(EV->user_back()); - Queue.push_back(EVUsr); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Results.empty() && EV->getType() != Results.back()->getType()) - return false; - - // Save the leaf value. - Results.push_back(EV); - } - } - - const unsigned Factor = Results.size(); - // Currently we only recognize factors of 2...8 and other powers of 2. - // FIXME: should we assert here instead? - if (Factor <= 1 || - (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) - return 0; - - // Recursively interleaved factors need to have their values reordered - // TODO: Remove once the loop vectorizer no longer recursively interleaves - // factors 4 + 8 - if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) - interleaveLeafValues(Results); - return true; -} - static Value *getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { - SmallVector Operands; - SmallVector DeadInsts; - if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { - assert(!Operands.empty()); - if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; + if (isInterleaveIntrinsic(IMI->getIntrinsicID()) && + getIntrinsicFactor(IMI) == Factor && llvm::all_equal(IMI->args())) { + return IMI->getArgOperand(0); } } @@ -830,13 +669,19 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( if (!LoadedVal->hasOneUse() || !isa(LoadedVal)) return false; - SmallVector DeinterleaveValues; - SmallVector DeinterleaveDeadInsts; - if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues, - DeinterleaveDeadInsts)) + const unsigned Factor = getIntrinsicFactor(DI); + if (!DI->hasNUses(Factor)) return false; - - const unsigned Factor = DeinterleaveValues.size(); + SmallVector DeinterleaveValues(Factor); + for (auto *User : DI->users()) { + 
auto *Extract = dyn_cast(User); + if (!Extract || Extract->getNumIndices() != 1) + return false; + unsigned Idx = Extract->getIndices()[0]; + if (DeinterleaveValues[Idx]) + return false; + DeinterleaveValues[Idx] = Extract; + } if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) @@ -869,7 +714,9 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; } - DeadInsts.insert_range(DeinterleaveDeadInsts); + for (Value *V : DeinterleaveValues) + DeadInsts.insert(cast(V)); + DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. DeadInsts.insert(cast(LoadedVal)); return true; @@ -883,12 +730,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( if (!isa(StoredBy)) return false; - SmallVector InterleaveValues; - SmallVector InterleaveDeadInsts; - if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts)) - return false; - - const unsigned Factor = InterleaveValues.size(); + SmallVector InterleaveValues(II->args()); + const unsigned Factor = getIntrinsicFactor(II); if (auto *VPStore = dyn_cast(StoredBy)) { if (VPStore->getIntrinsicID() != Intrinsic::vp_store) @@ -922,7 +765,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // We now have a target-specific store, so delete the old one. 
DeadInsts.insert(cast(StoredBy)); - DeadInsts.insert_range(InterleaveDeadInsts); + DeadInsts.insert(II); return true; } diff --git a/llvm/lib/CodeGen/MachineDomTreeUpdater.cpp b/llvm/lib/CodeGen/MachineDomTreeUpdater.cpp index 72e4be0165bf8..15a175d6391dd 100644 --- a/llvm/lib/CodeGen/MachineDomTreeUpdater.cpp +++ b/llvm/lib/CodeGen/MachineDomTreeUpdater.cpp @@ -14,21 +14,22 @@ #include "llvm/CodeGen/MachineDomTreeUpdater.h" #include "llvm/Analysis/GenericDomTreeUpdaterImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/Support/Compiler.h" namespace llvm { -template class GenericDomTreeUpdater< +template class LLVM_EXPORT_TEMPLATE GenericDomTreeUpdater< MachineDomTreeUpdater, MachineDominatorTree, MachinePostDominatorTree>; -template void +template LLVM_EXPORT_TEMPLATE void GenericDomTreeUpdater::recalculate(MachineFunction &MF); -template void GenericDomTreeUpdater< +template LLVM_EXPORT_TEMPLATE void GenericDomTreeUpdater< MachineDomTreeUpdater, MachineDominatorTree, MachinePostDominatorTree>::applyUpdatesImpl(); -template void GenericDomTreeUpdater< +template LLVM_EXPORT_TEMPLATE void GenericDomTreeUpdater< MachineDomTreeUpdater, MachineDominatorTree, MachinePostDominatorTree>::applyUpdatesImpl(); diff --git a/llvm/lib/CodeGen/MachineDominators.cpp b/llvm/lib/CodeGen/MachineDominators.cpp index 917519f12a039..b221fa8b6de84 100644 --- a/llvm/lib/CodeGen/MachineDominators.cpp +++ b/llvm/lib/CodeGen/MachineDominators.cpp @@ -17,6 +17,7 @@ #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/GenericDomTreeConstruction.h" using namespace llvm; @@ -35,24 +36,29 @@ static cl::opt VerifyMachineDomInfoX( cl::desc("Verify machine dominator info (time consuming)")); namespace llvm { -template class DomTreeNodeBase; -template class DominatorTreeBase; // DomTreeBase +template class LLVM_EXPORT_TEMPLATE DomTreeNodeBase; +template class LLVM_EXPORT_TEMPLATE + 
DominatorTreeBase; // DomTreeBase namespace DomTreeBuilder { -template void Calculate(MBBDomTree &DT); -template void CalculateWithUpdates(MBBDomTree &DT, MBBUpdates U); +template LLVM_EXPORT_TEMPLATE void Calculate(MBBDomTree &DT); +template LLVM_EXPORT_TEMPLATE void +CalculateWithUpdates(MBBDomTree &DT, MBBUpdates U); -template void InsertEdge(MBBDomTree &DT, MachineBasicBlock *From, - MachineBasicBlock *To); +template LLVM_EXPORT_TEMPLATE void +InsertEdge(MBBDomTree &DT, MachineBasicBlock *From, + MachineBasicBlock *To); -template void DeleteEdge(MBBDomTree &DT, MachineBasicBlock *From, - MachineBasicBlock *To); +template LLVM_EXPORT_TEMPLATE void +DeleteEdge(MBBDomTree &DT, MachineBasicBlock *From, + MachineBasicBlock *To); -template void ApplyUpdates(MBBDomTree &DT, MBBDomTreeGraphDiff &, - MBBDomTreeGraphDiff *); +template LLVM_EXPORT_TEMPLATE void +ApplyUpdates(MBBDomTree &DT, MBBDomTreeGraphDiff &, + MBBDomTreeGraphDiff *); -template bool Verify(const MBBDomTree &DT, - MBBDomTree::VerificationLevel VL); +template LLVM_EXPORT_TEMPLATE bool +Verify(const MBBDomTree &DT, MBBDomTree::VerificationLevel VL); } // namespace DomTreeBuilder } diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp index 1c97e5c9063e4..fdb1a470493ce 100644 --- a/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -22,13 +22,16 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/GenericLoopInfoImpl.h" using namespace llvm; // Explicitly instantiate methods in LoopInfoImpl.h for MI-level Loops. 
-template class llvm::LoopBase; -template class llvm::LoopInfoBase; +template class LLVM_EXPORT_TEMPLATE + llvm::LoopBase; +template class LLVM_EXPORT_TEMPLATE + llvm::LoopInfoBase; AnalysisKey MachineLoopAnalysis::Key; diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp index bbe386507fcd2..6e445f6787903 100644 --- a/llvm/lib/CodeGen/MachinePassManager.cpp +++ b/llvm/lib/CodeGen/MachinePassManager.cpp @@ -17,20 +17,21 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManagerImpl.h" +#include "llvm/Support/Compiler.h" using namespace llvm; AnalysisKey FunctionAnalysisManagerMachineFunctionProxy::Key; namespace llvm { -template class AnalysisManager; +template class LLVM_EXPORT_TEMPLATE AnalysisManager; template class PassManager; -template class InnerAnalysisManagerProxy; -template class InnerAnalysisManagerProxy; -template class OuterAnalysisManagerProxy; +template class LLVM_EXPORT_TEMPLATE + InnerAnalysisManagerProxy; +template class LLVM_EXPORT_TEMPLATE + InnerAnalysisManagerProxy; +template class LLVM_EXPORT_TEMPLATE + OuterAnalysisManagerProxy; } // namespace llvm bool FunctionAnalysisManagerMachineFunctionProxy::Result::invalidate( diff --git a/llvm/lib/CodeGen/MachinePostDominators.cpp b/llvm/lib/CodeGen/MachinePostDominators.cpp index 51637130addc4..1cb7e465881a2 100644 --- a/llvm/lib/CodeGen/MachinePostDominators.cpp +++ b/llvm/lib/CodeGen/MachinePostDominators.cpp @@ -18,22 +18,25 @@ using namespace llvm; namespace llvm { -template class DominatorTreeBase; // PostDomTreeBase +template class LLVM_EXPORT_TEMPLATE + DominatorTreeBase; // PostDomTreeBase namespace DomTreeBuilder { -template void Calculate(MBBPostDomTree &DT); -template void InsertEdge(MBBPostDomTree &DT, - MachineBasicBlock *From, - MachineBasicBlock *To); -template void DeleteEdge(MBBPostDomTree &DT, - MachineBasicBlock *From, - MachineBasicBlock *To); -template void ApplyUpdates(MBBPostDomTree &DT, - 
MBBPostDomTreeGraphDiff &, - MBBPostDomTreeGraphDiff *); -template bool Verify(const MBBPostDomTree &DT, - MBBPostDomTree::VerificationLevel VL); +template LLVM_EXPORT_TEMPLATE void +Calculate(MBBPostDomTree &DT); +template LLVM_EXPORT_TEMPLATE void +InsertEdge(MBBPostDomTree &DT, MachineBasicBlock *From, + MachineBasicBlock *To); +template LLVM_EXPORT_TEMPLATE void +DeleteEdge(MBBPostDomTree &DT, MachineBasicBlock *From, + MachineBasicBlock *To); +template LLVM_EXPORT_TEMPLATE void +ApplyUpdates(MBBPostDomTree &DT, MBBPostDomTreeGraphDiff &, + MBBPostDomTreeGraphDiff *); +template LLVM_EXPORT_TEMPLATE bool +Verify(const MBBPostDomTree &DT, + MBBPostDomTree::VerificationLevel VL); } // namespace DomTreeBuilder extern bool VerifyMachineDomInfo; diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp index 8c140261c11ca..b86647dbe0a48 100644 --- a/llvm/lib/CodeGen/RegAllocScore.cpp +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -23,13 +23,16 @@ #include "llvm/Support/CommandLine.h" using namespace llvm; -cl::opt CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden); -cl::opt LoadWeight("regalloc-load-weight", cl::init(4.0), cl::Hidden); -cl::opt StoreWeight("regalloc-store-weight", cl::init(1.0), cl::Hidden); -cl::opt CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2), - cl::Hidden); -cl::opt ExpensiveRematWeight("regalloc-expensive-remat-weight", - cl::init(1.0), cl::Hidden); +LLVM_ABI cl::opt CopyWeight("regalloc-copy-weight", cl::init(0.2), + cl::Hidden); +LLVM_ABI cl::opt LoadWeight("regalloc-load-weight", cl::init(4.0), + cl::Hidden); +LLVM_ABI cl::opt StoreWeight("regalloc-store-weight", cl::init(1.0), + cl::Hidden); +LLVM_ABI cl::opt CheapRematWeight("regalloc-cheap-remat-weight", + cl::init(0.2), cl::Hidden); +LLVM_ABI cl::opt ExpensiveRematWeight("regalloc-expensive-remat-weight", + cl::init(1.0), cl::Hidden); #define DEBUG_TYPE "regalloc-score" RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) 
{ diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 66717135c9adf..a0b5f67c2e6c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts( DemandedRHS.setBit(M - NumElts); } + // If either side isn't demanded, replace it by UNDEF. We handle this + // explicitly here to also simplify in case of multiple uses (on the + // contrary to the SimplifyDemandedVectorElts calls below). + bool FoldLHS = !DemandedLHS && !LHS.isUndef(); + bool FoldRHS = !DemandedRHS && !RHS.isUndef(); + if (FoldLHS || FoldRHS) { + LHS = FoldLHS ? TLO.DAG.getUNDEF(LHS.getValueType()) : LHS; + RHS = FoldRHS ? TLO.DAG.getUNDEF(RHS.getValueType()) : RHS; + SDValue NewOp = + TLO.DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, ShuffleMask); + return TLO.CombineTo(Op, NewOp); + } + // See if we can simplify either shuffle operand. 
APInt UndefLHS, ZeroLHS; APInt UndefRHS, ZeroRHS; diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp index 45cb28af56050..854e6d7135860 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp @@ -11,6 +11,7 @@ #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/DefaultHostBootstrapValues.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h" +#include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/Process.h" #include "llvm/TargetParser/Host.h" diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ddc9c5392f922..d4f95be083a47 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -824,8 +824,12 @@ void OpenMPIRBuilder::finalize(Function *Fn) { M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")}; emitUsed("llvm.compiler.used", LLVMCompilerUsed); } + + IsFinalized = true; } +bool OpenMPIRBuilder::isFinalized() { return IsFinalized; } + OpenMPIRBuilder::~OpenMPIRBuilder() { assert(OutlineInfos.empty() && "There must be no outstanding outlinings"); } diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index fd8c2d7bb5cc3..6001ed421183b 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -325,21 +325,22 @@ DIStringType *DIBuilder::createStringType(StringRef Name, } DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) { - return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0, - 0, 0, std::nullopt, std::nullopt, DINode::FlagZero); + return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, + (uint64_t)0, 0, (uint64_t)0, std::nullopt, + std::nullopt, DINode::FlagZero); } DIDerivedType *DIBuilder::createPtrAuthQualifiedType( DIType *FromTy, 
unsigned Key, bool IsAddressDiscriminated, unsigned ExtraDiscriminator, bool IsaPointer, bool AuthenticatesNullValues) { - return DIDerivedType::get(VMContext, dwarf::DW_TAG_LLVM_ptrauth_type, "", - nullptr, 0, nullptr, FromTy, 0, 0, 0, std::nullopt, - std::optional( - std::in_place, Key, IsAddressDiscriminated, - ExtraDiscriminator, IsaPointer, - AuthenticatesNullValues), - DINode::FlagZero); + return DIDerivedType::get( + VMContext, dwarf::DW_TAG_LLVM_ptrauth_type, "", nullptr, 0, nullptr, + FromTy, (uint64_t)0, 0, (uint64_t)0, std::nullopt, + std::optional( + std::in_place, Key, IsAddressDiscriminated, ExtraDiscriminator, + IsaPointer, AuthenticatesNullValues), + DINode::FlagZero); } DIDerivedType * @@ -381,9 +382,9 @@ DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name, DINode::DIFlags Flags, DINodeArray Annotations) { return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File, - LineNo, getNonCompileUnitScope(Context), Ty, 0, - AlignInBits, 0, std::nullopt, std::nullopt, Flags, - nullptr, Annotations); + LineNo, getNonCompileUnitScope(Context), Ty, + (uint64_t)0, AlignInBits, (uint64_t)0, std::nullopt, + std::nullopt, Flags, nullptr, Annotations); } DIDerivedType * @@ -392,17 +393,17 @@ DIBuilder::createTemplateAlias(DIType *Ty, StringRef Name, DIFile *File, DINodeArray TParams, uint32_t AlignInBits, DINode::DIFlags Flags, DINodeArray Annotations) { return DIDerivedType::get(VMContext, dwarf::DW_TAG_template_alias, Name, File, - LineNo, getNonCompileUnitScope(Context), Ty, 0, - AlignInBits, 0, std::nullopt, std::nullopt, Flags, - TParams.get(), Annotations); + LineNo, getNonCompileUnitScope(Context), Ty, + (uint64_t)0, AlignInBits, (uint64_t)0, std::nullopt, + std::nullopt, Flags, TParams.get(), Annotations); } DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) { assert(Ty && "Invalid type!"); assert(FriendTy && "Invalid friend type!"); return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0, Ty, - 
FriendTy, 0, 0, 0, std::nullopt, std::nullopt, - DINode::FlagZero); + FriendTy, (uint64_t)0, 0, (uint64_t)0, std::nullopt, + std::nullopt, DINode::FlagZero); } DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy, @@ -427,6 +428,16 @@ DIDerivedType *DIBuilder::createMemberType( std::nullopt, Flags, nullptr, Annotations); } +DIDerivedType *DIBuilder::createMemberType( + DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber, + Metadata *SizeInBits, uint32_t AlignInBits, Metadata *OffsetInBits, + DINode::DIFlags Flags, DIType *Ty, DINodeArray Annotations) { + return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File, + LineNumber, getNonCompileUnitScope(Scope), Ty, + SizeInBits, AlignInBits, OffsetInBits, std::nullopt, + std::nullopt, Flags, nullptr, Annotations); +} + static ConstantAsMetadata *getConstantOrNull(Constant *C) { if (C) return ConstantAsMetadata::get(C); @@ -451,14 +462,29 @@ DIDerivedType *DIBuilder::createVariantMemberType(DIScope *Scope, Constant *Discriminant, DIType *Ty) { auto *V = DICompositeType::get(VMContext, dwarf::DW_TAG_variant, {}, nullptr, - 0, getNonCompileUnitScope(Scope), {}, 0, 0, 0, - DINode::FlagZero, Elements, 0, {}, nullptr); + 0, getNonCompileUnitScope(Scope), {}, + (uint64_t)0, 0, (uint64_t)0, DINode::FlagZero, + Elements, 0, {}, nullptr); trackIfUnresolved(V); return createVariantMemberType(Scope, {}, nullptr, 0, 0, 0, 0, Discriminant, DINode::FlagZero, V); } +DIDerivedType *DIBuilder::createBitFieldMemberType( + DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber, + Metadata *SizeInBits, Metadata *OffsetInBits, uint64_t StorageOffsetInBits, + DINode::DIFlags Flags, DIType *Ty, DINodeArray Annotations) { + Flags |= DINode::FlagBitField; + return DIDerivedType::get( + VMContext, dwarf::DW_TAG_member, Name, File, LineNumber, + getNonCompileUnitScope(Scope), Ty, SizeInBits, /*AlignInBits=*/0, + OffsetInBits, std::nullopt, std::nullopt, Flags, + 
ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(VMContext, 64), + StorageOffsetInBits)), + Annotations); +} + DIDerivedType *DIBuilder::createBitFieldMemberType( DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber, uint64_t SizeInBits, uint64_t OffsetInBits, uint64_t StorageOffsetInBits, @@ -480,9 +506,9 @@ DIBuilder::createStaticMemberType(DIScope *Scope, StringRef Name, DIFile *File, unsigned Tag, uint32_t AlignInBits) { Flags |= DINode::FlagStaticMember; return DIDerivedType::get(VMContext, Tag, Name, File, LineNumber, - getNonCompileUnitScope(Scope), Ty, 0, AlignInBits, - 0, std::nullopt, std::nullopt, Flags, - getConstantOrNull(Val)); + getNonCompileUnitScope(Scope), Ty, (uint64_t)0, + AlignInBits, (uint64_t)0, std::nullopt, + std::nullopt, Flags, getConstantOrNull(Val)); } DIDerivedType * @@ -563,6 +589,22 @@ DICompositeType *DIBuilder::createClassType( return R; } +DICompositeType *DIBuilder::createStructType( + DIScope *Context, StringRef Name, DIFile *File, unsigned LineNumber, + Metadata *SizeInBits, uint32_t AlignInBits, DINode::DIFlags Flags, + DIType *DerivedFrom, DINodeArray Elements, unsigned RunTimeLang, + DIType *VTableHolder, StringRef UniqueIdentifier, DIType *Specification, + uint32_t NumExtraInhabitants) { + auto *R = DICompositeType::get( + VMContext, dwarf::DW_TAG_structure_type, Name, File, LineNumber, + getNonCompileUnitScope(Context), DerivedFrom, SizeInBits, AlignInBits, 0, + Flags, Elements, RunTimeLang, /*EnumKind=*/std::nullopt, VTableHolder, + nullptr, UniqueIdentifier, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, Specification, NumExtraInhabitants); + trackIfUnresolved(R); + return R; +} + DICompositeType *DIBuilder::createStructType( DIScope *Context, StringRef Name, DIFile *File, unsigned LineNumber, uint64_t SizeInBits, uint32_t AlignInBits, DINode::DIFlags Flags, diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index 473114b99225b..44b0f0d50067c 100644 --- 
a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -825,25 +825,23 @@ DIGenericSubrange::BoundType DIGenericSubrange::getStride() const { } DISubrangeType::DISubrangeType(LLVMContext &C, StorageType Storage, - unsigned Line, uint64_t SizeInBits, - uint32_t AlignInBits, DIFlags Flags, - ArrayRef Ops) + unsigned Line, uint32_t AlignInBits, + DIFlags Flags, ArrayRef Ops) : DIType(C, DISubrangeTypeKind, Storage, dwarf::DW_TAG_subrange_type, Line, - SizeInBits, AlignInBits, 0, 0, Flags, Ops) {} + AlignInBits, 0, Flags, Ops) {} DISubrangeType *DISubrangeType::getImpl( LLVMContext &Context, MDString *Name, Metadata *File, unsigned Line, - Metadata *Scope, uint64_t SizeInBits, uint32_t AlignInBits, DIFlags Flags, + Metadata *Scope, Metadata *SizeInBits, uint32_t AlignInBits, DIFlags Flags, Metadata *BaseType, Metadata *LowerBound, Metadata *UpperBound, Metadata *Stride, Metadata *Bias, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); DEFINE_GETIMPL_LOOKUP(DISubrangeType, (Name, File, Line, Scope, SizeInBits, AlignInBits, Flags, BaseType, LowerBound, UpperBound, Stride, Bias)); - Metadata *Ops[] = {File, Scope, Name, BaseType, - LowerBound, UpperBound, Stride, Bias}; - DEFINE_GETIMPL_STORE(DISubrangeType, (Line, SizeInBits, AlignInBits, Flags), - Ops); + Metadata *Ops[] = {File, Scope, Name, SizeInBits, nullptr, + BaseType, LowerBound, UpperBound, Stride, Bias}; + DEFINE_GETIMPL_STORE(DISubrangeType, (Line, AlignInBits, Flags), Ops); } DISubrangeType::BoundType @@ -883,18 +881,17 @@ DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, const APInt &Value, } DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag, - MDString *Name, uint64_t SizeInBits, + MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, uint32_t NumExtraInhabitants, DIFlags Flags, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); 
DEFINE_GETIMPL_LOOKUP(DIBasicType, (Tag, Name, SizeInBits, AlignInBits, Encoding, NumExtraInhabitants, Flags)); - Metadata *Ops[] = {nullptr, nullptr, Name}; - DEFINE_GETIMPL_STORE( - DIBasicType, - (Tag, SizeInBits, AlignInBits, Encoding, NumExtraInhabitants, Flags), - Ops); + Metadata *Ops[] = {nullptr, nullptr, Name, SizeInBits, nullptr}; + DEFINE_GETIMPL_STORE(DIBasicType, + (Tag, AlignInBits, Encoding, NumExtraInhabitants, Flags), + Ops); } std::optional DIBasicType::getSignedness() const { @@ -914,18 +911,18 @@ std::optional DIBasicType::getSignedness() const { DIFixedPointType * DIFixedPointType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, - uint64_t SizeInBits, uint32_t AlignInBits, + Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, APInt Denominator, StorageType Storage, bool ShouldCreate) { DEFINE_GETIMPL_LOOKUP(DIFixedPointType, (Tag, Name, SizeInBits, AlignInBits, Encoding, Flags, Kind, Factor, Numerator, Denominator)); - Metadata *Ops[] = {nullptr, nullptr, Name}; - DEFINE_GETIMPL_STORE(DIFixedPointType, - (Tag, SizeInBits, AlignInBits, Encoding, Flags, Kind, - Factor, Numerator, Denominator), - Ops); + Metadata *Ops[] = {nullptr, nullptr, Name, SizeInBits, nullptr}; + DEFINE_GETIMPL_STORE( + DIFixedPointType, + (Tag, AlignInBits, Encoding, Flags, Kind, Factor, Numerator, Denominator), + Ops); } bool DIFixedPointType::isSigned() const { @@ -957,17 +954,17 @@ DIStringType *DIStringType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *StringLength, Metadata *StringLengthExp, Metadata *StringLocationExp, - uint64_t SizeInBits, uint32_t AlignInBits, + Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); DEFINE_GETIMPL_LOOKUP(DIStringType, (Tag, Name, StringLength, StringLengthExp, StringLocationExp, SizeInBits, AlignInBits, Encoding)); - 
Metadata *Ops[] = {nullptr, nullptr, Name, - StringLength, StringLengthExp, StringLocationExp}; - DEFINE_GETIMPL_STORE(DIStringType, (Tag, SizeInBits, AlignInBits, Encoding), - Ops); + Metadata *Ops[] = {nullptr, nullptr, Name, + SizeInBits, nullptr, StringLength, + StringLengthExp, StringLocationExp}; + DEFINE_GETIMPL_STORE(DIStringType, (Tag, AlignInBits, Encoding), Ops); } DIType *DIDerivedType::getClassType() const { assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); @@ -1004,8 +1001,8 @@ Constant *DIDerivedType::getDiscriminantValue() const { DIDerivedType *DIDerivedType::getImpl( LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File, - unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits, - uint32_t AlignInBits, uint64_t OffsetInBits, + unsigned Line, Metadata *Scope, Metadata *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, std::optional DWARFAddressSpace, std::optional PtrAuthData, DIFlags Flags, Metadata *ExtraData, Metadata *Annotations, StorageType Storage, bool ShouldCreate) { @@ -1014,11 +1011,11 @@ DIDerivedType *DIDerivedType::getImpl( (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData, Flags, ExtraData, Annotations)); - Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData, Annotations}; - DEFINE_GETIMPL_STORE(DIDerivedType, - (Tag, Line, SizeInBits, AlignInBits, OffsetInBits, - DWARFAddressSpace, PtrAuthData, Flags), - Ops); + Metadata *Ops[] = {File, Scope, Name, SizeInBits, + OffsetInBits, BaseType, ExtraData, Annotations}; + DEFINE_GETIMPL_STORE( + DIDerivedType, + (Tag, Line, AlignInBits, DWARFAddressSpace, PtrAuthData, Flags), Ops); } std::optional @@ -1030,8 +1027,8 @@ DIDerivedType::getPtrAuthData() const { DICompositeType *DICompositeType::getImpl( LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File, - unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits, - uint32_t AlignInBits, 
uint64_t OffsetInBits, DIFlags Flags, + unsigned Line, Metadata *Scope, Metadata *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, DIFlags Flags, Metadata *Elements, unsigned RuntimeLang, std::optional EnumKind, Metadata *VTableHolder, Metadata *TemplateParams, MDString *Identifier, Metadata *Discriminator, Metadata *DataLocation, Metadata *Associated, @@ -1047,20 +1044,21 @@ DICompositeType *DICompositeType::getImpl( OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder, TemplateParams, Identifier, Discriminator, DataLocation, Associated, Allocated, Rank, Annotations, Specification, NumExtraInhabitants, BitStride)); - Metadata *Ops[] = {File, Scope, Name, BaseType, - Elements, VTableHolder, TemplateParams, Identifier, - Discriminator, DataLocation, Associated, Allocated, - Rank, Annotations, Specification, BitStride}; + Metadata *Ops[] = {File, Scope, Name, SizeInBits, + OffsetInBits, BaseType, Elements, VTableHolder, + TemplateParams, Identifier, Discriminator, DataLocation, + Associated, Allocated, Rank, Annotations, + Specification, BitStride}; DEFINE_GETIMPL_STORE(DICompositeType, - (Tag, Line, RuntimeLang, SizeInBits, AlignInBits, - OffsetInBits, NumExtraInhabitants, EnumKind, Flags), + (Tag, Line, RuntimeLang, AlignInBits, + NumExtraInhabitants, EnumKind, Flags), Ops); } DICompositeType *DICompositeType::buildODRType( LLVMContext &Context, MDString &Identifier, unsigned Tag, MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType, - uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits, + Metadata *SizeInBits, uint32_t AlignInBits, Metadata *OffsetInBits, Metadata *Specification, uint32_t NumExtraInhabitants, DIFlags Flags, Metadata *Elements, unsigned RuntimeLang, std::optional EnumKind, Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator, @@ -1086,12 +1084,13 @@ DICompositeType *DICompositeType::buildODRType( return CT; // Mutate CT in place. 
Keep this in sync with getImpl. - CT->mutate(Tag, Line, RuntimeLang, SizeInBits, AlignInBits, OffsetInBits, - NumExtraInhabitants, EnumKind, Flags); - Metadata *Ops[] = {File, Scope, Name, BaseType, - Elements, VTableHolder, TemplateParams, &Identifier, - Discriminator, DataLocation, Associated, Allocated, - Rank, Annotations, Specification, BitStride}; + CT->mutate(Tag, Line, RuntimeLang, AlignInBits, NumExtraInhabitants, EnumKind, + Flags); + Metadata *Ops[] = {File, Scope, Name, SizeInBits, + OffsetInBits, BaseType, Elements, VTableHolder, + TemplateParams, &Identifier, Discriminator, DataLocation, + Associated, Allocated, Rank, Annotations, + Specification, BitStride}; assert((std::end(Ops) - std::begin(Ops)) == (int)CT->getNumOperands() && "Mismatched number of operands"); for (unsigned I = 0, E = CT->getNumOperands(); I != E; ++I) @@ -1103,7 +1102,7 @@ DICompositeType *DICompositeType::buildODRType( DICompositeType *DICompositeType::getODRType( LLVMContext &Context, MDString &Identifier, unsigned Tag, MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType, - uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits, + Metadata *SizeInBits, uint32_t AlignInBits, Metadata *OffsetInBits, Metadata *Specification, uint32_t NumExtraInhabitants, DIFlags Flags, Metadata *Elements, unsigned RuntimeLang, std::optional EnumKind, Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator, @@ -1138,7 +1137,7 @@ DISubroutineType::DISubroutineType(LLVMContext &C, StorageType Storage, DIFlags Flags, uint8_t CC, ArrayRef Ops) : DIType(C, DISubroutineTypeKind, Storage, dwarf::DW_TAG_subroutine_type, 0, - 0, 0, 0, 0, Flags, Ops), + 0, 0, Flags, Ops), CC(CC) {} DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, DIFlags Flags, @@ -1146,7 +1145,7 @@ DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, DIFlags Flags, StorageType Storage, bool ShouldCreate) { DEFINE_GETIMPL_LOOKUP(DISubroutineType, (Flags, 
CC, TypeArray)); - Metadata *Ops[] = {nullptr, nullptr, nullptr, TypeArray}; + Metadata *Ops[] = {nullptr, nullptr, nullptr, nullptr, nullptr, TypeArray}; DEFINE_GETIMPL_STORE(DISubroutineType, (Flags, CC), Ops); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 2d89ec1b0a8d3..8d9b545d4134f 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -643,6 +643,10 @@ MemoryEffects CallBase::getMemoryEffects() const { if (hasClobberingOperandBundles()) FnME |= MemoryEffects::writeOnly(); } + if (isVolatile()) { + // Volatile operations also access inaccessible memory. + FnME |= MemoryEffects::inaccessibleMemOnly(); + } ME &= FnME; } return ME; diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index ef279721b9643..4446f47d323d2 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -491,27 +491,28 @@ template <> struct MDNodeKeyImpl { template <> struct MDNodeKeyImpl { unsigned Tag; MDString *Name; - uint64_t SizeInBits; + Metadata *SizeInBits; uint32_t AlignInBits; unsigned Encoding; uint32_t NumExtraInhabitants; unsigned Flags; - MDNodeKeyImpl(unsigned Tag, MDString *Name, uint64_t SizeInBits, + MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, uint32_t NumExtraInhabitants, unsigned Flags) : Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits), Encoding(Encoding), NumExtraInhabitants(NumExtraInhabitants), Flags(Flags) {} MDNodeKeyImpl(const DIBasicType *N) - : Tag(N->getTag()), Name(N->getRawName()), SizeInBits(N->getSizeInBits()), - AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()), + : Tag(N->getTag()), Name(N->getRawName()), + SizeInBits(N->getRawSizeInBits()), AlignInBits(N->getAlignInBits()), + Encoding(N->getEncoding()), NumExtraInhabitants(N->getNumExtraInhabitants()), Flags(N->getFlags()) { } bool isKeyOf(const DIBasicType *RHS) const { return Tag == RHS->getTag() && Name 
== RHS->getRawName() && - SizeInBits == RHS->getSizeInBits() && + SizeInBits == RHS->getRawSizeInBits() && AlignInBits == RHS->getAlignInBits() && Encoding == RHS->getEncoding() && NumExtraInhabitants == RHS->getNumExtraInhabitants() && @@ -526,7 +527,7 @@ template <> struct MDNodeKeyImpl { template <> struct MDNodeKeyImpl { unsigned Tag; MDString *Name; - uint64_t SizeInBits; + Metadata *SizeInBits; uint32_t AlignInBits; unsigned Encoding; unsigned Flags; @@ -535,20 +536,21 @@ template <> struct MDNodeKeyImpl { APInt Numerator; APInt Denominator; - MDNodeKeyImpl(unsigned Tag, MDString *Name, uint64_t SizeInBits, + MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, unsigned Flags, unsigned Kind, int Factor, APInt Numerator, APInt Denominator) : Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits), Encoding(Encoding), Flags(Flags), Kind(Kind), Factor(Factor), Numerator(Numerator), Denominator(Denominator) {} MDNodeKeyImpl(const DIFixedPointType *N) - : Tag(N->getTag()), Name(N->getRawName()), SizeInBits(N->getSizeInBits()), - AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()), - Flags(N->getFlags()), Kind(N->getKind()), Factor(N->getFactorRaw()), - Numerator(N->getNumeratorRaw()), Denominator(N->getDenominatorRaw()) {} + : Tag(N->getTag()), Name(N->getRawName()), + SizeInBits(N->getRawSizeInBits()), AlignInBits(N->getAlignInBits()), + Encoding(N->getEncoding()), Flags(N->getFlags()), Kind(N->getKind()), + Factor(N->getFactorRaw()), Numerator(N->getNumeratorRaw()), + Denominator(N->getDenominatorRaw()) {} bool isKeyOf(const DIFixedPointType *RHS) const { - return Name == RHS->getRawName() && SizeInBits == RHS->getSizeInBits() && + return Name == RHS->getRawName() && SizeInBits == RHS->getRawSizeInBits() && AlignInBits == RHS->getAlignInBits() && Kind == RHS->getKind() && (RHS->isRational() ? 
(Numerator == RHS->getNumerator() && Denominator == RHS->getDenominator()) @@ -566,13 +568,13 @@ template <> struct MDNodeKeyImpl { Metadata *StringLength; Metadata *StringLengthExp; Metadata *StringLocationExp; - uint64_t SizeInBits; + Metadata *SizeInBits; uint32_t AlignInBits; unsigned Encoding; MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *StringLength, Metadata *StringLengthExp, Metadata *StringLocationExp, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding) + Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding) : Tag(Tag), Name(Name), StringLength(StringLength), StringLengthExp(StringLengthExp), StringLocationExp(StringLocationExp), SizeInBits(SizeInBits), AlignInBits(AlignInBits), Encoding(Encoding) {} @@ -581,7 +583,7 @@ template <> struct MDNodeKeyImpl { StringLength(N->getRawStringLength()), StringLengthExp(N->getRawStringLengthExp()), StringLocationExp(N->getRawStringLocationExp()), - SizeInBits(N->getSizeInBits()), AlignInBits(N->getAlignInBits()), + SizeInBits(N->getRawSizeInBits()), AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()) {} bool isKeyOf(const DIStringType *RHS) const { @@ -589,7 +591,7 @@ template <> struct MDNodeKeyImpl { StringLength == RHS->getRawStringLength() && StringLengthExp == RHS->getRawStringLengthExp() && StringLocationExp == RHS->getRawStringLocationExp() && - SizeInBits == RHS->getSizeInBits() && + SizeInBits == RHS->getRawSizeInBits() && AlignInBits == RHS->getAlignInBits() && Encoding == RHS->getEncoding(); } @@ -609,8 +611,8 @@ template <> struct MDNodeKeyImpl { unsigned Line; Metadata *Scope; Metadata *BaseType; - uint64_t SizeInBits; - uint64_t OffsetInBits; + Metadata *SizeInBits; + Metadata *OffsetInBits; uint32_t AlignInBits; std::optional DWARFAddressSpace; std::optional PtrAuthData; @@ -619,8 +621,8 @@ template <> struct MDNodeKeyImpl { Metadata *Annotations; MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line, - Metadata *Scope, Metadata *BaseType, 
uint64_t SizeInBits, - uint32_t AlignInBits, uint64_t OffsetInBits, + Metadata *Scope, Metadata *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, std::optional DWARFAddressSpace, std::optional PtrAuthData, unsigned Flags, Metadata *ExtraData, Metadata *Annotations) @@ -632,8 +634,8 @@ template <> struct MDNodeKeyImpl { MDNodeKeyImpl(const DIDerivedType *N) : Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()), Line(N->getLine()), Scope(N->getRawScope()), - BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()), - OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()), + BaseType(N->getRawBaseType()), SizeInBits(N->getRawSizeInBits()), + OffsetInBits(N->getRawOffsetInBits()), AlignInBits(N->getAlignInBits()), DWARFAddressSpace(N->getDWARFAddressSpace()), PtrAuthData(N->getPtrAuthData()), Flags(N->getFlags()), ExtraData(N->getRawExtraData()), Annotations(N->getRawAnnotations()) {} @@ -642,9 +644,9 @@ template <> struct MDNodeKeyImpl { return Tag == RHS->getTag() && Name == RHS->getRawName() && File == RHS->getRawFile() && Line == RHS->getLine() && Scope == RHS->getRawScope() && BaseType == RHS->getRawBaseType() && - SizeInBits == RHS->getSizeInBits() && + SizeInBits == RHS->getRawSizeInBits() && AlignInBits == RHS->getAlignInBits() && - OffsetInBits == RHS->getOffsetInBits() && + OffsetInBits == RHS->getRawOffsetInBits() && DWARFAddressSpace == RHS->getDWARFAddressSpace() && PtrAuthData == RHS->getPtrAuthData() && Flags == RHS->getFlags() && ExtraData == RHS->getRawExtraData() && @@ -673,7 +675,7 @@ template <> struct MDNodeKeyImpl { Metadata *File; unsigned Line; Metadata *Scope; - uint64_t SizeInBits; + Metadata *SizeInBits; uint32_t AlignInBits; unsigned Flags; Metadata *BaseType; @@ -683,7 +685,7 @@ template <> struct MDNodeKeyImpl { Metadata *Bias; MDNodeKeyImpl(MDString *Name, Metadata *File, unsigned Line, Metadata *Scope, - uint64_t SizeInBits, uint32_t AlignInBits, unsigned Flags, + Metadata 
*SizeInBits, uint32_t AlignInBits, unsigned Flags, Metadata *BaseType, Metadata *LowerBound, Metadata *UpperBound, Metadata *Stride, Metadata *Bias) : Name(Name), File(File), Line(Line), Scope(Scope), @@ -692,7 +694,7 @@ template <> struct MDNodeKeyImpl { Stride(Stride), Bias(Bias) {} MDNodeKeyImpl(const DISubrangeType *N) : Name(N->getRawName()), File(N->getRawFile()), Line(N->getLine()), - Scope(N->getRawScope()), SizeInBits(N->getSizeInBits()), + Scope(N->getRawScope()), SizeInBits(N->getRawSizeInBits()), AlignInBits(N->getAlignInBits()), Flags(N->getFlags()), BaseType(N->getRawBaseType()), LowerBound(N->getRawLowerBound()), UpperBound(N->getRawUpperBound()), Stride(N->getRawStride()), @@ -716,7 +718,7 @@ template <> struct MDNodeKeyImpl { return Name == RHS->getRawName() && File == RHS->getRawFile() && Line == RHS->getLine() && Scope == RHS->getRawScope() && - SizeInBits == RHS->getSizeInBits() && + SizeInBits == RHS->getRawSizeInBits() && AlignInBits == RHS->getAlignInBits() && Flags == RHS->getFlags() && BaseType == RHS->getRawBaseType() && BoundsEqual(LowerBound, RHS->getRawLowerBound()) && @@ -784,8 +786,8 @@ template <> struct MDNodeKeyImpl { unsigned Line; Metadata *Scope; Metadata *BaseType; - uint64_t SizeInBits; - uint64_t OffsetInBits; + Metadata *SizeInBits; + Metadata *OffsetInBits; uint32_t AlignInBits; unsigned Flags; Metadata *Elements; @@ -804,8 +806,8 @@ template <> struct MDNodeKeyImpl { Metadata *BitStride; MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line, - Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits, - uint32_t AlignInBits, uint64_t OffsetInBits, unsigned Flags, + Metadata *Scope, Metadata *BaseType, Metadata *SizeInBits, + uint32_t AlignInBits, Metadata *OffsetInBits, unsigned Flags, Metadata *Elements, unsigned RuntimeLang, Metadata *VTableHolder, Metadata *TemplateParams, MDString *Identifier, Metadata *Discriminator, @@ -825,8 +827,8 @@ template <> struct MDNodeKeyImpl { MDNodeKeyImpl(const 
DICompositeType *N) : Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()), Line(N->getLine()), Scope(N->getRawScope()), - BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()), - OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()), + BaseType(N->getRawBaseType()), SizeInBits(N->getRawSizeInBits()), + OffsetInBits(N->getRawOffsetInBits()), AlignInBits(N->getAlignInBits()), Flags(N->getFlags()), Elements(N->getRawElements()), RuntimeLang(N->getRuntimeLang()), VTableHolder(N->getRawVTableHolder()), TemplateParams(N->getRawTemplateParams()), @@ -843,10 +845,10 @@ template <> struct MDNodeKeyImpl { return Tag == RHS->getTag() && Name == RHS->getRawName() && File == RHS->getRawFile() && Line == RHS->getLine() && Scope == RHS->getRawScope() && BaseType == RHS->getRawBaseType() && - SizeInBits == RHS->getSizeInBits() && + SizeInBits == RHS->getRawSizeInBits() && AlignInBits == RHS->getAlignInBits() && - OffsetInBits == RHS->getOffsetInBits() && Flags == RHS->getFlags() && - Elements == RHS->getRawElements() && + OffsetInBits == RHS->getRawOffsetInBits() && + Flags == RHS->getFlags() && Elements == RHS->getRawElements() && RuntimeLang == RHS->getRuntimeLang() && VTableHolder == RHS->getRawVTableHolder() && TemplateParams == RHS->getRawTemplateParams() && diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 702e0a51357f5..5c01d8595d0f9 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -432,19 +432,11 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); // Some darwins have an optimized __bzero/bzero function. 
- switch (TT.getArch()) { - case Triple::x86: - case Triple::x86_64: + if (TT.isX86()) { if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6)) setLibcallName(RTLIB::BZERO, "__bzero"); - break; - case Triple::aarch64: - case Triple::aarch64_32: + } else if (TT.isAArch64()) setLibcallName(RTLIB::BZERO, "bzero"); - break; - default: - break; - } if (darwinHasSinCosStret(TT)) { setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret"); @@ -457,37 +449,13 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, } } - switch (TT.getOS()) { - case Triple::MacOSX: - if (TT.isMacOSXVersionLT(10, 9)) { - setLibcallName(RTLIB::EXP10_F32, nullptr); - setLibcallName(RTLIB::EXP10_F64, nullptr); - } else { - setLibcallName(RTLIB::EXP10_F32, "__exp10f"); - setLibcallName(RTLIB::EXP10_F64, "__exp10"); - } - break; - case Triple::IOS: - if (TT.isOSVersionLT(7, 0)) { - setLibcallName(RTLIB::EXP10_F32, nullptr); - setLibcallName(RTLIB::EXP10_F64, nullptr); - break; - } - [[fallthrough]]; - case Triple::DriverKit: - case Triple::TvOS: - case Triple::WatchOS: - case Triple::XROS: + if (darwinHasExp10(TT)) { setLibcallName(RTLIB::EXP10_F32, "__exp10f"); setLibcallName(RTLIB::EXP10_F64, "__exp10"); - break; - default: - break; + } else { + setLibcallName(RTLIB::EXP10_F32, nullptr); + setLibcallName(RTLIB::EXP10_F64, nullptr); } - } else if (TT.getOS() == Triple::BridgeOS) { - // TODO: BridgeOS should be included in isOSDarwin. 
- setLibcallName(RTLIB::EXP10_F32, "__exp10f"); - setLibcallName(RTLIB::EXP10_F64, "__exp10"); } if (hasSinCos(TT)) { @@ -665,3 +633,22 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, if (TT.getArch() == Triple::ArchType::msp430) setMSP430Libcalls(*this, TT); } + +bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { + assert(TT.isOSDarwin() && "should be called with darwin triple"); + + switch (TT.getOS()) { + case Triple::MacOSX: + return !TT.isMacOSXVersionLT(10, 9); + case Triple::IOS: + return !TT.isOSVersionLT(7, 0); + case Triple::DriverKit: + case Triple::TvOS: + case Triple::WatchOS: + case Triple::XROS: + case Triple::BridgeOS: + return true; + default: + return false; + } +} diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 71261343b3482..e7bb6d9a3e32d 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1172,6 +1172,10 @@ void Verifier::visitDISubrangeType(const DISubrangeType &N) { CheckDI(!Bias || isa(Bias) || isa(Bias) || isa(Bias), "Bias must be signed constant or DIVariable or DIExpression", &N); + // Subrange types currently only support constant size. + auto *Size = N.getRawSizeInBits(); + CheckDI(!Size || isa(Size), + "SizeInBits must be a constant"); } void Verifier::visitDISubrange(const DISubrange &N) { @@ -1233,6 +1237,10 @@ void Verifier::visitDIBasicType(const DIBasicType &N) { N.getTag() == dwarf::DW_TAG_unspecified_type || N.getTag() == dwarf::DW_TAG_string_type, "invalid tag", &N); + // Basic types currently only support constant size. 
+ auto *Size = N.getRawSizeInBits(); + CheckDI(!Size || isa(Size), + "SizeInBits must be a constant"); } void Verifier::visitDIFixedPointType(const DIFixedPointType &N) { @@ -1313,6 +1321,11 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) { "DWARF address space only applies to pointer or reference types", &N); } + + auto *Size = N.getRawSizeInBits(); + CheckDI(!Size || isa(Size) || isa(Size) || + isa(Size), + "SizeInBits must be a constant or DIVariable or DIExpression"); } /// Detect mutually exclusive flags. @@ -1400,6 +1413,11 @@ void Verifier::visitDICompositeType(const DICompositeType &N) { if (N.getTag() == dwarf::DW_TAG_array_type) { CheckDI(N.getRawBaseType(), "array types must have a base type", &N); } + + auto *Size = N.getRawSizeInBits(); + CheckDI(!Size || isa(Size) || isa(Size) || + isa(Size), + "SizeInBits must be a constant or DIVariable or DIExpression"); } void Verifier::visitDISubroutineType(const DISubroutineType &N) { @@ -5008,6 +5026,9 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) { Check(mdconst::dyn_extract(MDO), "!prof brunch_weights operand is not a const int"); } + } else { + Check(ProfName == "VP", "expected either branch_weights or VP profile name", + MD); } } @@ -5517,7 +5538,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Call.getOperand(Elem.Begin + 1)->getType()->isPointerTy(), "arguments to separate_storage assumptions should be pointers", Call); - return; + continue; } Check(Elem.Tag->getKey() == "ignore" || Attribute::isExistingAttribute(Elem.Tag->getKey()), @@ -5534,7 +5555,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { if (ArgCount == 3) Check(Call.getOperand(Elem.Begin + 2)->getType()->isIntegerTy(), "third argument should be an integer if present", Call); - return; + continue; } Check(ArgCount <= 2, "too many arguments", Call); if (Kind == Attribute::None) diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp index 
8aea08919f469..527ccf3fc36e0 100644 --- a/llvm/lib/MC/MCSchedule.cpp +++ b/llvm/lib/MC/MCSchedule.cpp @@ -37,6 +37,7 @@ const MCSchedModel MCSchedModel::Default = {DefaultIssueWidth, 0, 0, nullptr, + nullptr, nullptr}; int MCSchedModel::computeInstrLatency(const MCSubtargetInfo &STI, diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 5f1fd57802c7b..6cd6b4abdd327 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1453,10 +1453,9 @@ static VersionTuple getMachoBuildVersionSupportedOS(const Triple &Target) { case Triple::WatchOS: return VersionTuple(5); case Triple::DriverKit: - // DriverKit always uses the build version load command. - return VersionTuple(); + case Triple::BridgeOS: case Triple::XROS: - // XROS always uses the build version load command. + // DriverKit/BridgeOS/XROS always use the build version load command. return VersionTuple(); default: break; @@ -1487,6 +1486,8 @@ getMachoBuildVersionPlatformType(const Triple &Target) { case Triple::XROS: return Target.isSimulatorEnvironment() ? 
MachO::PLATFORM_XROS_SIMULATOR : MachO::PLATFORM_XROS; + case Triple::BridgeOS: + return MachO::PLATFORM_BRIDGEOS; default: break; } @@ -1520,6 +1521,7 @@ void MCStreamer::emitVersionForTarget( Version = Target.getDriverKitVersion(); break; case Triple::XROS: + case Triple::BridgeOS: Version = Target.getOSVersion(); break; default: diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp index 2bac99b6309af..cad25a6ddd3f5 100644 --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -75,7 +75,8 @@ static void initializeUsedResources(InstrDesc &ID, WithColor::warning() << "Ignoring invalid write of zero cycles on processor resource " << PR.Name << "\n"; - WithColor::note() << "found in scheduling class " << SCDesc.Name + WithColor::note() << "found in scheduling class " + << SM.getSchedClassName(ID.SchedClassID) << " (write index #" << I << ")\n"; #endif continue; diff --git a/llvm/lib/Option/Arg.cpp b/llvm/lib/Option/Arg.cpp index 2d52b947aaede..3aab7c0768e14 100644 --- a/llvm/lib/Option/Arg.cpp +++ b/llvm/lib/Option/Arg.cpp @@ -6,13 +6,14 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Option/Arg.h" #include "llvm/ADT/SmallString.h" #include "llvm/Config/llvm-config.h" -#include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/InterleavedRange.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -72,13 +73,7 @@ std::string Arg::getAsString(const ArgList &Args) const { ArgStringList ASL; render(Args, ASL); - for (ArgStringList::iterator - it = ASL.begin(), ie = ASL.end(); it != ie; ++it) { - if (it != ASL.begin()) - OS << ' '; - OS << *it; - } - + OS << llvm::interleaved(ASL, " "); return std::string(OS.str()); } @@ -100,11 +95,7 @@ void Arg::render(const ArgList &Args, ArgStringList &Output) const { case Option::RenderCommaJoinedStyle: 
{ SmallString<256> Res; raw_svector_ostream OS(Res); - OS << getSpelling(); - for (unsigned i = 0, e = getNumValues(); i != e; ++i) { - if (i) OS << ','; - OS << getValue(i); - } + OS << getSpelling() << llvm::interleaved(getValues(), ","); Output.push_back(Args.MakeArgString(OS.str())); break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 356c2b71d2018..13835747c91e5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13404,30 +13404,6 @@ static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { return true; } -/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in -/// the first vector operand. -static std::optional isDUPQMask(ArrayRef M, EVT VT) { - assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size"); - unsigned Lane = (unsigned)M[0]; - unsigned Segments = VT.getFixedSizeInBits() / 128; - unsigned SegmentElts = VT.getVectorNumElements() / Segments; - - // Make sure there's no size changes. - if (SegmentElts * Segments != M.size()) - return std::nullopt; - - // Check the first index corresponds to one of the lanes in the first segment. - if (Lane >= SegmentElts) - return std::nullopt; - - // Check that all lanes match the first, adjusted for segment. - for (unsigned I = 0; I < M.size(); ++I) - if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts))) - return std::nullopt; - - return Lane; -} - /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
@@ -30029,8 +30005,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } - if (Subtarget->hasSVE2p1()) { - if (std::optional Lane = isDUPQMask(ShuffleMask, VT)) { + if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) && + Subtarget->isSVEorStreamingSVEAvailable()) { + assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 && + "Unsupported SVE vector size"); + + unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock; + unsigned SegmentElts = VT.getVectorNumElements() / Segments; + if (std::optional Lane = + isDUPQMask(ShuffleMask, Segments, SegmentElts)) { SDValue IID = DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); return convertFromScalableVector( diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index 7b044cf7c238f..e9bc6d947b0d9 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { @@ -6723,6 +6724,32 @@ inline bool isREVMask(ArrayRef M, unsigned EltSize, unsigned NumElts, return true; } +/// isDUPQMask - matches a splat of equivalent lanes within segments of a given +/// number of elements. +inline std::optional isDUPQMask(ArrayRef Mask, unsigned Segments, + unsigned SegmentSize) { + unsigned Lane = unsigned(Mask[0]); + + // Make sure there's no size changes. + if (SegmentSize * Segments != Mask.size()) + return std::nullopt; + + // Check the first index corresponds to one of the lanes in the first segment. + if (Lane >= SegmentSize) + return std::nullopt; + + // Check that all lanes match the first, adjusted for segment. + // Undef/poison lanes (<0) are also accepted. 
+ if (all_of(enumerate(Mask), [&](auto P) { + const unsigned SegmentIndex = P.index() / SegmentSize; + return P.value() < 0 || + unsigned(P.value()) == Lane + SegmentIndex * SegmentSize; + })) + return Lane; + + return std::nullopt; +} + } // namespace llvm #endif diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 15e38e6cb2408..3387dee8aa4c8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5599,6 +5599,23 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, SrcTy = DstTy; } + // Segmented shuffle matching. + if ((ST->hasSVE2p1() || ST->hasSME2p1()) && + ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc && + isa(SrcTy) && !Mask.empty() && + SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( + AArch64::SVEBitsPerBlock)) { + + FixedVectorType *VTy = cast(SrcTy); + unsigned Segments = + VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock; + unsigned SegmentElts = VTy->getNumElements() / Segments; + + // dupq zd.t, zn.t[idx] + if (isDUPQMask(Mask, Segments, SegmentElts)) + return LT.first; + } + // Check for broadcast loads, which are supported by the LD1R instruction. // In terms of code-size, the shuffle vector is free when a load + dup get // folded into a LD1R. That's what we check and return here. For performance diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index bbe83821eca8e..3c8b5712c1f0c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1784,6 +1784,10 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { + // Do not print the numeric target address when symbolizing. 
+ if (SymbolizeOperands) + return; + const MCOperand &Op = MI->getOperand(OpNum); // If the label has already been resolved to an immediate offset (say, when @@ -1813,6 +1817,12 @@ void AArch64InstPrinter::printAdrAdrpLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { + // Do not print the numeric target address when symbolizing. + // However, do print for ADRP, as this is typically used together with an ADD + // or an immediate-offset ldr/str and the label is likely at the wrong point. + if (SymbolizeOperands && MI->getOpcode() != AArch64::ADRP) + return; + const MCOperand &Op = MI->getOperand(OpNum); // If the label has already been resolved to an immediate offset (say, when diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 18a253b4d9f48..1f634d21df51a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2364,7 +2364,7 @@ def HasExportInsts : Predicate<"Subtarget->hasExportInsts()">, AssemblerPredicate<(all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts))>; def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">, - AssemblerPredicate<(all_of FeatureGFX11Insts)>; + AssemblerPredicate<(all_of FeatureGFX11Insts, (not FeatureGFX1250Insts))>; def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index b2ddc6e88966b..6a59a28b1d32c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -171,8 +171,7 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { MI.eraseFromParent(); } -const std::pair -RegBankLegalizeHelper::unpackZExt(Register Reg) { +std::pair RegBankLegalizeHelper::unpackZExt(Register Reg) { auto PackedS32 = 
B.buildBitcast(SgprRB_S32, Reg); auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff); auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask); @@ -180,16 +179,14 @@ RegBankLegalizeHelper::unpackZExt(Register Reg) { return {Lo.getReg(0), Hi.getReg(0)}; } -const std::pair -RegBankLegalizeHelper::unpackSExt(Register Reg) { +std::pair RegBankLegalizeHelper::unpackSExt(Register Reg) { auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16); auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); return {Lo.getReg(0), Hi.getReg(0)}; } -const std::pair -RegBankLegalizeHelper::unpackAExt(Register Reg) { +std::pair RegBankLegalizeHelper::unpackAExt(Register Reg) { auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); auto Lo = PackedS32; auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index 50bd86dc15a1f..08cc7d43bd78e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -111,9 +111,9 @@ class RegBankLegalizeHelper { SmallSet &SgprWaterfallOperandRegs); void lowerVccExtToSel(MachineInstr &MI); - const std::pair unpackZExt(Register Reg); - const std::pair unpackSExt(Register Reg); - const std::pair unpackAExt(Register Reg); + std::pair unpackZExt(Register Reg); + std::pair unpackSExt(Register Reg); + std::pair unpackAExt(Register Reg); void lowerUnpackBitShift(MachineInstr &MI); void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index a7b08794fdf1b..b20760c356263 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4541,6 +4541,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr 
&MI) const { case Intrinsic::amdgcn_cvt_pknorm_u16: case Intrinsic::amdgcn_cvt_pk_i16: case Intrinsic::amdgcn_cvt_pk_u16: + case Intrinsic::amdgcn_cvt_pk_f16_fp8: + case Intrinsic::amdgcn_cvt_pk_f16_bf8: case Intrinsic::amdgcn_fmed3: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: diff --git a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td index 383e3371993d6..d9d7a650dfc21 100644 --- a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td @@ -141,7 +141,7 @@ def : GCNPat < } // End SubtargetPredicate = isGFX11Only -let SubtargetPredicate = isGFX12Plus in { +let SubtargetPredicate = isGFX12PlusNot12_50 in { def DS_DIRECT_LOAD : DSDIR_Pseudo<"ds_direct_load", VDSDIR_getIns<1>.ret, 1>; def DS_PARAM_LOAD : DSDIR_Pseudo<"ds_param_load", VDSDIR_getIns<0>.ret, 0>; @@ -156,7 +156,7 @@ def : GCNPat < (DS_PARAM_LOAD timm:$attr, timm:$attrchan, 0, 1) >; -} // End SubtargetPredicate = isGFX12Only +} // End SubtargetPredicate = isGFX12PlusNot12_50. 
//===----------------------------------------------------------------------===// // GFX11 diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 27b3d6bc9440c..59c72fcbff18a 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -599,6 +599,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, DecW, Address, CS)) break; + if (isGFX1250() && + tryDecodeInst(DecoderTableGFX125096, DecoderTableGFX1250_FAKE1696, MI, + DecW, Address, CS)) + break; + if (isGFX12() && tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, DecW, Address, CS)) @@ -661,9 +666,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (isGFX10() && tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS)) break; - // FIXME: DecoderTableGFX125064 is not defined yet. if (isGFX1250() && - tryDecodeInst(DecoderTableGFX1250_FAKE1664, MI, QW, Address, CS)) + tryDecodeInst(DecoderTableGFX125064, DecoderTableGFX1250_FAKE1664, MI, + QW, Address, CS)) break; if (isGFX12() && @@ -722,10 +727,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Address, CS)) break; - // FIXME: Should use DecoderTableGFX1250_FAKE1632, but it is not generated - // yet. 
if (isGFX1250() && - tryDecodeInst(DecoderTableGFX125032, MI, DW, Address, CS)) + tryDecodeInst(DecoderTableGFX125032, DecoderTableGFX1250_FAKE1632, MI, + DW, Address, CS)) break; if (isGFX12() && diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 927af726a8664..89574fdd0ef3f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -697,9 +697,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return !hasGFX940Insts() && !hasGFX1250Insts(); } - bool hasVINTERPEncoding() const { - return GFX11Insts; - } + bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } // DS_ADD_F64/DS_ADD_RTN_F64 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8827de2b2a537..0cca7a4fe9197 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8736,11 +8736,15 @@ SDValue SITargetLowering::lowerImage(SDValue Op, : False); if (IsGFX10Plus) Ops.push_back(IsA16 ? 
True : False); - if (!Subtarget->hasGFX90AInsts()) { + + if (!Subtarget->hasGFX90AInsts()) Ops.push_back(TFE); // tfe - } else if (TFE->getAsZExtVal()) { - report_fatal_error("TFE is not supported on this GPU"); + else if (TFE->getAsZExtVal()) { + DAG.getContext()->diagnose(DiagnosticInfoUnsupported( + DAG.getMachineFunction().getFunction(), + "TFE is not supported on this GPU", DL.getDebugLoc())); } + if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) Ops.push_back(LWE); // lwe if (!IsGFX10Plus) @@ -8771,9 +8775,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Subtarget->hasGFX90AInsts()) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, NumVDataDwords, NumVAddrDwords); - if (Opcode == -1) - report_fatal_error( - "requested image instruction is not supported on this GPU"); + if (Opcode == -1) { + DAG.getContext()->diagnose(DiagnosticInfoUnsupported( + DAG.getMachineFunction().getFunction(), + "requested image instruction is not supported on this GPU", + DL.getDebugLoc())); + + unsigned Idx = 0; + SmallVector RetValues(OrigResultTypes.size()); + for (EVT VT : OrigResultTypes) { + if (VT == MVT::Other) + RetValues[Idx++] = Op.getOperand(0); // Chain + else + RetValues[Idx++] = DAG.getPOISON(VT); + } + + return DAG.getMergeValues(RetValues, DL); + } } if (Opcode == -1 && Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f43831016952a..fdd5834e3b9a7 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -302,12 +302,8 @@ class WaitcntBrackets { } unsigned getSgprScoresIdx(InstCounterType T) const { - if (T == SmemAccessCounter) - return 0; - if (T == X_CNT) - return 1; - - llvm_unreachable("Invalid SMEM counter"); + assert(isSmemCounter(T) && "Invalid SMEM counter"); + return T == X_CNT ? 
1 : 0; } unsigned getScoreLB(InstCounterType T) const { @@ -325,10 +321,8 @@ class WaitcntBrackets { } unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) { + if (GprNo < NUM_ALL_VGPRS) return VgprScores[T][GprNo]; - } - assert(isSmemCounter(T)); return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; } @@ -866,7 +860,6 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval, VgprUB = std::max(VgprUB, RegNo); VgprScores[CntTy][RegNo] = Score; } else { - assert(isSmemCounter(CntTy)); SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; } @@ -1006,12 +999,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } } } else if (T == X_CNT) { - for (const MachineOperand &Op : Inst.all_uses()) { - RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - setRegScore(RegNo, T, CurrScore); - } - } + for (const MachineOperand &Op : Inst.all_uses()) + setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. 
// @@ -1353,7 +1342,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( MachineInstr *WaitcntInstr = nullptr; MachineInstr *WaitcntVsCntInstr = nullptr; - LLVM_DEBUG(dbgs() << "PreGFX12::applyPreexistingWaitcnt at: " << *It); + LLVM_DEBUG({ + dbgs() << "PreGFX12::applyPreexistingWaitcnt at: "; + if (It == OldWaitcntInstr.getParent()->instr_end()) + dbgs() << "end of block\n"; + else + dbgs() << *It; + }); for (auto &II : make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { @@ -1507,7 +1502,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( MachineInstr *CombinedStoreDsCntInstr = nullptr; MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; - LLVM_DEBUG(dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: " << *It); + LLVM_DEBUG({ + dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: "; + if (It == OldWaitcntInstr.getParent()->instr_end()) + dbgs() << "end of block\n"; + else + dbgs() << *It; + }); for (auto &II : make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 02b912bcfb9e0..d504c8134202d 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -259,6 +259,12 @@ foreach vt = Reg32Types.types in { >; } +let HasOMod = 0, HasClamp = 0 in { + def VOPProfile_CVT_F32_BF16_gfx1250_t16 : VOPProfile_True16 ; + let HasOpSel = 1, EmitDstSel = 0 in + def VOPProfile_CVT_F32_BF16_gfx1250_fake16 : VOPProfile_Fake16 ; +} // End HasOMod = 0, HasClamp = 0 + let isReMaterializable = 1 in { let SchedRW = [WriteDoubleCvt] in { // OMod clears exceptions when set in this instruction @@ -309,8 +315,14 @@ let OtherPredicates = [UseRealTrue16Insts] in let OtherPredicates = [UseFakeTrue16Insts] in defm V_CVT_F32_F16_fake16 : VOP1Inst <"v_cvt_f32_f16_fake16", VOPProfile_Fake16, any_fpextend>; -let SubtargetPredicate = HasBF16ConversionInsts in -defm V_CVT_F32_BF16 : VOP1Inst_t16 
<"v_cvt_f32_bf16", VOP_F32_BF16>; +let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionInsts] in { + defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; +} +let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in { + defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16, + VOPProfile_CVT_F32_BF16_gfx1250_t16, + VOPProfile_CVT_F32_BF16_gfx1250_fake16>; +} let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; @@ -717,6 +729,24 @@ let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] } } +// FIXME-TRUE16: True16 versions of these instructions are untested. +let HasExtSDWA = 0, HasOpSel = 1, EmitDstSel = 0, HasOMod = 0, HasModifiers = 1 in { +def VOPProfile_CVT_PK_F16_F8 : VOPProfile<[v2f16, i16, untyped, untyped]>; +def VOPProfile_CVT_PK_F16_F8_true16 : VOP3_Profile_True16; +def VOPProfile_CVT_PK_F16_F8_fake16 : VOP3_Profile_Fake16; +} + +let SubtargetPredicate = isGFX1250Plus in { + let mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { + defm V_CVT_PK_F16_FP8 : VOP1Inst_t16_with_profiles<"v_cvt_pk_f16_fp8", + VOPProfile_CVT_PK_F16_F8, VOPProfile_CVT_PK_F16_F8_true16, VOPProfile_CVT_PK_F16_F8_fake16, + int_amdgcn_cvt_pk_f16_fp8>; + defm V_CVT_PK_F16_BF8 : VOP1Inst_t16_with_profiles<"v_cvt_pk_f16_bf8", + VOPProfile_CVT_PK_F16_F8, VOPProfile_CVT_PK_F16_F8_true16, VOPProfile_CVT_PK_F16_F8_fake16, + int_amdgcn_cvt_pk_f16_bf8>; + } +} // End SubtargetPredicate = isGFX1250Plus + let SubtargetPredicate = isGFX10Plus in { defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT>; @@ -980,6 +1010,13 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name op, VOP1_Real_e32_with_name, VOP3_Real_with_name; +multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< + bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { + defm opName#"_t16" : + 
VOP1_Real_FULL_with_name; + defm opName#"_fake16": + VOP1_Real_FULL_with_name; +} defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name; defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name; @@ -1042,6 +1079,10 @@ defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064>; defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00a>; defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; +defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; +defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; + //===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 0b64b504466c8..1e47acb5fde4f 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1679,6 +1679,7 @@ class Base_VOP3_DPP8_t16 op, VOP_Pseudo ps, string opName = ps.OpName> let SchedRW = ps.SchedRW; let Uses = ps.Uses; + let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; let True16Predicate = ps.True16Predicate; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index b67161b060638..c106835bdf3a8 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -6186,7 +6186,7 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl &Results, Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), Read.getValue(1))); - Results.push_back(Read.getOperand(0)); + Results.push_back(Read.getValue(2)); // Chain } /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index 0a9b2bb99f7eb..71eb1349314ea 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -308,9 +308,8 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { NeedsTransform = true; } else if (AllocaInst *Alloca = dyn_cast(PtrOperand)) { Type *AllocatedType = Alloca->getAllocatedType(); - // OrigGEPType might just be a pointer lets make sure - // to add the allocated type so we have a size - if (AllocatedType != OrigGEPType) { + // Only transform if the allocated type is an array + if (AllocatedType != OrigGEPType && isa(AllocatedType)) { NewGEPType = AllocatedType; NeedsTransform = true; } diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index cb58f4833631d..c8866bfefdfc5 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -11,6 +11,7 @@ /// Language (DXIL). //===----------------------------------------------------------------------===// +#include "DXILRootSignature.h" #include "DXILShaderFlags.h" #include "DirectX.h" #include "DirectXIRPasses/PointerTypeAnalysis.h" @@ -286,12 +287,21 @@ class DXILPrepareModule : public ModulePass { } // Remove flags not for DXIL. 
cleanModuleFlags(M); + + // dx.rootsignatures will have been parsed from its metadata form as its + // binary form as part of the RootSignatureAnalysisWrapper, so safely + // remove it as it is not recognized in DXIL + if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) + RootSignature->eraseFromParent(); + return true; } DXILPrepareModule() : ModulePass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); @@ -305,6 +315,7 @@ char DXILPrepareModule::ID = 0; INITIALIZE_PASS_BEGIN(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false, false) INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper) INITIALIZE_PASS_END(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false, false) diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index c5d176596d8c6..616640152c8d3 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -319,3 +319,19 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>; let Predicates = [HasBasicD, IsLA64] in { def : PatFpr; } // Predicates = [HasBasicD, IsLA64] + +/// Pseudo-instructions needed for the soft-float ABI with LA32D + +let Predicates = [HasBasicD, IsLA32] in { +// Moves two GPRs to an FPR. +let usesCustomInserter = 1 in +def BuildPairF64Pseudo + : Pseudo<(outs FPR64:$dst), (ins GPR:$src1, GPR:$src2), + [(set FPR64:$dst, (loongarch_build_pair_f64 GPR:$src1, GPR:$src2))]>; + +// Moves an FPR to two GPRs. 
+let usesCustomInserter = 1 in +def SplitPairF64Pseudo + : Pseudo<(outs GPR:$dst1, GPR:$dst2), (ins FPR64:$src), + [(set GPR:$dst1, GPR:$dst2, (loongarch_split_pair_f64 FPR64:$src))]>; +} // Predicates = [HasBasicD, IsLA32] diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 6946ed554a7e5..cab1d83ddac4a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -169,6 +169,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_VOID, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + if (Subtarget.hasBasicD()) + setOperationAction(ISD::BITCAST, MVT::i64, Custom); } setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); @@ -2713,13 +2715,20 @@ SDValue LoongArchTargetLowering::lowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); + EVT VT = Op.getValueType(); SDValue Op0 = Op.getOperand(0); + EVT Op0VT = Op0.getValueType(); - if (Op.getValueType() == MVT::f32 && Op0.getValueType() == MVT::i32 && + if (Op.getValueType() == MVT::f32 && Op0VT == MVT::i32 && Subtarget.is64Bit() && Subtarget.hasBasicF()) { SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); return DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, NewOp0); } + if (VT == MVT::f64 && Op0VT == MVT::i64 && !Subtarget.is64Bit()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitScalar(Op0, DL, MVT::i32, MVT::i32); + return DAG.getNode(LoongArchISD::BUILD_PAIR_F64, DL, MVT::f64, Lo, Hi); + } return Op; } @@ -4006,6 +4015,12 @@ void LoongArchTargetLowering::ReplaceNodeResults( SDValue Dst = DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Src); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Dst)); + } else if (VT == MVT::i64 && SrcVT == MVT::f64 && !Subtarget.is64Bit()) 
{ + SDValue NewReg = DAG.getNode(LoongArchISD::SPLIT_PAIR_F64, DL, + DAG.getVTList(MVT::i32, MVT::i32), Src); + SDValue RetReg = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, + NewReg.getValue(0), NewReg.getValue(1)); + Results.push_back(RetReg); } break; } @@ -5649,6 +5664,37 @@ static SDValue performVMSKLTZCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue +performSPLIT_PAIR_F64Combine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + SDValue Op0 = N->getOperand(0); + SDLoc DL(N); + + // If the input to SplitPairF64 is just BuildPairF64 then the operation is + // redundant. Instead, use BuildPairF64's operands directly. + if (Op0->getOpcode() == LoongArchISD::BUILD_PAIR_F64) + return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1)); + + if (Op0->isUndef()) { + SDValue Lo = DAG.getUNDEF(MVT::i32); + SDValue Hi = DAG.getUNDEF(MVT::i32); + return DCI.CombineTo(N, Lo, Hi); + } + + // It's cheaper to materialise two 32-bit integers than to load a double + // from the constant pool and transfer it to integer registers through the + // stack. 
+ if (ConstantFPSDNode *C = dyn_cast(Op0)) { + APInt V = C->getValueAPF().bitcastToAPInt(); + SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32); + SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32); + return DCI.CombineTo(N, Lo, Hi); + } + + return SDValue(); +} + SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -5676,6 +5722,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, case LoongArchISD::VMSKLTZ: case LoongArchISD::XVMSKLTZ: return performVMSKLTZCombine(N, DAG, DCI, Subtarget); + case LoongArchISD::SPLIT_PAIR_F64: + return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget); } return SDValue(); } @@ -6072,6 +6120,50 @@ emitPseudoVMSKCOND(MachineInstr &MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock * +emitSplitPairF64Pseudo(MachineInstr &MI, MachineBasicBlock *BB, + const LoongArchSubtarget &Subtarget) { + assert(MI.getOpcode() == LoongArch::SplitPairF64Pseudo && + "Unexpected instruction"); + + MachineFunction &MF = *BB->getParent(); + DebugLoc DL = MI.getDebugLoc(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + Register LoReg = MI.getOperand(0).getReg(); + Register HiReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + + BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVFR2GR_S_64), LoReg).addReg(SrcReg); + BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVFRH2GR_S), HiReg) + .addReg(SrcReg, getKillRegState(MI.getOperand(2).isKill())); + MI.eraseFromParent(); // The pseudo instruction is gone now. 
+ return BB; +} + +static MachineBasicBlock * +emitBuildPairF64Pseudo(MachineInstr &MI, MachineBasicBlock *BB, + const LoongArchSubtarget &Subtarget) { + assert(MI.getOpcode() == LoongArch::BuildPairF64Pseudo && + "Unexpected instruction"); + + MachineFunction &MF = *BB->getParent(); + DebugLoc DL = MI.getDebugLoc(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + Register TmpReg = MRI.createVirtualRegister(&LoongArch::FPR64RegClass); + Register DstReg = MI.getOperand(0).getReg(); + Register LoReg = MI.getOperand(1).getReg(); + Register HiReg = MI.getOperand(2).getReg(); + + BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVGR2FR_W_64), TmpReg) + .addReg(LoReg, getKillRegState(MI.getOperand(1).isKill())); + BuildMI(*BB, MI, DL, TII.get(LoongArch::MOVGR2FRH_W), DstReg) + .addReg(TmpReg, RegState::Kill) + .addReg(HiReg, getKillRegState(MI.getOperand(2).isKill())); + MI.eraseFromParent(); // The pseudo instruction is gone now. 
+ return BB; +} + static bool isSelectPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -6252,6 +6344,10 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( } case LoongArch::Select_GPR_Using_CC_GPR: return emitSelectPseudo(MI, BB, Subtarget); + case LoongArch::BuildPairF64Pseudo: + return emitBuildPairF64Pseudo(MI, BB, Subtarget); + case LoongArch::SplitPairF64Pseudo: + return emitSplitPairF64Pseudo(MI, BB, Subtarget); case LoongArch::PseudoVBZ: case LoongArch::PseudoVBZ_B: case LoongArch::PseudoVBZ_H: @@ -6348,6 +6444,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MOVGR2FR_W_LA64) NODE_NAME_CASE(MOVFR2GR_S_LA64) NODE_NAME_CASE(FTINT) + NODE_NAME_CASE(BUILD_PAIR_F64) + NODE_NAME_CASE(SPLIT_PAIR_F64) NODE_NAME_CASE(REVB_2H) NODE_NAME_CASE(REVB_2W) NODE_NAME_CASE(BITREV_4B) @@ -6527,21 +6625,6 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, break; } - // FPR32 and FPR64 alias each other. - if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) - UseGPRForFloat = true; - - if (UseGPRForFloat && ValVT == MVT::f32) { - LocVT = GRLenVT; - LocInfo = CCValAssign::BCvt; - } else if (UseGPRForFloat && GRLen == 64 && ValVT == MVT::f64) { - LocVT = MVT::i64; - LocInfo = CCValAssign::BCvt; - } else if (UseGPRForFloat && GRLen == 32 && ValVT == MVT::f64) { - // TODO: Handle passing f64 on LA32 with D feature. - report_fatal_error("Passing f64 with GPR on LA32 is undefined"); - } - // If this is a variadic argument, the LoongArch calling convention requires // that it is assigned an 'even' or 'aligned' register if it has (2*GRLen)/8 // byte alignment. An aligned register should be used regardless of whether @@ -6564,6 +6647,45 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, assert(PendingLocs.size() == PendingArgFlags.size() && "PendingLocs and PendingArgFlags out of sync"); + // FPR32 and FPR64 alias each other. 
+ if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) + UseGPRForFloat = true; + + if (UseGPRForFloat && ValVT == MVT::f32) { + LocVT = GRLenVT; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForFloat && GRLen == 64 && ValVT == MVT::f64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForFloat && GRLen == 32 && ValVT == MVT::f64) { + // Handle passing f64 on LA32D with a soft float ABI or when floating point + // registers are exhausted. + assert(PendingLocs.empty() && "Can't lower f64 if it is split"); + // Depending on available argument GPRS, f64 may be passed in a pair of + // GPRs, split between a GPR and the stack, or passed completely on the + // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these + // cases. + MCRegister Reg = State.AllocateReg(ArgGPRs); + if (!Reg) { + int64_t StackOffset = State.AllocateStack(8, Align(8)); + State.addLoc( + CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + return false; + } + LocVT = MVT::i32; + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + MCRegister HiReg = State.AllocateReg(ArgGPRs); + if (HiReg) { + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); + } else { + int64_t StackOffset = State.AllocateStack(4, Align(4)); + State.addLoc( + CCValAssign::getCustomMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + } + return false; + } + // Split arguments might be passed indirectly, so keep track of the pending // values. 
if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) { @@ -6764,6 +6886,38 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT); } +static SDValue unpackF64OnLA32DSoftABI(SelectionDAG &DAG, SDValue Chain, + const CCValAssign &VA, + const CCValAssign &HiVA, + const SDLoc &DL) { + assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 && + "Unexpected VA"); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + assert(VA.isRegLoc() && "Expected register VA assignment"); + + Register LoVReg = RegInfo.createVirtualRegister(&LoongArch::GPRRegClass); + RegInfo.addLiveIn(VA.getLocReg(), LoVReg); + SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32); + SDValue Hi; + if (HiVA.isMemLoc()) { + // Second half of f64 is passed on the stack. + int FI = MFI.CreateFixedObject(4, HiVA.getLocMemOffset(), + /*IsImmutable=*/true); + SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN, + MachinePointerInfo::getFixedStack(MF, FI)); + } else { + // Second half of f64 is passed in another GPR. 
+ Register HiVReg = RegInfo.createVirtualRegister(&LoongArch::GPRRegClass); + RegInfo.addLiveIn(HiVA.getLocReg(), HiVReg); + Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32); + } + return DAG.getNode(LoongArchISD::BUILD_PAIR_F64, DL, MVT::f64, Lo, Hi); +} + static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, const CCValAssign &VA, const SDLoc &DL) { EVT LocVT = VA.getLocVT(); @@ -6861,11 +7015,16 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( else analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false, CC_LoongArch); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned i = 0, e = ArgLocs.size(), InsIdx = 0; i != e; ++i, ++InsIdx) { CCValAssign &VA = ArgLocs[i]; SDValue ArgValue; - if (VA.isRegLoc()) - ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, Ins[i], *this); + // Passing f64 on LA32D with a soft float ABI must be handled as a special + // case. + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + assert(VA.needsCustom()); + ArgValue = unpackF64OnLA32DSoftABI(DAG, Chain, VA, ArgLocs[++i], DL); + } else if (VA.isRegLoc()) + ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, Ins[InsIdx], *this); else ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL); if (VA.getLocInfo() == CCValAssign::Indirect) { @@ -6873,17 +7032,18 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( // load all parts of it here (using the same address). 
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo())); - unsigned ArgIndex = Ins[i].OrigArgIndex; - unsigned ArgPartOffset = Ins[i].PartOffset; + unsigned ArgIndex = Ins[InsIdx].OrigArgIndex; + unsigned ArgPartOffset = Ins[InsIdx].PartOffset; assert(ArgPartOffset == 0); - while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) { + while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) { CCValAssign &PartVA = ArgLocs[i + 1]; - unsigned PartOffset = Ins[i + 1].PartOffset - ArgPartOffset; + unsigned PartOffset = Ins[InsIdx + 1].PartOffset - ArgPartOffset; SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue, Offset); InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address, MachinePointerInfo())); ++i; + ++InsIdx; } continue; } @@ -7112,31 +7272,67 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVector> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; - for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned i = 0, j = 0, e = ArgLocs.size(), OutIdx = 0; i != e; + ++i, ++OutIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue ArgValue = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue ArgValue = OutVals[OutIdx]; + ISD::ArgFlagsTy Flags = Outs[OutIdx].Flags; + + // Handle passing f64 on LA32D with a soft float ABI as a special case. + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + assert(VA.isRegLoc() && "Expected register VA assignment"); + assert(VA.needsCustom()); + SDValue SplitF64 = + DAG.getNode(LoongArchISD::SPLIT_PAIR_F64, DL, + DAG.getVTList(MVT::i32, MVT::i32), ArgValue); + SDValue Lo = SplitF64.getValue(0); + SDValue Hi = SplitF64.getValue(1); + + Register RegLo = VA.getLocReg(); + RegsToPass.push_back(std::make_pair(RegLo, Lo)); + + // Get the CCValAssign for the Hi part. 
+ CCValAssign &HiVA = ArgLocs[++i]; + + if (HiVA.isMemLoc()) { + // Second half of f64 is passed on the stack. + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, DL, LoongArch::R3, PtrVT); + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, + DAG.getIntPtrConstant(HiVA.getLocMemOffset(), DL)); + // Emit the store. + MemOpChains.push_back(DAG.getStore( + Chain, DL, Hi, Address, + MachinePointerInfo::getStack(MF, HiVA.getLocMemOffset()))); + } else { + // Second half of f64 is passed in another GPR. + Register RegHigh = HiVA.getLocReg(); + RegsToPass.push_back(std::make_pair(RegHigh, Hi)); + } + continue; + } // Promote the value if needed. // For now, only handle fully promoted and indirect arguments. if (VA.getLocInfo() == CCValAssign::Indirect) { // Store the argument in a stack slot and pass its address. Align StackAlign = - std::max(getPrefTypeAlign(Outs[i].ArgVT, DAG), + std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), getPrefTypeAlign(ArgValue.getValueType(), DAG)); TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); // If the original argument was split and passed by reference, we need to // store the required parts of it here (and pass just one address). - unsigned ArgIndex = Outs[i].OrigArgIndex; - unsigned ArgPartOffset = Outs[i].PartOffset; + unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; assert(ArgPartOffset == 0); // Calculate the total size to store. We don't have access to what we're // actually storing other than performing the loop and collecting the // info. 
SmallVector> Parts; - while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) { - SDValue PartValue = OutVals[i + 1]; - unsigned PartOffset = Outs[i + 1].PartOffset - ArgPartOffset; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); EVT PartVT = PartValue.getValueType(); @@ -7144,6 +7340,7 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); Parts.push_back(std::make_pair(PartValue, Offset)); ++i; + ++OutIdx; } SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); int FI = cast(SpillSlot)->getIndex(); @@ -7279,7 +7476,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, CC_LoongArch); // Copy all of the result registers out of their specified physreg. - for (auto &VA : RVLocs) { + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + auto &VA = RVLocs[i]; // Copy the value out. 
SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue); @@ -7287,7 +7485,16 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = RetValue.getValue(1); Glue = RetValue.getValue(2); - RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + assert(VA.needsCustom()); + SDValue RetValue2 = DAG.getCopyFromReg(Chain, DL, RVLocs[++i].getLocReg(), + MVT::i32, Glue); + Chain = RetValue2.getValue(1); + Glue = RetValue2.getValue(2); + RetValue = DAG.getNode(LoongArchISD::BUILD_PAIR_F64, DL, MVT::f64, + RetValue, RetValue2); + } else + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); InVals.push_back(RetValue); } @@ -7333,17 +7540,37 @@ SDValue LoongArchTargetLowering::LowerReturn( SmallVector RetOps(1, Chain); // Copy the result values into the output registers. - for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) { + for (unsigned i = 0, e = RVLocs.size(), OutIdx = 0; i < e; ++i, ++OutIdx) { + SDValue Val = OutVals[OutIdx]; CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - // Handle a 'normal' return. - SDValue Val = convertValVTToLocVT(DAG, OutVals[i], VA, DL); - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); + if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { + // Handle returning f64 on LA32D with a soft float ABI. 
+ assert(VA.isRegLoc() && "Expected return via registers"); + assert(VA.needsCustom()); + SDValue SplitF64 = DAG.getNode(LoongArchISD::SPLIT_PAIR_F64, DL, + DAG.getVTList(MVT::i32, MVT::i32), Val); + SDValue Lo = SplitF64.getValue(0); + SDValue Hi = SplitF64.getValue(1); + Register RegLo = VA.getLocReg(); + Register RegHi = RVLocs[++i].getLocReg(); + + Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue); + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(RegLo, MVT::i32)); + Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue); + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(RegHi, MVT::i32)); + } else { + // Handle a 'normal' return. + Val = convertValVTToLocVT(DAG, Val, VA, DL); + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); - // Guarantee that all emitted copies are stuck together. - Glue = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + // Guarantee that all emitted copies are stuck together. + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } } RetOps[0] = Chain; // Update chain. 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 79aa89726191b..60dc2b385a75c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -60,6 +60,10 @@ enum NodeType : unsigned { FTINT, + // Build and split F64 pair + BUILD_PAIR_F64, + SPLIT_PAIR_F64, + // Bit counting operations CLZ_W, CTZ_W, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index b6552ed33f5b1..2b94e65cac0e5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -58,6 +58,13 @@ def SDT_LoongArchMovgr2fcsr : SDTypeProfile<0, 2, [SDTCisVT<0, GRLenVT>, def SDT_LoongArchMovfcsr2gr : SDTypeProfile<1, 1, [SDTCisVT<0, GRLenVT>, SDTCisSameAs<0, 1>]>; +def SDT_LoongArchBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, + SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; +def SDT_LoongArchSplitPairF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisVT<2, f64>]>; + // TODO: Add LoongArch specific DAG Nodes // Target-independent nodes, but with target-specific formats. 
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart, @@ -165,6 +172,11 @@ def loongarch_iocsrwr_d : SDNode<"LoongArchISD::IOCSRWR_D", def loongarch_cpucfg : SDNode<"LoongArchISD::CPUCFG", SDTUnaryOp, [SDNPHasChain]>; +def loongarch_build_pair_f64 : SDNode<"LoongArchISD::BUILD_PAIR_F64", + SDT_LoongArchBuildPairF64>; +def loongarch_split_pair_f64 : SDNode<"LoongArchISD::SPLIT_PAIR_F64", + SDT_LoongArchSplitPairF64>; + def to_fclass_mask: SDNodeXFormgetZExtValue(); unsigned Mask = 0; diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index cc79257fb9c86..28f6968ee6caf 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -457,3 +457,25 @@ void NVPTXInstPrinter::printCTAGroup(const MCInst *MI, int OpNum, } llvm_unreachable("Invalid cta_group in printCTAGroup"); } + +void NVPTXInstPrinter::printCallOperand(const MCInst *MI, int OpNum, + raw_ostream &O, StringRef Modifier) { + const MCOperand &MO = MI->getOperand(OpNum); + assert(MO.isImm() && "Invalid operand"); + const auto Imm = MO.getImm(); + + if (Modifier == "RetList") { + assert((Imm == 1 || Imm == 0) && "Invalid return list"); + if (Imm) + O << " (retval0),"; + return; + } + + if (Modifier == "ParamList") { + assert(Imm >= 0 && "Invalid parameter list"); + interleaveComma(llvm::seq(Imm), O, + [&](const auto &I) { O << "param" << I; }); + return; + } + llvm_unreachable("Invalid modifier"); +} diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index f73af7a3f2c6e..6189284e8a58c 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -52,6 +52,8 @@ class NVPTXInstPrinter : public MCInstPrinter { void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O); void printTmaReductionMode(const MCInst *MI, 
int OpNum, raw_ostream &O); void printCTAGroup(const MCInst *MI, int OpNum, raw_ostream &O); + void printCallOperand(const MCInst *MI, int OpNum, raw_ostream &O, + StringRef Modifier = {}); }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ff10eea371049..61fe8a53cb63a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -160,15 +160,9 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::StoreParam: case NVPTXISD::StoreParamV2: case NVPTXISD::StoreParamV4: - case NVPTXISD::StoreParamS32: - case NVPTXISD::StoreParamU32: if (tryStoreParam(N)) return; break; - case ISD::INTRINSIC_WO_CHAIN: - if (tryIntrinsicNoChain(N)) - return; - break; case ISD::INTRINSIC_W_CHAIN: if (tryIntrinsicChain(N)) return; @@ -904,25 +898,6 @@ NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, return {InstructionOrdering, Scope}; } -bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { - unsigned IID = N->getConstantOperandVal(0); - switch (IID) { - default: - return false; - case Intrinsic::nvvm_texsurf_handle_internal: - SelectTexSurfHandle(N); - return true; - } -} - -void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { - // Op 0 is the intrinsic ID - SDValue Wrapper = N->getOperand(1); - SDValue GlobalVal = Wrapper.getOperand(0); - ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), - MVT::i64, GlobalVal)); -} - void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { SDValue Src = N->getOperand(0); AddrSpaceCastSDNode *CastN = cast(N); @@ -1717,8 +1692,6 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode"); - case NVPTXISD::StoreParamU32: - case NVPTXISD::StoreParamS32: case NVPTXISD::StoreParam: NumElts = 1; break; @@ -1796,27 +1769,6 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { } } break; - // Special case: if we have a 
sign-extend/zero-extend node, insert the - // conversion instruction first, and use that as the value operand to - // the selected StoreParam node. - case NVPTXISD::StoreParamU32: { - Opcode = NVPTX::StoreParamI32_r; - SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, - MVT::i32); - SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, - MVT::i32, Ops[0], CvtNone); - Ops[0] = SDValue(Cvt, 0); - break; - } - case NVPTXISD::StoreParamS32: { - Opcode = NVPTX::StoreParamI32_r; - SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, - MVT::i32); - SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, - MVT::i32, Ops[0], CvtNone); - Ops[0] = SDValue(Cvt, 0); - break; - } } SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); @@ -2105,22 +2057,14 @@ static inline bool isAddLike(const SDValue V) { // selectBaseADDR - Match a dag node which will serve as the base address for an // ADDR operand pair. static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) { - // Return true if TGA or ES. 
- if (N.getOpcode() == ISD::TargetGlobalAddress || - N.getOpcode() == ISD::TargetExternalSymbol) - return N; - - if (N.getOpcode() == NVPTXISD::Wrapper) - return N.getOperand(0); - - // addrspacecast(Wrapper(arg_symbol) to addrspace(PARAM)) -> arg_symbol - if (AddrSpaceCastSDNode *CastN = dyn_cast(N)) - if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC && - CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM && - CastN->getOperand(0).getOpcode() == NVPTXISD::Wrapper) - return selectBaseADDR(CastN->getOperand(0).getOperand(0), DAG); - - if (auto *FIN = dyn_cast(N)) + if (const auto *GA = dyn_cast(N)) + return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), + GA->getValueType(0), GA->getOffset(), + GA->getTargetFlags()); + if (const auto *ES = dyn_cast(N)) + return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0), + ES->getTargetFlags()); + if (const auto *FIN = dyn_cast(N)) return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0)); return N; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index ff58e4486a222..92b5c773258ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -69,7 +69,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { #include "NVPTXGenDAGISel.inc" void Select(SDNode *N) override; - bool tryIntrinsicNoChain(SDNode *N); bool tryIntrinsicChain(SDNode *N); bool tryIntrinsicVoid(SDNode *N); void SelectTexSurfHandle(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d2fafe854e9e4..b924a1f5ac93c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -702,9 +702,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Custom); setOperationAction(ISD::BRIND, MVT::Other, Expand); - 
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - // We want to legalize constant related memmove and memcopy // intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -1055,45 +1052,24 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { case NVPTXISD::FIRST_NUMBER: break; - MAKE_CASE(NVPTXISD::CALL) MAKE_CASE(NVPTXISD::RET_GLUE) - MAKE_CASE(NVPTXISD::LOAD_PARAM) - MAKE_CASE(NVPTXISD::Wrapper) MAKE_CASE(NVPTXISD::DeclareParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) MAKE_CASE(NVPTXISD::DeclareRet) - MAKE_CASE(NVPTXISD::DeclareScalarRet) MAKE_CASE(NVPTXISD::DeclareRetParam) - MAKE_CASE(NVPTXISD::PrintCall) - MAKE_CASE(NVPTXISD::PrintConvergentCall) - MAKE_CASE(NVPTXISD::PrintCallUni) - MAKE_CASE(NVPTXISD::PrintConvergentCallUni) + MAKE_CASE(NVPTXISD::CALL) MAKE_CASE(NVPTXISD::LoadParam) MAKE_CASE(NVPTXISD::LoadParamV2) MAKE_CASE(NVPTXISD::LoadParamV4) MAKE_CASE(NVPTXISD::StoreParam) MAKE_CASE(NVPTXISD::StoreParamV2) MAKE_CASE(NVPTXISD::StoreParamV4) - MAKE_CASE(NVPTXISD::StoreParamS32) - MAKE_CASE(NVPTXISD::StoreParamU32) - MAKE_CASE(NVPTXISD::CallArgBegin) - MAKE_CASE(NVPTXISD::CallArg) - MAKE_CASE(NVPTXISD::LastCallArg) - MAKE_CASE(NVPTXISD::CallArgEnd) - MAKE_CASE(NVPTXISD::CallVoid) - MAKE_CASE(NVPTXISD::CallVal) - MAKE_CASE(NVPTXISD::CallSymbol) - MAKE_CASE(NVPTXISD::Prototype) MAKE_CASE(NVPTXISD::MoveParam) MAKE_CASE(NVPTXISD::StoreRetval) MAKE_CASE(NVPTXISD::StoreRetvalV2) MAKE_CASE(NVPTXISD::StoreRetvalV4) - MAKE_CASE(NVPTXISD::PseudoUseParam) MAKE_CASE(NVPTXISD::UNPACK_VECTOR) MAKE_CASE(NVPTXISD::BUILD_VECTOR) - MAKE_CASE(NVPTXISD::RETURN) - MAKE_CASE(NVPTXISD::CallSeqBegin) - MAKE_CASE(NVPTXISD::CallSeqEnd) MAKE_CASE(NVPTXISD::CallPrototype) MAKE_CASE(NVPTXISD::ProxyReg) MAKE_CASE(NVPTXISD::LoadV2) @@ -1115,7 +1091,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::STACKSAVE) 
MAKE_CASE(NVPTXISD::SETP_F16X2) MAKE_CASE(NVPTXISD::SETP_BF16X2) - MAKE_CASE(NVPTXISD::Dummy) MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) MAKE_CASE(NVPTXISD::BrxEnd) @@ -1189,15 +1164,6 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, } } -SDValue -NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - const GlobalAddressSDNode *GAN = cast(Op); - auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); - Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); - return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); -} - std::string NVPTXTargetLowering::getPrototype( const DataLayout &DL, Type *retTy, const ArgListTy &Args, const SmallVectorImpl &Outs, MaybeAlign RetAlign, @@ -1601,9 +1567,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ? promoteScalarArgumentSize(TypeSize * 8) : TypeSize * 8; - Chain = DAG.getNode( - NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(ArgI), GetI32(PromotedSize), GetI32(0), InGlue}); + Chain = + DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, + {Chain, GetI32(ArgI), GetI32(PromotedSize), InGlue}); } InGlue = Chain.getValue(1); @@ -1740,16 +1706,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); if (!shouldPassAsArray(RetTy)) { const unsigned PromotedResultSize = promoteScalarArgumentSize(ResultSize); - SDValue DeclareRetOps[] = {Chain, GetI32(1), GetI32(PromotedResultSize), - GetI32(0), InGlue}; Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, {MVT::Other, MVT::Glue}, - DeclareRetOps); + {Chain, GetI32(PromotedResultSize), InGlue}); InGlue = Chain.getValue(1); } else { - SDValue DeclareRetOps[] = {Chain, GetI32(RetAlign->value()), - GetI32(ResultSize / 8), GetI32(0), InGlue}; - Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, - 
{MVT::Other, MVT::Glue}, DeclareRetOps); + Chain = DAG.getNode( + NVPTXISD::DeclareRetParam, dl, {MVT::Other, MVT::Glue}, + {Chain, GetI32(RetAlign->value()), GetI32(ResultSize / 8), InGlue}); InGlue = Chain.getValue(1); } } @@ -1800,25 +1763,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB, UniqueCallSite); const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); - SDValue ProtoOps[] = { - Chain, - DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), - InGlue, - }; - Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue}, - ProtoOps); + Chain = DAG.getNode( + NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue}, + {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue}); InGlue = Chain.getValue(1); } - // Op to just print "call" - SDValue PrintCallOps[] = {Chain, GetI32(Ins.empty() ? 0 : 1), InGlue}; - // We model convergent calls as separate opcodes. - unsigned Opcode = - IsIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; - if (CLI.IsConvergent) - Opcode = Opcode == NVPTXISD::PrintCallUni ? 
NVPTXISD::PrintConvergentCallUni - : NVPTXISD::PrintConvergentCall; - Chain = DAG.getNode(Opcode, dl, {MVT::Other, MVT::Glue}, PrintCallOps); - InGlue = Chain.getValue(1); if (ConvertToIndirectCall) { // Copy the function ptr to a ptx register and use the register to call the @@ -1832,38 +1781,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT); } - // Ops to print out the function name - SDValue CallVoidOps[] = { Chain, Callee, InGlue }; - Chain = - DAG.getNode(NVPTXISD::CallVoid, dl, {MVT::Other, MVT::Glue}, CallVoidOps); - InGlue = Chain.getValue(1); - - // Ops to print out the param list - SDValue CallArgBeginOps[] = { Chain, InGlue }; - Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, {MVT::Other, MVT::Glue}, - CallArgBeginOps); + const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0; + const unsigned NumArgs = + std::min(CLI.NumFixedArgs + 1, Args.size()); + /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, + /// NumParams, Callee, Proto, InGlue) + Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue}, + {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), + GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, + GetI32(Proto), InGlue}); InGlue = Chain.getValue(1); - const unsigned E = std::min(CLI.NumFixedArgs + 1, Args.size()); - for (const unsigned I : llvm::seq(E)) { - const unsigned Opcode = - I == (E - 1) ? NVPTXISD::LastCallArg : NVPTXISD::CallArg; - SDValue CallArgOps[] = {Chain, GetI32(1), GetI32(I), InGlue}; - Chain = DAG.getNode(Opcode, dl, {MVT::Other, MVT::Glue}, CallArgOps); - InGlue = Chain.getValue(1); - } - SDValue CallArgEndOps[] = {Chain, GetI32(IsIndirectCall ? 
0 : 1), InGlue}; - Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, {MVT::Other, MVT::Glue}, - CallArgEndOps); - InGlue = Chain.getValue(1); - - if (IsIndirectCall) { - SDValue PrototypeOps[] = {Chain, GetI32(UniqueCallSite), InGlue}; - Chain = DAG.getNode(NVPTXISD::Prototype, dl, {MVT::Other, MVT::Glue}, - PrototypeOps); - InGlue = Chain.getValue(1); - } - SmallVector ProxyRegOps; // An item of the vector is filled if the element does not need a ProxyReg // operation on it and should be added to InVals as is. ProxyRegOps and @@ -2919,8 +2847,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return SDValue(); case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return Op; case ISD::INTRINSIC_WO_CHAIN: @@ -3129,8 +3055,7 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); // Store the address of unsized array _vararg[] in the ap object. 
- SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); - SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); + SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), @@ -3370,7 +3295,7 @@ SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { StringRef SavedStr = nvTM->getStrPool().save( getParamName(&DAG.getMachineFunction().getFunction(), idx)); - return DAG.getTargetExternalSymbol(SavedStr.data(), v); + return DAG.getExternalSymbol(SavedStr.data(), v); } SDValue NVPTXTargetLowering::LowerFormalArguments( @@ -3438,7 +3363,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue P; if (isKernelFunction(*F)) { - P = DAG.getNode(NVPTXISD::Wrapper, dl, ByvalIn.VT, ArgSymbol); + P = ArgSymbol; P.getNode()->setIROrder(Arg.getArgNo() + 1); } else { P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 0a54a8fd71f32..5efdd1582214a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -24,32 +24,19 @@ namespace NVPTXISD { enum NodeType : unsigned { // Start the numbering from where ISD NodeType finishes. FIRST_NUMBER = ISD::BUILTIN_OP_END, - Wrapper, - CALL, RET_GLUE, - LOAD_PARAM, DeclareParam, DeclareScalarParam, DeclareRetParam, DeclareRet, - DeclareScalarRet, - PrintCall, - PrintConvergentCall, - PrintCallUni, - PrintConvergentCallUni, - CallArgBegin, - CallArg, - LastCallArg, - CallArgEnd, - CallVoid, - CallVal, - CallSymbol, - Prototype, + + /// This node represents a PTX call instruction. 
It's operands are as follows: + /// + /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, + /// NumParams, Callee, Proto, InGlue) + CALL, + MoveParam, - PseudoUseParam, - RETURN, - CallSeqBegin, - CallSeqEnd, CallPrototype, ProxyReg, FSHL_CLAMP, @@ -83,7 +70,6 @@ enum NodeType : unsigned { CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X, CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y, CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z, - Dummy, FIRST_MEMORY_OPCODE, LoadV2 = FIRST_MEMORY_OPCODE, @@ -100,8 +86,6 @@ enum NodeType : unsigned { StoreParam, StoreParamV2, StoreParamV4, - StoreParamS32, // to sext and store a <32bit value, not used currently - StoreParamU32, // to zext and store a <32bit value, not used currently StoreRetval, StoreRetvalV2, StoreRetvalV4, @@ -120,8 +104,6 @@ class NVPTXTargetLowering : public TargetLowering { const NVPTXSubtarget &STI); SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - const char *getTargetNodeName(unsigned Opcode) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index bf84d1dca4ed5..e218ef17bb09b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -190,22 +190,4 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB, BuildMI(&MBB, DL, get(NVPTX::CBranch)).add(Cond[0]).addMBB(TBB); BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB); return 2; -} - -bool NVPTXInstrInfo::isSchedulingBoundary(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - // Prevent the scheduler from reordering & splitting up MachineInstrs - // which must stick together (in initially set order) to - // comprise a valid PTX function call sequence. 
- switch (MI.getOpcode()) { - case NVPTX::CallUniPrintCallRetInst1: - case NVPTX::CallArgBeginInst: - case NVPTX::CallArgParam: - case NVPTX::LastCallArgParam: - case NVPTX::CallArgEndInst1: - return true; - } - - return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); -} +} \ No newline at end of file diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 95464dbbd176d..4e9dc9d3b4686 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -66,9 +66,6 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo { MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded = nullptr) const override; - bool isSchedulingBoundary(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 5979054764647..1ea6d98a1df8e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1700,17 +1700,6 @@ def Offseti32imm : Operand { let PrintMethod = "printOffseti32imm"; } -def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; -def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; - -// Load a memory address into a u32 or u64 register. -def MOV_ADDR : BasicNVPTXInst<(outs B32:$dst), (ins ADDR_base:$a), - "mov.b32", - [(set i32:$dst, (Wrapper tglobaladdr:$a))]>; -def MOV_ADDR64 : BasicNVPTXInst<(outs B64:$dst), (ins ADDR_base:$a), - "mov.b64", - [(set i64:$dst, (Wrapper tglobaladdr:$a))]>; - // Get pointer to local stack. 
let hasSideEffects = false in { def MOV_DEPOT_ADDR : NVPTXInst<(outs B32:$d), (ins i32imm:$num), @@ -1750,8 +1739,27 @@ def BFMOV16i : MOVi; def FMOV32i : MOVi; def FMOV64i : MOVi; -def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32i texternalsym:$dst)>; -def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64i texternalsym:$dst)>; + +def to_tglobaladdr : SDNodeXFormgetTargetGlobalAddress(N->getGlobal(), SDLoc(N), + N->getValueType(0), N->getOffset(), + N->getTargetFlags()); +}]>; + +def to_texternsym : SDNodeXFormgetTargetExternalSymbol(N->getSymbol(), N->getValueType(0), + N->getTargetFlags()); +}]>; + +def to_tframeindex : SDNodeXFormgetTargetFrameIndex(N->getIndex(), N->getValueType(0)); +}]>; + +def : Pat<(i32 globaladdr:$dst), (IMOV32i (to_tglobaladdr $dst))>; +def : Pat<(i64 globaladdr:$dst), (IMOV64i (to_tglobaladdr $dst))>; + +def : Pat<(i32 externalsym:$dst), (IMOV32i (to_texternsym $dst))>; +def : Pat<(i64 externalsym:$dst), (IMOV64i (to_texternsym $dst))>; //---- Copy Frame Index ---- def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr), @@ -1759,10 +1767,6 @@ def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr), def LEA_ADDRi64 : NVPTXInst<(outs B64:$dst), (ins ADDR:$addr), "add.u64 \t$dst, ${addr:add};", []>; -def to_tframeindex : SDNodeXFormgetTargetFrameIndex(N->getIndex(), N->getValueType(0)); -}]>; - def : Pat<(i32 frameindex:$fi), (LEA_ADDRi (to_tframeindex $fi), 0)>; def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>; @@ -1975,26 +1979,19 @@ defm FSetNUM : FSET_FORMAT; defm FSetNAN : FSET_FORMAT; def SDTDeclareParamProfile : - SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; + SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; def SDTDeclareScalarParamProfile : - SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; + SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; def 
SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; -def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; -def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; -def SDTCallValProfile : SDTypeProfile<1, 0, []>; -def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>; def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>; -def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; def SDTProxyRegProfile : SDTypeProfile<1, 1, []>; def DeclareParam : @@ -2004,10 +2001,12 @@ def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def DeclareRetParam : - SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile, + SDNode<"NVPTXISD::DeclareRetParam", + SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def DeclareRet : - SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, + SDNode<"NVPTXISD::DeclareRet", + SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, @@ -2018,18 
+2017,6 @@ def LoadParamV2 : def LoadParamV4 : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def PrintCall : - SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def PrintConvergentCall : - SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def PrintCallUni : - SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def PrintConvergentCallUni : - SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; @@ -2039,33 +2026,6 @@ def StoreParamV2 : def StoreParamV4 : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamU32 : - SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamS32 : - SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArgBegin : - SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArg : - SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def LastCallArg : - SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArgEnd : - SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallVoid : - SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def Prototype : - 
SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallVal : - SDNode<"NVPTXISD::CallVal", SDTCallValProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; def StoreRetval : @@ -2077,16 +2037,19 @@ def StoreRetvalV2 : def StoreRetvalV4 : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, [SDNPHasChain, SDNPSideEffect]>; -def PseudoUseParam : - SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def RETURNNode : - SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, - [SDNPHasChain, SDNPSideEffect]>; def ProxyReg : SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; + /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, + /// NumParams, Callee, Proto, InGlue) +def SDTCallProfile : SDTypeProfile<0, 6, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>, SDTCisVT<5, i32>]>; +def call : + SDNode<"NVPTXISD::CALL", SDTCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; + let mayLoad = true in { class LoadParamMemInst : NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b), @@ -2107,11 +2070,6 @@ let mayLoad = true in { []>; } -class LoadParamRegInst : - NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), - !strconcat("mov", opstr, " \t$dst, retval$b;"), - [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; - let mayStore = true in { multiclass StoreParamInst { @@ -2174,23 +2132,42 @@ let mayStore = true in { []>; } -let isCall=1 in { - multiclass CALL { - def PrintCallNoRetInst : NVPTXInst<(outs), (ins), - OpcStr # " ", [(OpNode 0)]>; - def PrintCallRetInst1 : NVPTXInst<(outs), (ins), - OpcStr # " (retval0), ", [(OpNode 1)]>; +/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, +/// NumParams, Callee, Proto, InGlue) + 
+def CallOperand : Operand { let PrintMethod = "printCallOperand"; } + +foreach is_convergent = [0, 1] in { + defvar convergent_suffix = !if(is_convergent, "_conv", ""); + + let isCall = 1, isConvergent = is_convergent in { + def CALL # convergent_suffix : + NVPTXInst<(outs), + (ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params, + i32imm:$proto), + "call${rets:RetList} $addr, (${params:ParamList}), prototype_$proto;", []>; + + def CALL_UNI # convergent_suffix : + NVPTXInst<(outs), + (ins ADDR_base:$addr, CallOperand:$rets, CallOperand:$params), + "call.uni${rets:RetList} $addr, (${params:ParamList});", []>; } -} -defm Call : CALL<"call", PrintCall>; -defm CallUni : CALL<"call.uni", PrintCallUni>; + defvar call_inst = !cast("CALL" # convergent_suffix); + def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, globaladdr:$addr, imm:$proto), + (call_inst (to_tglobaladdr $addr), imm:$rets, imm:$params, imm:$proto)>; + def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, i32:$addr, imm:$proto), + (call_inst $addr, imm:$rets, imm:$params, imm:$proto)>; + def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, i64:$addr, imm:$proto), + (call_inst $addr, imm:$rets, imm:$params, imm:$proto)>; -// Convergent call instructions. These are identical to regular calls, except -// they have the isConvergent bit set. 
-let isConvergent=1 in { - defm ConvergentCall : CALL<"call", PrintConvergentCall>; - defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; + defvar call_uni_inst = !cast("CALL_UNI" # convergent_suffix); + def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, globaladdr:$addr, 0), + (call_uni_inst (to_tglobaladdr $addr), imm:$rets, imm:$params)>; + def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, i32:$addr, 0), + (call_uni_inst $addr, imm:$rets, imm:$params)>; + def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, i64:$addr, 0), + (call_uni_inst $addr, imm:$rets, imm:$params)>; } def LoadParamMemI64 : LoadParamMemInst; @@ -2244,69 +2221,30 @@ def StoreRetvalV4I32 : StoreRetvalV4Inst; def StoreRetvalV4I16 : StoreRetvalV4Inst; def StoreRetvalV4I8 : StoreRetvalV4Inst; -def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; -def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; -def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; -def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; - -def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", - [(CallArg 1, imm:$a)]>; -def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", - [(LastCallArg 1, imm:$a)]>; - -def CallVoidInst : NVPTXInst<(outs), (ins ADDR_base:$addr), "$addr, ", - [(CallVoid (Wrapper tglobaladdr:$addr))]>; -def CallVoidInstReg : NVPTXInst<(outs), (ins B32:$addr), "$addr, ", - [(CallVoid i32:$addr)]>; -def CallVoidInstReg64 : NVPTXInst<(outs), (ins B64:$addr), "$addr, ", - [(CallVoid i64:$addr)]>; -def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", - [(Prototype (i32 imm:$val))]>; - def DeclareRetMemInst : - NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), - ".param .align $align .b8 retval$num[$size];", - [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; + NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size), + 
".param .align $align .b8 retval0[$size];", + [(DeclareRetParam imm:$align, imm:$size)]>; def DeclareRetScalarInst : - NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), - ".param .b$size retval$num;", - [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; -def DeclareRetRegInst : - NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), - ".reg .b$size retval$num;", - [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; + NVPTXInst<(outs), (ins i32imm:$size), + ".param .b$size retval0;", + [(DeclareRet imm:$size)]>; def DeclareParamInst : NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), ".param .align $align .b8 param$a[$size];", - [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; + [(DeclareParam imm:$align, imm:$a, imm:$size)]>; def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), ".param .b$size param$a;", - [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; -def DeclareScalarRegInst : - NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".reg .b$size param$a;", - [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; - -class MoveParamSymbolInst : - BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), - "mov.b" # t.Size, - [(set t.Ty:$dst, (MoveParam texternalsym:$src))]>; - -def MOV64_PARAM : MoveParamSymbolInst; -def MOV32_PARAM : MoveParamSymbolInst; - -class PseudoUseParamInst : - NVPTXInst<(outs), (ins regclass:$src), - "// Pseudo use of $src", - [(PseudoUseParam vt:$src)]>; + [(DeclareScalarParam imm:$a, imm:$size)]>; -def PseudoUseParamI64 : PseudoUseParamInst; -def PseudoUseParamI32 : PseudoUseParamInst; -def PseudoUseParamI16 : PseudoUseParamInst; -def PseudoUseParamF64 : PseudoUseParamInst; -def PseudoUseParamF32 : PseudoUseParamInst; +foreach t = [I32RT, I64RT] in { + defvar inst_name = "MOV" # t.Size # "_PARAM"; + def inst_name : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), "mov.b" # t.Size>; + def : Pat<(MoveParam (t.Ty externalsym:$src)), + (!cast(inst_name) 
(t.Ty (to_texternsym $src)))>; +} multiclass ProxyRegInst { def NAME : BasicNVPTXInst<(outs rc:$dst), (ins rc:$src), @@ -2861,21 +2799,6 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>; -def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; -def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def calltarget : Operand; -let isCall=1 in { - def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>; -} - -def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>; -def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>; - -// Pseudo instructions. -class Pseudo pattern> - : NVPTXInst; - def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), "\\{ // callseq $amt1, $amt2", diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 10d7f04d8d937..cc1fd027d8515 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2224,6 +2224,8 @@ def nvvm_move_sym64 : NVPTXInst<(outs B64:$r), (ins ADDR_base:$s), def texsurf_handles : BasicNVPTXInst<(outs B64:$result), (ins ADDR_base:$src), "mov.u64">; +def : Pat<(int_nvvm_texsurf_handle_internal globaladdr:$src), + (texsurf_handles (to_tglobaladdr $src))>; //----------------------------------- // Compiler Error Warn diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index f921032356d65..415164fc9e2cb 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4961,6 +4961,21 @@ bool PPCDAGToDAGISel::tryAsSingleRLWINM(SDNode *N) { // If this is just a masked value where the input is not handled, and // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm if (isRunOfOnes(Imm, MB, ME) && Val.getOpcode() != ISD::ROTL) { + // The result of LBARX/LHARX 
do not need to be cleared as the instructions + // implicitly clear the upper bits. + unsigned AlreadyCleared = 0; + if (Val.getOpcode() == ISD::INTRINSIC_W_CHAIN) { + auto IntrinsicID = Val.getConstantOperandVal(1); + if (IntrinsicID == Intrinsic::ppc_lbarx) + AlreadyCleared = 24; + else if (IntrinsicID == Intrinsic::ppc_lharx) + AlreadyCleared = 16; + if (AlreadyCleared != 0 && AlreadyCleared == MB && ME == 31) { + ReplaceUses(SDValue(N, 0), N->getOperand(0)); + return true; + } + } + SDValue Ops[] = {Val, getI32Imm(0, dl), getI32Imm(MB, dl), getI32Imm(ME, dl)}; CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 386c94a324996..24287a95ecb05 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1617,10 +1617,14 @@ class VX_VT5_EO5_VB5_XO9_o eo, bits<9> xo, string opc, } // Decimal Convert From/to National/Zoned/Signed-QWord -def BCDCFN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , []>; -def BCDCFZ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , []>; -def BCDCTN_rec : VX_VT5_EO5_VB5_XO9_o <5, 385, "bcdctn." , []>; -def BCDCTZ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." , []>; +def BCDCFN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , + [(set v16i8:$VD, (int_ppc_national2packed v16i8:$VB, timm:$PS))]>; +def BCDCFZ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , + [(set v16i8:$VD, (int_ppc_zoned2packed v16i8:$VB, timm:$PS))]>; +def BCDCTN_rec : VX_VT5_EO5_VB5_XO9_o <5, 385, "bcdctn." , + [(set v16i8:$VD, (int_ppc_packed2national v16i8:$VB))]>; +def BCDCTZ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." 
, + [(set v16i8:$VD, (int_ppc_packed2zoned v16i8:$VB, timm:$PS))]>; def BCDCFSQ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<2, 385, "bcdcfsq.", []>; def BCDCTSQ_rec : VX_VT5_EO5_VB5_XO9_o <0, 385, "bcdctsq.", []>; diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 0e59861b8a786..b078b9268c984 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -292,7 +292,7 @@ void RISCVAsmPrinter::emitNTLHint(const MachineInstr *MI) { NontemporalMode += 0b10; MCInst Hint; - if (STI->hasStdExtZca() && STI->enableRVCHintInstrs()) + if (STI->hasStdExtZca()) Hint.setOpcode(RISCV::C_ADD_HINT); else Hint.setOpcode(RISCV::ADD); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 36b3aff51cda9..dc80432fcb738 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -373,13 +373,6 @@ def HasStdExtZhinx : Predicate<"Subtarget->hasStdExtZhinx()">, def NoStdExtZhinx : Predicate<"!Subtarget->hasStdExtZhinx()">; // Compressed Extensions -def FeatureNoRVCHints - : SubtargetFeature<"no-rvc-hints", "EnableRVCHintInstrs", "false", - "Disable RVC Hint Instructions.">; -def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, - AssemblerPredicate<(all_of(not FeatureNoRVCHints)), - "RVC Hint Instructions">; - def FeatureStdExtZca : RISCVExtension<1, 0, "part of the C extension, excluding compressed " diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 712f6154732a2..90376b375e275 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -756,8 +756,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FROUNDEVEN, ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT, ISD::VP_FNEARBYINT, ISD::VP_IS_FPCLASS, ISD::VP_FMINIMUM, ISD::VP_FMAXIMUM, ISD::VP_LRINT, - ISD::VP_LLRINT, 
ISD::EXPERIMENTAL_VP_REVERSE, - ISD::EXPERIMENTAL_VP_SPLICE, ISD::VP_REDUCE_FMINIMUM, + ISD::VP_LLRINT, ISD::VP_REDUCE_FMINIMUM, ISD::VP_REDUCE_FMAXIMUM, ISD::EXPERIMENTAL_VP_SPLAT}; static const unsigned IntegerVecReduceOps[] = { @@ -1112,6 +1111,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction({ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); setOperationAction(FloatingPointVPOps, VT, Custom); @@ -1420,6 +1421,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_REVERSE, ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS}, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); setOperationAction({ISD::VECTOR_INTERLEAVE, ISD::VECTOR_DEINTERLEAVE}, VT, Custom); @@ -13241,6 +13244,8 @@ SDValue RISCVTargetLowering::lowerVPMergeMask(SDValue Op, SDValue RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const { + using namespace SDPatternMatch; + SDLoc DL(Op); SDValue Op1 = Op.getOperand(0); @@ -13285,6 +13290,42 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SplatZeroOp2, DAG.getUNDEF(ContainerVT), EVL2); } + auto getVectorFirstEle = [](SDValue Vec) { + SDValue FirstEle; + if (sd_match(Vec, m_InsertElt(m_Value(), m_Value(FirstEle), m_Zero()))) + return FirstEle; + + if (Vec.getOpcode() == ISD::SPLAT_VECTOR || + Vec.getOpcode() == ISD::BUILD_VECTOR) + return Vec.getOperand(0); + + return SDValue(); + }; + + if (!IsMaskVector && isNullConstant(Offset) && isOneConstant(EVL1)) + if (auto FirstEle = getVectorFirstEle(Op->getOperand(0))) { + MVT EltVT = ContainerVT.getVectorElementType(); + SDValue Result; + if ((EltVT == MVT::f16 && !Subtarget.hasVInstructionsF16()) || + EltVT 
== MVT::bf16) { + EltVT = EltVT.changeTypeToInteger(); + ContainerVT = ContainerVT.changeVectorElementType(EltVT); + Op2 = DAG.getBitcast(ContainerVT, Op2); + FirstEle = + DAG.getAnyExtOrTrunc(DAG.getBitcast(EltVT, FirstEle), DL, XLenVT); + } + Result = DAG.getNode(EltVT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL + : RISCVISD::VSLIDE1UP_VL, + DL, ContainerVT, DAG.getUNDEF(ContainerVT), Op2, + FirstEle, Mask, EVL2); + Result = DAG.getBitcast( + ContainerVT.changeVectorElementType(VT.getVectorElementType()), + Result); + return VT.isFixedLengthVector() + ? convertFromScalableVector(VT, Result, DAG, Subtarget) + : Result; + } + int64_t ImmValue = cast(Offset)->getSExtValue(); SDValue DownOffset, UpOffset; if (ImmValue >= 0) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 5711f0077b12d..898cd85a55297 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1726,7 +1726,7 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (!MI.memoperands_empty()) { MachineMemOperand *MMO = *(MI.memoperands_begin()); if (STI.hasStdExtZihintntl() && MMO->isNonTemporal()) { - if (STI.hasStdExtZca() && STI.enableRVCHintInstrs()) { + if (STI.hasStdExtZca()) { if (isCompressibleInst(MI, STI)) return 4; // c.ntl.all + c.load/c.store return 6; // c.ntl.all + load/store diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 8b1eca45d82d8..8252a9b170eb3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -608,7 +608,7 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther>, // HINT Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZca, HasRVCHints], hasSideEffects = 0, mayLoad = 0, +let Predicates = [HasStdExtZca], hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { def 
C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm), @@ -691,24 +691,24 @@ def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), let Inst{12} = 0; } -} // Predicates = [HasStdExtZca, HasRVCHints], hasSideEffects = 0, mayLoad = 0, +} // Predicates = [HasStdExtZca], hasSideEffects = 0, mayLoad = 0, // mayStore = 0 //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZca, HasRVCHints] in { +let Predicates = [HasStdExtZca] in { // Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6. def : InstAlias<"c.addi x0, $imm", (C_NOP_HINT simm6nonzero:$imm), 0>; } -let Predicates = [HasStdExtC, HasRVCHints, HasStdExtZihintntl] in { +let Predicates = [HasStdExtC, HasStdExtZihintntl] in { def : InstAlias<"c.ntl.p1", (C_ADD_HINT X0, X2)>; def : InstAlias<"c.ntl.pall", (C_ADD_HINT X0, X3)>; def : InstAlias<"c.ntl.s1", (C_ADD_HINT X0, X4)>; def : InstAlias<"c.ntl.all", (C_ADD_HINT X0, X5)>; -} // Predicates = [HasStdExtC, HasRVCHints, HasStdExtZihintntl] +} // Predicates = [HasStdExtC, HasStdExtZihintntl] let EmitPriority = 0 in { let Predicates = [HasStdExtZca] in { diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 78a176fcf18d9..6600a00d4e098 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -1201,7 +1201,7 @@ multiclass SiFive7ReadAdvance { def : ReadAdvance; def : ReadAdvance; - // 12. Vector Integer Arithmetic Instructions + // 11. Vector Integer Arithmetic Instructions defm : LMULReadAdvance<"ReadVIALUV", 0>; defm : LMULReadAdvance<"ReadVIALUX", 0>; defm : LMULReadAdvanceW<"ReadVIWALUV", 0>; @@ -1232,7 +1232,7 @@ multiclass SiFive7ReadAdvance { defm : LMULReadAdvance<"ReadVIMovV", 0>; defm : LMULReadAdvance<"ReadVIMovX", 0>; - // 13. 
Vector Fixed-Point Arithmetic Instructions + // 12. Vector Fixed-Point Arithmetic Instructions defm : LMULReadAdvance<"ReadVSALUV", 0>; defm : LMULReadAdvance<"ReadVSALUX", 0>; defm : LMULReadAdvance<"ReadVAALUV", 0>; @@ -1244,7 +1244,7 @@ multiclass SiFive7ReadAdvance { defm : LMULReadAdvanceW<"ReadVNClipV", 0>; defm : LMULReadAdvanceW<"ReadVNClipX", 0>; - // 14. Vector Floating-Point Instructions + // 13. Vector Floating-Point Instructions defm : LMULSEWReadAdvanceF<"ReadVFALUV", 0>; defm : LMULSEWReadAdvanceF<"ReadVFALUF", 0>; defm : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>; @@ -1280,7 +1280,7 @@ multiclass SiFive7ReadAdvance { defm : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>; defm : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>; - // 15. Vector Reduction Operations + // 14. Vector Reduction Operations def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; @@ -1294,14 +1294,14 @@ multiclass SiFive7ReadAdvance { def : ReadAdvance; def : ReadAdvance; - // 16. Vector Mask Instructions + // 15. Vector Mask Instructions defm : LMULReadAdvance<"ReadVMALUV", 0>; defm : LMULReadAdvance<"ReadVMPopV", 0>; defm : LMULReadAdvance<"ReadVMFFSV", 0>; defm : LMULReadAdvance<"ReadVMSFSV", 0>; defm : LMULReadAdvance<"ReadVIotaV", 0>; - // 17. Vector Permutation Instructions + // 16. 
Vector Permutation Instructions def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 3cd923c0ba058..ec77154d17caa 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3239,52 +3239,79 @@ static SDValue performBitcastCombine(SDNode *N, return SDValue(); } -static SDValue performSETCCCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - auto &DAG = DCI.DAG; - +template +static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - ISD::CondCode Cond = cast(N->getOperand(2))->get(); + SDValue Cond = N->getOperand(2); + if (MatchCond != cast(Cond)->get()) + return SDValue(); + + if (MatchRHS != cast(RHS)->getSExtValue()) + return SDValue(); + SDLoc DL(N); + SDValue Ret = DAG.getZExtOrTrunc( + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + {DAG.getConstant(Intrin, DL, MVT::i32), + DAG.getSExtOrTrunc(LHS->getOperand(0), DL, VecVT)}), + DL, MVT::i1); + if (RequiresNegate) + Ret = DAG.getNOT(DL, Ret, MVT::i1); + return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0)); +} + +static SDValue performSETCCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + EVT VT = N->getValueType(0); + if (!VT.isScalarInteger()) + return SDValue(); + SDValue LHS = N->getOperand(0); + if (LHS->getOpcode() != ISD::BITCAST) + return SDValue(); + + EVT FromVT = LHS->getOperand(0).getValueType(); + if (!FromVT.isFixedLengthVector() || FromVT.getVectorElementType() != MVT::i1) + return SDValue(); + + unsigned NumElts = FromVT.getVectorNumElements(); + if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) + return SDValue(); + + if (!cast(N->getOperand(1))) + return SDValue(); + + EVT VecVT = 
FromVT.changeVectorElementType(MVT::getIntegerVT(128 / NumElts)); + auto &DAG = DCI.DAG; // setcc (iN (bitcast (vNi1 X))), 0, ne // ==> any_true (vNi1 X) + if (auto Match = TryMatchTrue<0, ISD::SETNE, false, Intrinsic::wasm_anytrue>( + N, VecVT, DAG)) { + return Match; + } // setcc (iN (bitcast (vNi1 X))), 0, eq // ==> xor (any_true (vNi1 X)), -1 + if (auto Match = TryMatchTrue<0, ISD::SETEQ, true, Intrinsic::wasm_anytrue>( + N, VecVT, DAG)) { + return Match; + } // setcc (iN (bitcast (vNi1 X))), -1, eq // ==> all_true (vNi1 X) + if (auto Match = TryMatchTrue<-1, ISD::SETEQ, false, Intrinsic::wasm_alltrue>( + N, VecVT, DAG)) { + return Match; + } // setcc (iN (bitcast (vNi1 X))), -1, ne // ==> xor (all_true (vNi1 X)), -1 - if (DCI.isBeforeLegalize() && VT.isScalarInteger() && - (Cond == ISD::SETEQ || Cond == ISD::SETNE) && - (isNullConstant(RHS) || isAllOnesConstant(RHS)) && - LHS->getOpcode() == ISD::BITCAST) { - EVT FromVT = LHS->getOperand(0).getValueType(); - if (FromVT.isFixedLengthVector() && - FromVT.getVectorElementType() == MVT::i1) { - int Intrin = isNullConstant(RHS) ? 
Intrinsic::wasm_anytrue - : Intrinsic::wasm_alltrue; - unsigned NumElts = FromVT.getVectorNumElements(); - if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) - return SDValue(); - EVT Width = MVT::getIntegerVT(128 / NumElts); - SDValue Ret = DAG.getZExtOrTrunc( - DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, - {DAG.getConstant(Intrin, DL, MVT::i32), - DAG.getSExtOrTrunc(LHS->getOperand(0), DL, - FromVT.changeVectorElementType(Width))}), - DL, MVT::i1); - if ((isNullConstant(RHS) && (Cond == ISD::SETEQ)) || - (isAllOnesConstant(RHS) && (Cond == ISD::SETNE))) { - Ret = DAG.getNOT(DL, Ret, MVT::i1); - } - return DAG.getZExtOrTrunc(Ret, DL, VT); - } + if (auto Match = TryMatchTrue<-1, ISD::SETNE, true, Intrinsic::wasm_alltrue>( + N, VecVT, DAG)) { + return Match; } - return SDValue(); } diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 3090ad313b90d..9b0dd0562cde3 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -95,6 +95,8 @@ class X86InstructionSelector : public InstructionSelector { MachineFunction &MF) const; bool selectFCmp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectFAbs(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; bool selectUAddSub(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI) const; @@ -391,6 +393,8 @@ bool X86InstructionSelector::select(MachineInstr &I) { switch (I.getOpcode()) { default: return false; + case TargetOpcode::G_FABS: + return selectFAbs(I, MRI, MF); case TargetOpcode::G_STORE: case TargetOpcode::G_LOAD: return selectLoadStoreOp(I, MRI, MF); @@ -1050,6 +1054,35 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, I.eraseFromParent(); return true; } +bool X86InstructionSelector::selectFAbs(MachineInstr &I, + 
MachineRegisterInfo &MRI, + MachineFunction &MF) const { + assert((I.getOpcode() == TargetOpcode::G_FABS) && "unexpected instruction"); + Register SrcReg = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(SrcReg); + unsigned OpAbs; + const TargetRegisterClass *DstRC; + switch (Ty.getSizeInBits()) { + default: + return false; + case 32: + OpAbs = X86::ABS_Fp32; + DstRC = &X86::FR32RegClass; + break; + case 64: + OpAbs = X86::ABS_Fp64; + DstRC = &X86::FR64RegClass; + break; + } + MRI.setRegClass(DstReg, DstRC); + MachineInstr &FAbsInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpAbs), DstReg) + .addReg(SrcReg); + constrainSelectedInstRegOperands(FAbsInst, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} bool X86InstructionSelector::selectFCmp(MachineInstr &I, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index f21a7c81459f7..0250ec66c0b99 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -97,10 +97,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .widenScalarToNextPow2(0, /*Min=*/8) .clampScalar(0, s8, sMaxScalar); - getActionDefinitionsBuilder({G_LROUND, G_LLROUND, G_FCOS, G_FCOSH, G_FACOS, - G_FSIN, G_FSINH, G_FASIN, G_FTAN, G_FTANH, - G_FATAN, G_FATAN2, G_FPOW, G_FEXP, G_FEXP2, - G_FEXP10, G_FLOG, G_FLOG2, G_FLOG10, G_FPOWI}) + getActionDefinitionsBuilder( + {G_LROUND, G_LLROUND, G_FCOS, G_FCOSH, G_FACOS, G_FSIN, G_FSINH, + G_FASIN, G_FTAN, G_FTANH, G_FATAN, G_FATAN2, G_FPOW, G_FEXP, + G_FEXP2, G_FEXP10, G_FLOG, G_FLOG2, G_FLOG10, G_FPOWI, G_FSINCOS}) .libcall(); getActionDefinitionsBuilder(G_FSQRT) @@ -418,6 +418,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .legalFor(HasAVX512, {v16s32, v8s64}) .legalFor(UseX87, {s80}); + getActionDefinitionsBuilder(G_FABS) + .legalFor(UseX87 && !HasSSE2 && !HasSSE1, {s64, 
s80}) + .lower(); + // fp comparison getActionDefinitionsBuilder(G_FCMP) .legalFor(HasSSE1 || UseX87, {s8, s32}) diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index bcda188d4c2cb..772e48efb8607 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1574,7 +1574,7 @@ def ProcessorFeatures { FeatureVPCLMULQDQ]; list ZN3AdditionalTuning = [TuningMacroFusion]; list ZN3Tuning = - !listconcat(ZN2Tuning, ZN3AdditionalTuning); + !listremove(!listconcat(ZN2Tuning, ZN3AdditionalTuning), [TuningSlowSHLD]); list ZN3Features = !listconcat(ZN2Features, ZN3AdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 6a05a1700f0cb..e73bec2e22a57 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5441,7 +5441,8 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, else if (LT.first * Ty.getVectorNumElements() > NumElem) { auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), - Ty.getVectorNumElements()); + (unsigned)LT.first.getValue() * + Ty.getVectorNumElements()); // Expanding requires fill mask with zeroes Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {}, CostKind, 0, MaskTy); diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 5718ae385bac1..42ed914f6dc73 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1018,10 +1018,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { /// /// This stores the string representation and parses the various pieces into /// enum members. 
-Triple::Triple(const Twine &Str) - : Data(Str.str()), Arch(UnknownArch), SubArch(NoSubArch), - Vendor(UnknownVendor), OS(UnknownOS), Environment(UnknownEnvironment), - ObjectFormat(UnknownObjectFormat) { +Triple::Triple(const Twine &Str) : Data(Str.str()) { // Do minimal parsing by hand here. SmallVector Components; StringRef(Data).split(Components, '-', /*MaxSplit*/ 3); @@ -1636,14 +1633,7 @@ void Triple::setObjectFormat(ObjectFormatType Kind) { } void Triple::setArchName(StringRef Str) { - // Work around a miscompilation bug for Twines in gcc 4.0.3. - SmallString<64> Triple; - Triple += Str; - Triple += "-"; - Triple += getVendorName(); - Triple += "-"; - Triple += getOSAndEnvironmentName(); - setTriple(Triple); + setTriple(Str + "-" + getVendorName() + "-" + getOSAndEnvironmentName()); } void Triple::setVendorName(StringRef Str) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 4a06e0fa619c0..7f5a2a982982d 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -250,10 +250,10 @@ CleanupPointerRootUsers(GlobalVariable *GV, } } - for (int i = 0, e = Dead.size(); i != e; ++i) { - if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) { - Dead[i].second->eraseFromParent(); - Instruction *I = Dead[i].first; + for (const auto &[Inst, Store] : Dead) { + if (IsSafeComputationToRemove(Inst, GetTLI)) { + Store->eraseFromParent(); + Instruction *I = Inst; do { if (isAllocationFn(I, GetTLI)) break; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 99acb02561d53..40578e5edc3ab 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3494,13 +3494,13 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { auto Removable = isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef); if (Removable) { - for 
(unsigned i = 0, e = Users.size(); i != e; ++i) { + for (WeakTrackingVH &User : Users) { // Lowering all @llvm.objectsize and MTI calls first because they may use // a bitcast/GEP of the alloca we are removing. - if (!Users[i]) - continue; + if (!User) + continue; - Instruction *I = cast(&*Users[i]); + Instruction *I = cast(&*User); if (IntrinsicInst *II = dyn_cast(I)) { if (II->getIntrinsicID() == Intrinsic::objectsize) { @@ -3511,7 +3511,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { Worklist.add(Inserted); replaceInstUsesWith(*I, Result); eraseInstFromFunction(*I); - Users[i] = nullptr; // Skip examining in the next loop. + User = nullptr; // Skip examining in the next loop. continue; } if (auto *MTI = dyn_cast(I)) { @@ -3527,11 +3527,11 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { } } } - for (unsigned i = 0, e = Users.size(); i != e; ++i) { - if (!Users[i]) + for (WeakTrackingVH &User : Users) { + if (!User) continue; - Instruction *I = cast(&*Users[i]); + Instruction *I = cast(&*User); if (ICmpInst *C = dyn_cast(I)) { replaceInstUsesWith(*C, diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index a6f9992383cd3..1a76898bd61c6 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3294,22 +3294,51 @@ struct MemorySanitizerVisitor : public InstVisitor { setOrigin(&I, getOrigin(Op)); } - void handleCountZeroes(IntrinsicInst &I) { + // Uninitialized bits are ok if they appear after the leading/trailing 0's + // and a 1. If the input is all zero, it is fully initialized iff + // !is_zero_poison. + // + // e.g., for ctlz, with little-endian, if 0/1 are initialized bits with + // concrete value 0/1, and ? is an uninitialized bit: + // - 0001 0??? is fully initialized + // - 000? ???? is fully uninitialized (*) + // - ???? ???? 
is fully uninitialized + // - 0000 0000 is fully uninitialized if is_zero_poison, + // fully initialized otherwise + // + // (*) TODO: arguably, since the number of zeros is in the range [3, 8], we + // only need to poison 4 bits. + // + // OutputShadow = + // ((ConcreteZerosCount >= ShadowZerosCount) && !AllZeroShadow) + // || (is_zero_poison && AllZeroSrc) + void handleCountLeadingTrailingZeros(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Src = I.getArgOperand(0); + Value *SrcShadow = getShadow(Src); - // Set the Output shadow based on input Shadow - Value *BoolShadow = IRB.CreateIsNotNull(getShadow(Src), "_mscz_bs"); + Value *False = IRB.getInt1(false); + Value *ConcreteZerosCount = IRB.CreateIntrinsic( + I.getType(), I.getIntrinsicID(), {Src, /*is_zero_poison=*/False}); + Value *ShadowZerosCount = IRB.CreateIntrinsic( + I.getType(), I.getIntrinsicID(), {SrcShadow, /*is_zero_poison=*/False}); + + Value *CompareConcreteZeros = IRB.CreateICmpUGE( + ConcreteZerosCount, ShadowZerosCount, "_mscz_cmp_zeros"); + + Value *NotAllZeroShadow = + IRB.CreateIsNotNull(SrcShadow, "_mscz_shadow_not_null"); + Value *OutputShadow = + IRB.CreateAnd(CompareConcreteZeros, NotAllZeroShadow, "_mscz_main"); // If zero poison is requested, mix in with the shadow Constant *IsZeroPoison = cast(I.getOperand(1)); if (!IsZeroPoison->isZeroValue()) { Value *BoolZeroPoison = IRB.CreateIsNull(Src, "_mscz_bzp"); - BoolShadow = IRB.CreateOr(BoolShadow, BoolZeroPoison, "_mscz_bs"); + OutputShadow = IRB.CreateOr(OutputShadow, BoolZeroPoison, "_mscz_bs"); } - Value *OutputShadow = - IRB.CreateSExt(BoolShadow, getShadowTy(Src), "_mscz_os"); + OutputShadow = IRB.CreateSExt(OutputShadow, getShadowTy(Src), "_mscz_os"); setShadow(&I, OutputShadow); setOriginForNaryOp(I); @@ -4726,7 +4755,7 @@ struct MemorySanitizerVisitor : public InstVisitor { break; case Intrinsic::ctlz: case Intrinsic::cttz: - handleCountZeroes(I); + handleCountLeadingTrailingZeros(I); break; case Intrinsic::masked_compressstore: 
handleMaskedCompressStore(I); diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 1e16677e5a56f..9e3b4b82cc454 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -235,8 +235,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, // matrix by exchanging the two columns. static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx) { - for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I) - std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]); + for (auto &Row : DepMatrix) + std::swap(Row[ToIndx], Row[FromIndx]); } // Check if a direction vector is lexicographically positive. Return true if it diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index 778a6a012556b..343da5b2e4704 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -83,10 +83,10 @@ static void PrintOps(Instruction *I, const SmallVectorImpl &Ops) { Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " << *Ops[0].Op->getType() << '\t'; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + for (const ValueEntry &Op : Ops) { dbgs() << "[ "; - Ops[i].Op->printAsOperand(dbgs(), false, M); - dbgs() << ", #" << Ops[i].Rank << "] "; + Op.Op->printAsOperand(dbgs(), false, M); + dbgs() << ", #" << Op.Rank << "] "; } } #endif @@ -1585,9 +1585,9 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, // where they are actually the same multiply. 
unsigned MaxOcc = 0; Value *MaxOccVal = nullptr; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + for (const ValueEntry &Op : Ops) { BinaryOperator *BOp = - isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul); + isReassociableOp(Op.Op, Instruction::Mul, Instruction::FMul); if (!BOp) continue; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 9883974c55e3b..242cf6d811b66 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1528,12 +1528,10 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC, fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall, inputs, NewValues); - LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) { - newFunction->dump(); - report_fatal_error("verification of newFunction failed!"); - }); - LLVM_DEBUG(if (verifyFunction(*oldFunction)) - report_fatal_error("verification of oldFunction failed!")); + LLVM_DEBUG(llvm::dbgs() << "After extractCodeRegion - newFunction:\n"); + LLVM_DEBUG(newFunction->dump()); + LLVM_DEBUG(llvm::dbgs() << "After extractCodeRegion - oldFunction:\n"); + LLVM_DEBUG(oldFunction->dump()); LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, *newFunction, AC)) report_fatal_error("Stale Asumption cache for old Function!")); return newFunction; @@ -1833,6 +1831,9 @@ CallInst *CodeExtractor::emitReplacerCall( // This takes place of the original loop BasicBlock *codeReplacer = BasicBlock::Create(Context, "codeRepl", oldFunction, ReplIP); + if (AllocationBlock) + assert(AllocationBlock->getParent() == oldFunction && + "AllocationBlock is not in the same function"); BasicBlock *AllocaBlock = AllocationBlock ? 
AllocationBlock : &oldFunction->getEntryBlock(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a3f39f5ad7a29..fe20d48f780e2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -465,7 +465,7 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { "Predecessor basic-block not found building successor."); BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; auto *PredBBTerminator = PredBB->getTerminator(); - LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); + LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n'); auto *TermBr = dyn_cast(PredBBTerminator); if (isa(PredBBTerminator)) { @@ -579,8 +579,8 @@ VPBasicBlock *VPBasicBlock::clone() { } void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) { - LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() - << " in BB:" << BB->getName() << '\n'); + LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB: " << getName() + << " in BB: " << BB->getName() << '\n'); State->CFG.PrevVPBB = this; @@ -589,7 +589,7 @@ void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) { Recipe.execute(*State); } - LLVM_DEBUG(dbgs() << "LV: filled BB:" << *BB); + LLVM_DEBUG(dbgs() << "LV: filled BB: " << *BB); } VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 3e459f5ea4ce5..a4bfdcabaa314 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3082,10 +3082,10 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { auto *SSV = cast(SVOp0); SVOp0 = SSV->getOperand(0); SVOp1 = SSV->getOperand(1); - for (unsigned I = 0, E = Mask.size(); I != E; I++) { - if (Mask[I] >= static_cast(SSV->getShuffleMask().size())) + for (int &Elem : Mask) { + if (Elem >= 
static_cast(SSV->getShuffleMask().size())) return false; - Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]); + Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem); } } if (SVOp0 == Op1 && SVOp1 == Op0) { diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 9f86650ec58d1..b20cdb8d68ec3 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -510,6 +510,13 @@ if(build_runtimes) endif() # TODO: We need to consider passing it as '-DRUNTIMES_x86_64_LLVM_ENABLE_RUNTIMES'. + if("libclc" IN_LIST LLVM_ENABLE_RUNTIMES) + foreach(dep clang llvm-as llvm-link opt) + if(TARGET ${dep}) + list(APPEND extra_deps ${dep}) + endif() + endforeach() + endif() if("openmp" IN_LIST LLVM_ENABLE_RUNTIMES OR "offload" IN_LIST LLVM_ENABLE_RUNTIMES) if (${LLVM_TOOL_FLANG_BUILD}) message(STATUS "Configuring build of omp_lib.mod and omp_lib_kinds.mod via flang") diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll new file mode 100644 index 0000000000000..790f49f1d3b82 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s + +;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx]) +define void @dup_within_each_segment_256b() #0 { +; CHECK-LABEL: 'dup_within_each_segment_256b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + ret void +} + +define void @dup_within_each_segment_512b() #1 { +; CHECK-LABEL: 'dup_within_each_segment_512b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> + %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> + %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + ret void +} + +attributes #0 = { noinline vscale_range(2,2) } +attributes #1 = { noinline vscale_range(4,4) } diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index 4d0603722c3ae..0779c75c345e3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -13337,6 +13337,57 @@ define <16 x i8> @test_v16i8_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <16 ret <16 x i8> %tmp2 } +define <16 x i8> @test_v16i8_post_reg_ld1lane_zero(ptr %bar, ptr %ptr, i64 %inc) { +; CHECK-SD-LABEL: test_v16i8_post_reg_ld1lane_zero: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: movi.2d v0, #0000000000000000 +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x0], x2 +; CHECK-SD-NEXT: str x0, [x1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v16i8_post_reg_ld1lane_zero: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: mov w8, #0 ; =0x0 +; CHECK-GI-NEXT: mov.b v0[1], w8 +; CHECK-GI-NEXT: mov.b v0[2], w8 +; CHECK-GI-NEXT: mov.b v0[3], w8 +; CHECK-GI-NEXT: mov.b v0[4], w8 +; CHECK-GI-NEXT: mov.b v0[5], w8 +; CHECK-GI-NEXT: mov.b v0[6], w8 +; CHECK-GI-NEXT: mov.b v0[7], w8 +; 
CHECK-GI-NEXT: mov.b v0[8], w8 +; CHECK-GI-NEXT: mov.b v0[9], w8 +; CHECK-GI-NEXT: mov.b v0[10], w8 +; CHECK-GI-NEXT: mov.b v0[11], w8 +; CHECK-GI-NEXT: mov.b v0[12], w8 +; CHECK-GI-NEXT: mov.b v0[13], w8 +; CHECK-GI-NEXT: mov.b v0[14], w8 +; CHECK-GI-NEXT: mov.b v0[15], w8 +; CHECK-GI-NEXT: add x8, x0, x2 +; CHECK-GI-NEXT: str x8, [x1] +; CHECK-GI-NEXT: ret + %tmp1 = load i8, ptr %bar + %tmp2 = insertelement <16 x i8> zeroinitializer, i8 %tmp1, i32 0 + %tmp3 = getelementptr i8, ptr %bar, i64 %inc + store ptr %tmp3, ptr %ptr + ret <16 x i8> %tmp2 +} + +define <16 x i8> @test_v16i8_post_reg_ld1lane_undef(ptr %bar, ptr %ptr, i64 %inc) { +; CHECK-LABEL: test_v16i8_post_reg_ld1lane_undef: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: add x8, x0, x2 +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: ret + %tmp1 = load i8, ptr %bar + %tmp2 = insertelement <16 x i8> poison, i8 %tmp1, i32 0 + %tmp3 = getelementptr i8, ptr %bar, i64 %inc + store ptr %tmp3, ptr %ptr + ret <16 x i8> %tmp2 +} + define <8 x i8> @test_v8i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i8> %A) { ; CHECK-SD-LABEL: test_v8i8_post_imm_ld1lane: ; CHECK-SD: ; %bb.0: @@ -14078,3 +14129,69 @@ define i32 @load_single_extract_variable_index_masked2_i32(ptr %A, i32 %idx) { %e = extractelement <4 x i32> %lv, i32 %idx.x ret i32 %e } + +define void @chained_insert_zero(ptr noundef %fenc, ptr noundef %pred, ptr noundef %residual, i32 noundef %stride) { +; CHECK-SD-LABEL: chained_insert_zero: +; CHECK-SD: ; %bb.0: ; %entry +; CHECK-SD-NEXT: movi.2d v0, #0000000000000000 +; CHECK-SD-NEXT: movi.2d v1, #0000000000000000 +; CHECK-SD-NEXT: ; kill: def $w3 killed $w3 def $x3 +; CHECK-SD-NEXT: sxtw x8, w3 +; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0], x8 +; CHECK-SD-NEXT: ld1.s { v1 }[0], [x1], x8 +; CHECK-SD-NEXT: sbfiz x8, x3, #1, #32 +; CHECK-SD-NEXT: usubl.8h v0, v0, v1 +; CHECK-SD-NEXT: str d0, [x2] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: usubl.8h v0, v0, v1 +; CHECK-SD-NEXT: 
str d0, [x2, x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: chained_insert_zero: +; CHECK-GI: ; %bb.0: ; %entry +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: ldr s1, [x1] +; CHECK-GI-NEXT: ; kill: def $w3 killed $w3 def $x3 +; CHECK-GI-NEXT: sxtw x8, w3 +; CHECK-GI-NEXT: mov.s v0[1], wzr +; CHECK-GI-NEXT: mov.s v1[1], wzr +; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: str d0, [x2] +; CHECK-GI-NEXT: ldr s0, [x0, x8] +; CHECK-GI-NEXT: ldr s1, [x1, x8] +; CHECK-GI-NEXT: lsl x8, x8, #1 +; CHECK-GI-NEXT: mov.s v0[1], wzr +; CHECK-GI-NEXT: mov.s v1[1], wzr +; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: str d0, [x2, x8] +; CHECK-GI-NEXT: ret +entry: + %idx.ext = sext i32 %stride to i64 + %0 = load i32, ptr %fenc, align 4 + %vld1_lane.i = insertelement <2 x i32> , i32 %0, i64 0 + %1 = bitcast <2 x i32> %vld1_lane.i to <8 x i8> + %2 = load i32, ptr %pred, align 4 + %vld1_lane.i16 = insertelement <2 x i32> , i32 %2, i64 0 + %3 = bitcast <2 x i32> %vld1_lane.i16 to <8 x i8> + %vmovl.i15 = zext <8 x i8> %1 to <8 x i16> + %vmovl.i = zext <8 x i8> %3 to <8 x i16> + %sub.i = sub nsw <8 x i16> %vmovl.i15, %vmovl.i + %shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> poison, <4 x i32> + store <4 x i16> %shuffle.i, ptr %residual, align 2 + %add.ptr = getelementptr inbounds i8, ptr %fenc, i64 %idx.ext + %add.ptr6 = getelementptr inbounds i8, ptr %pred, i64 %idx.ext + %add.ptr8 = getelementptr inbounds i16, ptr %residual, i64 %idx.ext + %4 = load i32, ptr %add.ptr, align 4 + %vld1_lane.i.1 = insertelement <2 x i32> , i32 %4, i64 0 + %5 = bitcast <2 x i32> %vld1_lane.i.1 to <8 x i8> + %6 = load i32, ptr %add.ptr6, align 4 + %vld1_lane.i16.1 = insertelement <2 x i32> , i32 %6, i64 0 + %7 = bitcast <2 x i32> %vld1_lane.i16.1 to <8 x i8> + %vmovl.i15.1 = zext <8 x i8> %5 to <8 x i16> + %vmovl.i.1 = zext <8 x i8> %7 to <8 x i16> + %sub.i.1 = sub nsw <8 x i16> %vmovl.i15.1, %vmovl.i.1 + %shuffle.i.1 = shufflevector <8 x i16> %sub.i.1, <8 x i16> poison, <4 x i32> + 
store <4 x i16> %shuffle.i.1, ptr %add.ptr8, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll b/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll index 6e603b7064f8f..50358e5f15879 100644 --- a/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll +++ b/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll @@ -10,9 +10,9 @@ ; RUN: llc -mtriple=arm64-apple-driverkit < %s | FileCheck -check-prefix=APPLE %s ; RUN: llc -mtriple=arm64-apple-driverkit1.0 < %s | FileCheck -check-prefix=APPLE %s ; RUN: llc -mtriple=arm64-apple-driverkit24.0 < %s | FileCheck -check-prefix=APPLE %s -; RUN: llc -mtriple=arm64-apple-bridgeos < %s | FileCheck -check-prefix=BRIDGEOS %s -; RUN: llc -mtriple=arm64-apple-bridgeos1.0 < %s | FileCheck -check-prefix=BRIDGEOS %s -; RUN: llc -mtriple=arm64-apple-bridgeos9.0 < %s | FileCheck -check-prefix=BRIDGEOS %s +; RUN: llc -mtriple=arm64-apple-bridgeos < %s | FileCheck -check-prefix=APPLE %s +; RUN: llc -mtriple=arm64-apple-bridgeos1.0 < %s | FileCheck -check-prefix=APPLE %s +; RUN: llc -mtriple=arm64-apple-bridgeos9.0 < %s | FileCheck -check-prefix=APPLE %s ; RUN: not llc -mtriple=aarch64-apple-macos10.8 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s ; RUN: not llc -mtriple=aarch64-apple-ios6.0 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s @@ -29,11 +29,6 @@ define float @test_exp10_f32(float %x) { ; APPLE-LABEL: test_exp10_f32: ; APPLE: ; %bb.0: ; APPLE-NEXT: b ___exp10f -; -; BRIDGEOS-LABEL: test_exp10_f32: -; BRIDGEOS: // %bb.0: -; BRIDGEOS-NEXT: b __exp10f -; %ret = call float @llvm.exp10.f32(float %x) ret float %ret } @@ -46,11 +41,6 @@ define double @test_exp10_f64(double %x) { ; APPLE-LABEL: test_exp10_f64: ; APPLE: ; %bb.0: ; APPLE-NEXT: b ___exp10 -; -; BRIDGEOS-LABEL: test_exp10_f64: -; BRIDGEOS: // %bb.0: -; BRIDGEOS-NEXT: b __exp10 -; %ret = call double @llvm.exp10.f64(double %x) ret double %ret } diff --git a/llvm/test/CodeGen/AArch64/streaming-func-no-sme.ll 
b/llvm/test/CodeGen/AArch64/streaming-func-no-sme.ll index 968adcb7cc21b..9be776f817271 100644 --- a/llvm/test/CodeGen/AArch64/streaming-func-no-sme.ll +++ b/llvm/test/CodeGen/AArch64/streaming-func-no-sme.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mtriple aarch64-none-linux-gnu %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple aarch64-none-linux-gnu -filetype=null %s 2>&1 | FileCheck %s ; CHECK: LLVM ERROR: streaming SVE functions require SME define void @streaming(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind { diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll index 40d4d0ff60148..da83b27ce4d55 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1,+bf16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME define void @dupq_i8_256b(ptr %addr) #0 { ; CHECK-LABEL: dupq_i8_256b: @@ -71,13 +72,43 @@ define void @dupq_f16_256b(ptr %addr) #0 { } define void @dupq_bf16_256b(ptr %addr) #0 { -; CHECK-LABEL: dupq_bf16_256b: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: dup v0.8h, v0.h[2] -; CHECK-NEXT: dup v1.8h, v1.h[2] -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret +; SVE-LABEL: dupq_bf16_256b: +; SVE: // %bb.0: +; SVE-NEXT: ldp q0, q1, [x0] +; SVE-NEXT: dup v0.8h, v0.h[2] +; SVE-NEXT: dup v1.8h, v1.h[2] +; SVE-NEXT: stp q0, q1, [x0] +; SVE-NEXT: ret +; +; SME-LABEL: dupq_bf16_256b: +; SME: // %bb.0: +; SME-NEXT: ldp q1, q0, [x0] +; SME-NEXT: str q0, [sp, #-64]! 
+; SME-NEXT: .cfi_def_cfa_offset 64 +; SME-NEXT: ldr h0, [sp, #4] +; SME-NEXT: str q1, [sp, #32] +; SME-NEXT: str h0, [sp, #30] +; SME-NEXT: str h0, [sp, #28] +; SME-NEXT: str h0, [sp, #26] +; SME-NEXT: str h0, [sp, #24] +; SME-NEXT: str h0, [sp, #22] +; SME-NEXT: str h0, [sp, #20] +; SME-NEXT: str h0, [sp, #18] +; SME-NEXT: str h0, [sp, #16] +; SME-NEXT: ldr h0, [sp, #36] +; SME-NEXT: ldr q1, [sp, #16] +; SME-NEXT: str h0, [sp, #62] +; SME-NEXT: str h0, [sp, #60] +; SME-NEXT: str h0, [sp, #58] +; SME-NEXT: str h0, [sp, #56] +; SME-NEXT: str h0, [sp, #54] +; SME-NEXT: str h0, [sp, #52] +; SME-NEXT: str h0, [sp, #50] +; SME-NEXT: str h0, [sp, #48] +; SME-NEXT: ldr q0, [sp, #48] +; SME-NEXT: stp q0, q1, [x0] +; SME-NEXT: add sp, sp, #64 +; SME-NEXT: ret %load = load <16 x bfloat>, ptr %addr %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> @@ -112,4 +143,18 @@ define void @dupq_f64_256b(ptr %addr) #0 { ret void } -attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" } +define void @dupq_f32_256b_with_poison(ptr %addr) #0 { +; CHECK-LABEL: dupq_f32_256b_with_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.s, z0.s[3] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <8 x float>, ptr %addr + %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> + store <8 x float> %splat.lanes, ptr %addr + ret void +} + +attributes #0 = { noinline vscale_range(2,2) } diff --git a/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll index b7a1749be18bc..57ab371d5b6fc 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll @@ -1,5 +1,5 @@ ; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s -; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=1100 -verify-machineinstrs < %s 
2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s ; FIXME: Merge these tests with existing lane op tests (llvm.amdgcn.readlane.ll, llvm.amdgcn.writelane.ll ...) once the crash is fixed. diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll new file mode 100644 index 0000000000000..243f6c4d23732 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL-REAL16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL-FAKE16 %s + +define amdgpu_ps float @test_cvt_pk_f16_bf8_v(i16 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_bf8_v: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_f16_bf8 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_bf8_v: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_f16_bf8 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_bf8_v: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_f16_bf8 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_bf8_v: +; GFX1250-GISEL-FAKE16: ; 
%bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_f16_bf8 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 %a) + %ret = bitcast <2 x half> %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_pk_f16_bf8_s(i16 inreg %a) { +; GFX1250-LABEL: test_cvt_pk_f16_bf8_s: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_pk_f16_bf8 v0, s0 +; GFX1250-NEXT: ; return to shader part epilog + %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 %a) + %ret = bitcast <2 x half> %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_pk_f16_fp8_v(i16 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_fp8_v: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_fp8_v: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_fp8_v: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_fp8_v: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 %a) + %ret = bitcast <2 x half> %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_pk_f16_fp8_s(i16 inreg %a) { +; GFX1250-LABEL: test_cvt_pk_f16_fp8_s: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_pk_f16_fp8 v0, s0 +; GFX1250-NEXT: ; return to shader part epilog + %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 %a) + %ret = bitcast <2 x half> %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_pk_f16_fp8_v_hi(<2 x 
i16> %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_fp8_v_hi: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.h +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_fp8_v_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_fp8_v_hi: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_fp8_v_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %a.1 = extractelement <2 x i16> %a, i32 1 + %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 %a.1) + %ret = bitcast <2 x half> %cvt to float + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll index 008e19b620520..5914253b5f58e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll @@ -1228,51 +1228,49 @@ define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v1 ; GFX900-NEXT: 
;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v2 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v0, v2, s[16:17] -; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v1 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] -; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v1 ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v2 ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v0, v2, s[0:1] -; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x bfloat> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll index 99c9480adc410..cd4dbe93e8a11 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll @@ -1928,48 +1928,45 @@ define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v3, v2, s[16:17] -; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] -; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] 
; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v4, v3, s[0:1] -; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll index e34becc1065ff..99cb8a38f57c3 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll @@ -1228,51 +1228,49 @@ define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v1 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v2 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v0, v2, s[16:17] -; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v1 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] -; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 
; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f16_v2f16__3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v1 ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v2 ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v0, v2, s[0:1] -; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x half> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll index 84d42c882494c..0854ff2ebfc5d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll @@ -1928,48 +1928,45 @@ define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v3, v2, s[16:17] -; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] -; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f16_v3f16__5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v4, v3, s[0:1] -; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll index db5f0ad42a677..b3cf3790a59d3 100644 --- a/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll +++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll @@ -1,17 +1,16 @@ ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: not --crash llc -O0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX90A %s -; RUN: not 
--crash llc -O0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX942 %s +; RUN: not llc -O0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX90A %s +; RUN: not llc -O0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX942 %s ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100 %s ; GFX9-LABEL: image_sample_test: ; GFX9: image_sample_lz -; GFX90A: LLVM ERROR: requested image instruction is not supported on this GPU - -; GFX942: LLVM ERROR: requested image instruction is not supported on this GPU +; GFX90A: error: :0:0: in function image_sample_test void (ptr addrspace(1), float, float, <8 x i32>, <4 x i32>): requested image instruction is not supported on this GPU +; GFX942: error: :0:0: in function image_sample_test void (ptr addrspace(1), float, float, <8 x i32>, <4 x i32>): requested image instruction is not supported on this GPU ; GFX1030-LABEL: image_sample_test: ; GFX1030: image_sample_lz @@ -28,3 +27,13 @@ define amdgpu_kernel void @image_sample_test(ptr addrspace(1) %out, float %arg1, } declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) + +; GFX90A: error: :0:0: in function sample_1d_tfe <4 x float> (<8 x i32>, <4 x i32>, ptr addrspace(1), float): TFE is not supported on this GPU +; GFX942: error: :0:0: in function sample_1d_tfe <4 x float> (<8 x i32>, <4 x i32>, ptr addrspace(1), float): TFE is not supported on this GPU +define <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, ptr addrspace(1) inreg %out, float %s) { + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, 
i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, ptr addrspace(1) %out, align 4 + ret <4 x float> %v.vec +} diff --git a/llvm/test/CodeGen/ARM/ifcvt_unanalyzable_fallthrough.mir b/llvm/test/CodeGen/ARM/ifcvt_unanalyzable_fallthrough.mir new file mode 100644 index 0000000000000..d2673c36f0f4c --- /dev/null +++ b/llvm/test/CodeGen/ARM/ifcvt_unanalyzable_fallthrough.mir @@ -0,0 +1,114 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv7-apple-ios -run-pass=if-converter %s -o - | FileCheck %s + +# Testcase with unanalyzable branches (that may fallthrough) in the BB +# following the diamond/triangle. + +# Goal here is to showcase a problem seen in the IfConverter when +# AnalyzeBranch is indicating that the branches couldn't be analyzed. Problem +# was originally seen for an out-of-tree target, and here we use ARM and a +# MBB with two conditional jumps to make AnalyzeBranch return false. +# +# The problem was that if-converter when analyzing branches was using a +# variable named HasFallThrough, to remember that an MBB could fallthrough to +# the textual successor. When HasFallThrough is set we know that there are +# fallthrough exits, but the opposite is not guaranteed. If +# HasFallThrough==false there could still be fallthrough exists in situations +# when analyzeBranch found unanalyzable branches. There were however a couple +# of places in the code that checked !HasFallThrough assuming that it would +# imply that there was no fallthrough exit. +# +# As a consequence we could end up merging blocks at the end of a converted +# diamond/triangle and while doing that we messed up when fixing up the CFG +# related to fallthrough edges. For the test cases below we incorrectly ended +# up with a fallthrough from the MBBs with two Bcc instructions to the MBB +# with the STRH after if conversion. 
+# +--- +name: avoidMergeBlockDiamond +body: | + ; CHECK-LABEL: name: avoidMergeBlockDiamond + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = tADDspi $sp, 2, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: $sp = tADDspi $sp, 1, 0 /* CC::eq */, $cpsr, implicit $sp + ; CHECK-NEXT: $sp = tADDspi $sp, 3, 14 /* CC::al */, $noreg + ; CHECK-NEXT: tBcc %bb.1, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: tBcc %bb.1, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STRH $sp, $sp, $noreg, 0, 14 /* CC::al */, $noreg + ; CHECK-NEXT: tB %bb.2, 14 /* CC::al */, $noreg + bb.0: + tBcc %bb.2, 1, $cpsr + + bb.1: + $sp = tADDspi $sp, 1, 14, _ + tB %bb.4, 14, $noreg + + bb.2: + $sp = tADDspi $sp, 2, 14, _ + tB %bb.4, 14, $noreg + + bb.3: + STRH $sp, $sp, $noreg, 0, 14, $noreg + tB %bb.3, 14, $noreg + + bb.4: + $sp = tADDspi $sp, 3, 14, _ + tBcc %bb.5, 1, $cpsr + tBcc %bb.5, 1, $cpsr + + bb.5: + successors: + tBX_RET 14, _ +... + +# Similar to the above, but with a triangle. 
+--- +name: avoidMergeBlockTriangle +body: | + ; CHECK-LABEL: name: avoidMergeBlockTriangle + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = tADDspi $sp, 1, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: $sp = tADDspi $sp, 2, 14 /* CC::al */, $noreg + ; CHECK-NEXT: tBcc %bb.1, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: tBcc %bb.1, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STRH $sp, $sp, $noreg, 0, 14 /* CC::al */, $noreg + ; CHECK-NEXT: tB %bb.2, 14 /* CC::al */, $noreg + bb.0: + tBcc %bb.1, 1, $cpsr + tB %bb.3, 14, $noreg + + bb.1: + $sp = tADDspi $sp, 1, 14, _ + tB %bb.3, 14, $noreg + + bb.2: + STRH $sp, $sp, $noreg, 0, 14, $noreg + tB %bb.2, 14, $noreg + + bb.3: + $sp = tADDspi $sp, 2, 14, _ + tBcc %bb.4, 1, $cpsr + tBcc %bb.4, 1, $cpsr + + bb.4: + successors: + tBX_RET 14, _ +... 
diff --git a/llvm/test/CodeGen/ARM/special-reg.ll b/llvm/test/CodeGen/ARM/special-reg.ll index e966550e673d4..cc95f79d2c73b 100644 --- a/llvm/test/CodeGen/ARM/special-reg.ll +++ b/llvm/test/CodeGen/ARM/special-reg.ll @@ -25,14 +25,18 @@ entry: define i64 @read_volatile_i64_twice() { ; ACORE-LABEL: read_volatile_i64_twice: ; ACORE: @ %bb.0: @ %entry -; ACORE-NEXT: mov r0, #0 -; ACORE-NEXT: mov r1, #0 +; ACORE-NEXT: mrrc p15, #1, r0, r1, c14 +; ACORE-NEXT: mrrc p15, #1, r2, r3, c14 +; ACORE-NEXT: eor r0, r2, r0 +; ACORE-NEXT: eor r1, r3, r1 ; ACORE-NEXT: bx lr ; ; MCORE-LABEL: read_volatile_i64_twice: ; MCORE: @ %bb.0: @ %entry -; MCORE-NEXT: movs r0, #0 -; MCORE-NEXT: movs r1, #0 +; MCORE-NEXT: mrrc p15, #1, r0, r1, c14 +; MCORE-NEXT: mrrc p15, #1, r2, r3, c14 +; MCORE-NEXT: eors r0, r2 +; MCORE-NEXT: eors r1, r3 ; MCORE-NEXT: bx lr entry: %0 = tail call i64 @llvm.read_volatile_register.i64(metadata !5) diff --git a/llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll b/llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll new file mode 100644 index 0000000000000..40d222cdf2f8f --- /dev/null +++ b/llvm/test/CodeGen/DirectX/issue-145408-gep-struct-fix.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -dxil-data-scalarization -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +%struct.RawStruct8D = type { [8 x i32] } + +define void @test_no_transform_of_struct() { +; CHECK-LABEL: define void @test_no_transform_of_struct() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[OUTPUTSIZESLOCAL_I:%.*]] = alloca [[STRUCT_RAWSTRUCT8D:%.*]], align 4 +; CHECK-NEXT: [[ARRAYINIT_ELEMENT13_I76:%.*]] = getelementptr inbounds nuw [1 x %struct.RawStruct8D], ptr [[OUTPUTSIZESLOCAL_I]], i32 0, i32 0 +; CHECK-NEXT: ret void +; +entry: + %outputSizesLocal.i = alloca %struct.RawStruct8D, align 4 + %arrayinit.element13.i76 = getelementptr inbounds nuw [1 x %struct.RawStruct8D], ptr 
%outputSizesLocal.i, i32 0, i32 0 + ret void +} diff --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll index 2b29fd30a7a56..36fed88fc52d6 100644 --- a/llvm/test/CodeGen/DirectX/llc-pipeline.ll +++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll @@ -33,13 +33,13 @@ ; CHECK-NEXT: DXIL Translate Metadata ; CHECK-NEXT: DXIL Post Optimization Validation ; CHECK-NEXT: DXIL Op Lowering +; CHECK-NEXT: DXIL Root Signature Analysis ; CHECK-NEXT: DXIL Prepare Module ; CHECK-ASM-NEXT: DXIL Metadata Pretty Printer ; CHECK-ASM-NEXT: Print Module IR ; CHECK-OBJ-NEXT: DXIL Embedder -; CHECK-OBJ-NEXT: DXIL Root Signature Analysis ; CHECK-OBJ-NEXT: DXContainer Global Emitter ; CHECK-OBJ-NEXT: FunctionPass Manager ; CHECK-OBJ-NEXT: Lazy Machine Block Frequency Analysis diff --git a/llvm/test/CodeGen/DirectX/strip-rootsignatures.ll b/llvm/test/CodeGen/DirectX/strip-rootsignatures.ll new file mode 100644 index 0000000000000..3ac617ae871fc --- /dev/null +++ b/llvm/test/CodeGen/DirectX/strip-rootsignatures.ll @@ -0,0 +1,18 @@ +; RUN: opt -S -dxil-prepare < %s | FileCheck %s + +; Ensures that dxil-prepare will remove the dx.rootsignatures metadata + +target triple = "dxil-unknown-shadermodel6.0-compute" + +define void @main() { +entry: + ret void +} + +; CHECK-NOT: !dx.rootsignatures +; CHECK-NOT: {{^!}} + +!dx.rootsignatures = !{!2} ; list of function/root signature pairs +!2 = !{ ptr @main, !3, i32 2 } ; function, root signature +!3 = !{ !4 } ; list of root signature elements +!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-ilp32d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-ilp32d.ll new file mode 100644 index 0000000000000..62c2cc999456c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/calling-conv-ilp32d.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+d --target-abi=ilp32d < 
%s \ +; RUN: | FileCheck %s + +;; This file contains specific tests for the ilp32d ABI. + +;; Check pass floating-point arguments whith FPRs. + +define i32 @callee_float_in_fpr(i32 %a, float %b, double %c) nounwind { +; CHECK-LABEL: callee_float_in_fpr: +; CHECK: # %bb.0: +; CHECK-NEXT: ftintrz.w.s $fa0, $fa0 +; CHECK-NEXT: movfr2gr.s $a1, $fa0 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa1 +; CHECK-NEXT: movfr2gr.s $a2, $fa0 +; CHECK-NEXT: add.w $a0, $a0, $a1 +; CHECK-NEXT: add.w $a0, $a0, $a2 +; CHECK-NEXT: ret + %b_fptosi = fptosi float %b to i32 + %c_fptosi = fptosi double %c to i32 + %1 = add i32 %a, %b_fptosi + %2 = add i32 %1, %c_fptosi + ret i32 %2 +} + +define i32 @caller_float_in_fpr() nounwind { +; CHECK-LABEL: caller_float_in_fpr: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -16 +; CHECK-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-NEXT: movgr2fr.w $fa1, $zero +; CHECK-NEXT: movgr2frh.w $fa1, $zero +; CHECK-NEXT: movgr2fr.w $fa0, $zero +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: bl callee_float_in_fpr +; CHECK-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 16 +; CHECK-NEXT: ret + %1 = call i32 @callee_float_in_fpr(i32 1, float 0.0, double 0.0) + ret i32 %1 +} + +;; Check that the GPR is used once the FPRs are exhausted. + +;; Must keep define on a single line due to an update_llc_test_checks.py limitation. 
+define i32 @callee_double_in_gpr_exhausted_fprs(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) nounwind { +; CHECK-LABEL: callee_double_in_gpr_exhausted_fprs: +; CHECK: # %bb.0: +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: movgr2frh.w $fa0, $a1 +; CHECK-NEXT: ftintrz.w.d $fa1, $fa7 +; CHECK-NEXT: movfr2gr.s $a0, $fa1 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa0 +; CHECK-NEXT: movfr2gr.s $a1, $fa0 +; CHECK-NEXT: add.w $a0, $a0, $a1 +; CHECK-NEXT: ret + %h_fptosi = fptosi double %h to i32 + %i_fptosi = fptosi double %i to i32 + %1 = add i32 %h_fptosi, %i_fptosi + ret i32 %1 +} + +define i32 @caller_double_in_gpr_exhausted_fprs() nounwind { +; CHECK-LABEL: caller_double_in_gpr_exhausted_fprs: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -16 +; CHECK-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_0) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI3_1) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2) +; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI3_2) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3) +; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI3_3) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4) +; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI3_4) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5) +; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI3_5) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6) +; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI3_6) +; CHECK-NEXT: addi.w $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: ffint.s.w $fa0, $fa0 +; CHECK-NEXT: fcvt.d.s $fa0, $fa0 +; CHECK-NEXT: lu12i.w $a1, 262688 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: bl callee_double_in_gpr_exhausted_fprs +; CHECK-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 16 +; CHECK-NEXT: ret + %1 = call i32 @callee_double_in_gpr_exhausted_fprs( + double 
1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, + double 7.0, double 8.0, double 9.0) + ret i32 %1 +} + +;; Check that the stack is used once the FPRs and GPRs are both exhausted. + +;; Must keep define on a single line due to an update_llc_test_checks.py limitation. +define i32 @callee_double_on_stack_exhausted_fprs_gprs(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, double %j, double %k, double %l, double %m, double %n) nounwind { +; CHECK-LABEL: callee_double_on_stack_exhausted_fprs_gprs: +; CHECK: # %bb.0: +; CHECK-NEXT: fld.d $fa0, $sp, 0 +; CHECK-NEXT: fld.d $fa1, $sp, 8 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa0 +; CHECK-NEXT: movfr2gr.s $a0, $fa0 +; CHECK-NEXT: ftintrz.w.d $fa0, $fa1 +; CHECK-NEXT: movfr2gr.s $a1, $fa0 +; CHECK-NEXT: add.w $a0, $a0, $a1 +; CHECK-NEXT: ret + %m_fptosi = fptosi double %m to i32 + %n_fptosi = fptosi double %n to i32 + %1 = add i32 %m_fptosi, %n_fptosi + ret i32 %1 +} + +define i32 @caller_double_on_stack_exhausted_fprs_gprs() nounwind { +; CHECK-LABEL: caller_double_on_stack_exhausted_fprs_gprs: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -32 +; CHECK-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; CHECK-NEXT: lu12i.w $a0, 262816 +; CHECK-NEXT: st.w $a0, $sp, 4 +; CHECK-NEXT: st.w $zero, $sp, 0 +; CHECK-NEXT: lu12i.w $a0, 262848 +; CHECK-NEXT: st.w $a0, $sp, 12 +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) +; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI5_0) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI5_1) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_2) +; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI5_2) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_3) +; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI5_3) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_4) +; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI5_4) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_5) +; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI5_5) +; 
CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_6) +; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI5_6) +; CHECK-NEXT: addi.w $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: ffint.s.w $fa0, $fa0 +; CHECK-NEXT: fcvt.d.s $fa0, $fa0 +; CHECK-NEXT: lu12i.w $a1, 262688 +; CHECK-NEXT: lu12i.w $a3, 262720 +; CHECK-NEXT: lu12i.w $a5, 262752 +; CHECK-NEXT: lu12i.w $a7, 262784 +; CHECK-NEXT: st.w $zero, $sp, 8 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: move $a2, $zero +; CHECK-NEXT: move $a4, $zero +; CHECK-NEXT: move $a6, $zero +; CHECK-NEXT: bl callee_double_on_stack_exhausted_fprs_gprs +; CHECK-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 32 +; CHECK-NEXT: ret + %1 = call i32 @callee_double_on_stack_exhausted_fprs_gprs( + double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, + double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, + double 13.0, double 14.0) + ret i32 %1 +} + +;; Check returning doubles. + +define double @callee_double_ret() nounwind { +; CHECK-LABEL: callee_double_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.w $fa0, $a0 +; CHECK-NEXT: ffint.s.w $fa0, $fa0 +; CHECK-NEXT: fcvt.d.s $fa0, $fa0 +; CHECK-NEXT: ret + ret double 1.0 +} + +define i64 @caller_double_ret() nounwind { +; CHECK-LABEL: caller_double_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $sp, $sp, -16 +; CHECK-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-NEXT: bl callee_double_ret +; CHECK-NEXT: movfr2gr.s $a0, $fa0 +; CHECK-NEXT: movfrh2gr.s $a1, $fa0 +; CHECK-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; CHECK-NEXT: addi.w $sp, $sp, 16 +; CHECK-NEXT: ret + %1 = call double @callee_double_ret() + %2 = bitcast double %1 to i64 + ret i64 %2 +} diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll index be9ea29b54c33..c1d75ddd32803 100644 --- 
a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll @@ -32,18 +32,14 @@ define double @constraint_f_double(double %a) nounwind { define double @constraint_gpr(double %a) { ; LA32-LABEL: constraint_gpr: ; LA32: # %bb.0: -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: .cfi_def_cfa_offset 16 -; LA32-NEXT: fst.d $fa0, $sp, 8 -; LA32-NEXT: ld.w $a7, $sp, 8 -; LA32-NEXT: ld.w $t0, $sp, 12 +; LA32-NEXT: .cfi_def_cfa_offset 0 +; LA32-NEXT: movfr2gr.s $a7, $fa0 +; LA32-NEXT: movfrh2gr.s $t0, $fa0 ; LA32-NEXT: #APP ; LA32-NEXT: move $a6, $a7 ; LA32-NEXT: #NO_APP -; LA32-NEXT: st.w $a7, $sp, 4 -; LA32-NEXT: st.w $a6, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 -; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a6 +; LA32-NEXT: movgr2frh.w $fa0, $a7 ; LA32-NEXT: ret ; ; LA64-LABEL: constraint_gpr: diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll index 2a51fd97feb62..0b82ea220d7fb 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll @@ -279,11 +279,8 @@ define double @convert_u64_to_double(i64 %a) nounwind { define double @bitcast_i64_to_double(i64 %a, i64 %b) nounwind { ; LA32-LABEL: bitcast_i64_to_double: ; LA32: # %bb.0: -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: st.w $a1, $sp, 12 -; LA32-NEXT: st.w $a0, $sp, 8 -; LA32-NEXT: fld.d $fa0, $sp, 8 -; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: bitcast_i64_to_double: @@ -297,11 +294,8 @@ define double @bitcast_i64_to_double(i64 %a, i64 %b) nounwind { define i64 @bitcast_double_to_i64(double %a) nounwind { ; LA32-LABEL: bitcast_double_to_i64: ; LA32: # %bb.0: -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: fst.d $fa0, $sp, 8 -; LA32-NEXT: ld.w $a0, $sp, 8 -; LA32-NEXT: ld.w $a1, $sp, 
12 -; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: movfr2gr.s $a0, $fa0 +; LA32-NEXT: movfrh2gr.s $a1, $fa0 ; LA32-NEXT: ret ; ; LA64-LABEL: bitcast_double_to_i64: diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll index 78cabd37c0ad9..b6507e87f0886 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll @@ -115,9 +115,8 @@ define double @load_acquire_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: ori $a1, $zero, 2 ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -234,9 +233,8 @@ define double @load_unordered_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: move $a1, $zero ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -352,9 +350,8 @@ define double @load_monotonic_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: move $a1, $zero ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, $sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -481,9 +478,8 @@ define double @load_seq_cst_double(ptr %ptr) { ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: ori $a1, $zero, 5 ; LA32-NEXT: bl __atomic_load_8 -; LA32-NEXT: st.w $a1, $sp, 4 -; LA32-NEXT: st.w $a0, 
$sp, 0 -; LA32-NEXT: fld.d $fa0, $sp, 0 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: movgr2frh.w $fa0, $a1 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -605,9 +601,8 @@ define void @store_release_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: movfrh2gr.s $a2, $fa0 ; LA32-NEXT: ori $a3, $zero, 3 ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -723,9 +718,8 @@ define void @store_unordered_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: movfrh2gr.s $a2, $fa0 ; LA32-NEXT: move $a3, $zero ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -841,9 +835,8 @@ define void @store_monotonic_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: movfrh2gr.s $a2, $fa0 ; LA32-NEXT: move $a3, $zero ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -973,9 +966,8 @@ define void @store_seq_cst_double(ptr %ptr, double %v) { ; LA32-NEXT: .cfi_def_cfa_offset 16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 -; LA32-NEXT: fst.d $fa0, $sp, 0 -; LA32-NEXT: ld.w $a1, $sp, 0 -; LA32-NEXT: ld.w $a2, $sp, 4 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: 
movfrh2gr.s $a2, $fa0 ; LA32-NEXT: ori $a3, $zero, 5 ; LA32-NEXT: bl __atomic_store_8 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/msa/compare_float.ll b/llvm/test/CodeGen/Mips/msa/compare_float.ll index 2656cb839768c..178264581ea19 100644 --- a/llvm/test/CodeGen/Mips/msa/compare_float.ll +++ b/llvm/test/CodeGen/Mips/msa/compare_float.ll @@ -1,661 +1,645 @@ -; RUN: llc -mtriple=mips-elf -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s -; RUN: llc -mtriple=mipsel-elf -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=mips-elf -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s --check-prefixes=CHECK,MIPS +; RUN: llc -mtriple=mipsel-elf -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s --check-prefixes=CHECK,MIPSEL declare <4 x float> @llvm.mips.fmax.w(<4 x float>, <4 x float>) nounwind declare <2 x double> @llvm.mips.fmax.d(<2 x double>, <2 x double>) nounwind declare <4 x float> @llvm.mips.fmin.w(<4 x float>, <4 x float>) nounwind declare <2 x double> @llvm.mips.fmin.d(<2 x double>, <2 x double>) nounwind +; (setcc $a, $b, SETFALSE) is always folded, so we won't get fcaf define void @false_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: false_v4f32: - +; CHECK-LABEL: false_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ldi.b $w0, 0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a %2 = load <4 x float>, ptr %b %3 = fcmp false <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> store <4 x i32> %4, ptr %c ret void - - ; (setcc $a, $b, SETFALSE) is always folded, so we won't get fcaf: - ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0 - ; CHECK-DAG: st.w [[R1]], 0($4) - ; CHECK: .size false_v4f32 } +; (setcc $a, $b, SETFALSE) is always folded define void @false_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: false_v2f64: - +; CHECK-LABEL: false_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: 
ldi.b $w0, 0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <2 x double>, ptr %a %2 = load <2 x double>, ptr %b %3 = fcmp false <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> store <2 x i64> %4, ptr %c ret void - - ; (setcc $a, $b, SETFALSE) is always folded - ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0 - ; CHECK-DAG: st.w [[R1]], 0($4) - ; CHECK: .size false_v2f64 } define void @oeq_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: oeq_v4f32: - +; CHECK-LABEL: oeq_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fceq.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp oeq <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fceq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size oeq_v4f32 } define void @oeq_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: oeq_v2f64: - +; CHECK-LABEL: oeq_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fceq.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp oeq <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fceq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size oeq_v2f64 } define void @oge_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: oge_v4f32: - +; CHECK-LABEL: oge_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($5) +; CHECK-NEXT: ld.w $w1, 0($6) +; CHECK-NEXT: fcle.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr 
%a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp oge <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcle.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size oge_v4f32 } define void @oge_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: oge_v2f64: - +; CHECK-LABEL: oge_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($5) +; CHECK-NEXT: ld.d $w1, 0($6) +; CHECK-NEXT: fcle.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp oge <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcle.d [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size oge_v2f64 } define void @ogt_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ogt_v4f32: - +; CHECK-LABEL: ogt_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($5) +; CHECK-NEXT: ld.w $w1, 0($6) +; CHECK-NEXT: fclt.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ogt <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fclt.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size ogt_v4f32 } define void @ogt_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ogt_v2f64: - +; CHECK-LABEL: ogt_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($5) +; CHECK-NEXT: ld.d $w1, 0($6) +; CHECK-NEXT: fclt.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 
0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ogt <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fclt.d [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size ogt_v2f64 } define void @ole_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ole_v4f32: - +; CHECK-LABEL: ole_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fcle.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ole <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcle.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size ole_v4f32 } define void @ole_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ole_v2f64: - +; CHECK-LABEL: ole_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fcle.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ole <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcle.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size ole_v2f64 } define void @olt_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: olt_v4f32: - +; CHECK-LABEL: olt_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fclt.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; 
CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp olt <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fclt.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size olt_v4f32 } define void @olt_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: olt_v2f64: - +; CHECK-LABEL: olt_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fclt.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp olt <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fclt.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size olt_v2f64 } define void @one_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: one_v4f32: - +; CHECK-LABEL: one_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fcne.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp one <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcne.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size one_v4f32 } define void @one_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: one_v2f64: - +; CHECK-LABEL: one_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fcne.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) 
%3 = fcmp one <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcne.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size one_v2f64 } define void @ord_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ord_v4f32: - +; CHECK-LABEL: ord_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fcor.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ord <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcor.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size ord_v4f32 } define void @ord_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ord_v2f64: - +; CHECK-LABEL: ord_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fcor.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ord <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcor.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size ord_v2f64 } define void @ueq_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ueq_v4f32: - +; CHECK-LABEL: ueq_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fcueq.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ueq <4 x float> %1, %2 %4 = 
sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcueq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size ueq_v4f32 } define void @ueq_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ueq_v2f64: - +; CHECK-LABEL: ueq_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fcueq.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ueq <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcueq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size ueq_v2f64 } define void @uge_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: uge_v4f32: - +; CHECK-LABEL: uge_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($5) +; CHECK-NEXT: ld.w $w1, 0($6) +; CHECK-NEXT: fcule.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp uge <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcule.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size uge_v4f32 } define void @uge_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: uge_v2f64: - +; CHECK-LABEL: uge_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($5) +; CHECK-NEXT: ld.d $w1, 0($6) +; CHECK-NEXT: fcule.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp uge <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; 
CHECK-DAG: fcule.d [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size uge_v2f64 } define void @ugt_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ugt_v4f32: - +; CHECK-LABEL: ugt_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($5) +; CHECK-NEXT: ld.w $w1, 0($6) +; CHECK-NEXT: fcult.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ugt <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcult.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size ugt_v4f32 } define void @ugt_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ugt_v2f64: - +; CHECK-LABEL: ugt_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($5) +; CHECK-NEXT: ld.d $w1, 0($6) +; CHECK-NEXT: fcult.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ugt <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcult.d [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size ugt_v2f64 } define void @ule_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ule_v4f32: - +; CHECK-LABEL: ule_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fcule.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ule <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcule.w [[R3:\$w[0-9]+]], 
[[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size ule_v4f32 } define void @ule_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ule_v2f64: - +; CHECK-LABEL: ule_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fcule.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ule <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcule.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size ule_v2f64 } define void @ult_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ult_v4f32: - +; CHECK-LABEL: ult_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fcult.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ult <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcult.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size ult_v4f32 } define void @ult_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: ult_v2f64: - +; CHECK-LABEL: ult_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fcult.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ult <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcult.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, 
ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size ult_v2f64 } define void @uno_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: uno_v4f32: - +; CHECK-LABEL: uno_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fcun.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp uno <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> - ; CHECK-DAG: fcun.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x i32> %4, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size uno_v4f32 } define void @uno_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: uno_v2f64: - +; CHECK-LABEL: uno_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fcun.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp uno <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> - ; CHECK-DAG: fcun.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %4, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size uno_v2f64 } +; (setcc $a, $b, SETTRUE) is always folded, so we won't get fcaf define void @true_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: true_v4f32: - +; CHECK-LABEL: true_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ldi.b $w0, -1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a %2 = load <4 x float>, ptr %b %3 = fcmp true <4 x float> %1, %2 %4 = sext <4 x i1> %3 to <4 x i32> store <4 x i32> %4, ptr %c ret void - - ; (setcc $a, $b, SETTRUE) is always folded, so we won't get fcaf: - ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], -1 - ; CHECK-DAG: st.w [[R1]], 0($4) - ; CHECK: .size 
true_v4f32 } +; (setcc $a, $b, SETTRUE) is always folded. define void @true_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: true_v2f64: - +; CHECK-LABEL: true_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ldi.b $w0, -1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <2 x double>, ptr %a %2 = load <2 x double>, ptr %b %3 = fcmp true <2 x double> %1, %2 %4 = sext <2 x i1> %3 to <2 x i64> store <2 x i64> %4, ptr %c ret void - - ; (setcc $a, $b, SETTRUE) is always folded. - ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], -1 - ; CHECK-DAG: st.w [[R1]], 0($4) - ; CHECK: .size true_v2f64 } -define void @bsel_v4f32(ptr %d, ptr %a, ptr %b, - ptr %c) nounwind { - ; CHECK: bsel_v4f32: - +; Note that IfSet and IfClr are swapped since the condition is inverted +define void @bsel_v4f32(ptr %d, ptr %a, ptr %b, ptr %c) nounwind { +; CHECK-LABEL: bsel_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($5) +; CHECK-NEXT: ld.w $w1, 0($6) +; CHECK-NEXT: fclt.w $w1, $w1, $w0 +; CHECK-NEXT: ld.w $w2, 0($7) +; CHECK-NEXT: bsel.v $w1, $w2, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w1, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = load <4 x float>, ptr %c - ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7) %4 = fcmp ogt <4 x float> %1, %2 - ; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]] %5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %3 - ; Note that IfSet and IfClr are swapped since the condition is inverted - ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]] store <4 x float> %5, ptr %d - ; CHECK-DAG: st.w [[R4]], 0($4) - ret void - ; CHECK: .size bsel_v4f32 } -define void @bsel_v2f64(ptr %d, ptr %a, ptr %b, - ptr %c) nounwind { - ; CHECK: bsel_v2f64: - +; Note that IfSet and IfClr are swapped since the condition is inverted +define void @bsel_v2f64(ptr %d, ptr %a, ptr %b, ptr %c) nounwind { +; CHECK-LABEL: bsel_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($5) +; 
CHECK-NEXT: ld.d $w1, 0($6) +; CHECK-NEXT: fclt.d $w1, $w1, $w0 +; CHECK-NEXT: ld.d $w2, 0($7) +; CHECK-NEXT: bsel.v $w1, $w2, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w1, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = load <2 x double>, ptr %c - ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7) %4 = fcmp ogt <2 x double> %1, %2 - ; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]] %5 = select <2 x i1> %4, <2 x double> %1, <2 x double> %3 - ; Note that IfSet and IfClr are swapped since the condition is inverted - ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]] store <2 x double> %5, ptr %d - ; CHECK-DAG: st.d [[R4]], 0($4) - ret void - ; CHECK: .size bsel_v2f64 } -define void @bseli_v4f32(ptr %d, ptr %a, ptr %b, - ptr %c) nounwind { - ; CHECK: bseli_v4f32: - +; Note that IfSet and IfClr are swapped since the condition is inverted +define void @bseli_v4f32(ptr %d, ptr %a, ptr %b, ptr %c) nounwind { +; MIPS-LABEL: bseli_v4f32: +; MIPS: # %bb.0: +; MIPS-NEXT: ld.w $w0, 0($5) +; MIPS-NEXT: ld.w $w1, 0($6) +; MIPS-NEXT: fclt.w $w1, $w1, $w0 +; MIPS-NEXT: ldi.b $w2, 0 +; MIPS-NEXT: shf.b $w2, $w2, 27 +; MIPS-NEXT: bsel.v $w1, $w2, $w0 +; MIPS-NEXT: jr $ra +; MIPS-NEXT: st.w $w1, 0($4) +; +; MIPSEL-LABEL: bseli_v4f32: +; MIPSEL: # %bb.0: +; MIPSEL-NEXT: ld.w $w0, 0($5) +; MIPSEL-NEXT: ld.w $w1, 0($6) +; MIPSEL-NEXT: fclt.w $w1, $w1, $w0 +; MIPSEL-NEXT: ldi.b $w2, 0 +; MIPSEL-NEXT: bsel.v $w1, $w2, $w0 +; MIPSEL-NEXT: jr $ra +; MIPSEL-NEXT: st.w $w1, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ogt <4 x float> %1, %2 - ; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]] %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> zeroinitializer - ; Note that IfSet and IfClr are swapped since the condition is inverted - ; CHECK-DAG: bsel.v [[R4]], 
[[R3:\$w[0-9]+]], [[R1]] store <4 x float> %4, ptr %d - ; CHECK-DAG: st.w [[R4]], 0($4) - ret void - ; CHECK: .size bseli_v4f32 } -define void @bseli_v2f64(ptr %d, ptr %a, ptr %b, - ptr %c) nounwind { - ; CHECK: bseli_v2f64: - +; Note that IfSet and IfClr are swapped since the condition is inverted +define void @bseli_v2f64(ptr %d, ptr %a, ptr %b, ptr %c) nounwind { +; MIPS-LABEL: bseli_v2f64: +; MIPS: # %bb.0: +; MIPS-NEXT: ld.d $w0, 0($5) +; MIPS-NEXT: ld.d $w1, 0($6) +; MIPS-NEXT: fclt.d $w1, $w1, $w0 +; MIPS-NEXT: ldi.b $w2, 0 +; MIPS-NEXT: shf.b $w2, $w2, 27 +; MIPS-NEXT: shf.w $w2, $w2, 177 +; MIPS-NEXT: bsel.v $w1, $w2, $w0 +; MIPS-NEXT: jr $ra +; MIPS-NEXT: st.d $w1, 0($4) +; +; MIPSEL-LABEL: bseli_v2f64: +; MIPSEL: # %bb.0: +; MIPSEL-NEXT: ld.d $w0, 0($5) +; MIPSEL-NEXT: ld.d $w1, 0($6) +; MIPSEL-NEXT: fclt.d $w1, $w1, $w0 +; MIPSEL-NEXT: ldi.b $w2, 0 +; MIPSEL-NEXT: bsel.v $w1, $w2, $w0 +; MIPSEL-NEXT: jr $ra +; MIPSEL-NEXT: st.d $w1, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = fcmp ogt <2 x double> %1, %2 - ; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]] %4 = select <2 x i1> %3, <2 x double> %1, <2 x double> zeroinitializer - ; Note that IfSet and IfClr are swapped since the condition is inverted - ; CHECK-DAG: bsel.v [[R4]], [[R3:\$w[0-9]+]], [[R1]] store <2 x double> %4, ptr %d - ; CHECK-DAG: st.d [[R4]], 0($4) - ret void - ; CHECK: .size bseli_v2f64 } define void @max_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: max_v4f32: - +; CHECK-LABEL: max_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fmax.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = tail call <4 x float> @llvm.mips.fmax.w(<4 x float> %1, 
<4 x float> %2) - ; CHECK-DAG: fmax.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x float> %3, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size max_v4f32 } define void @max_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: max_v2f64: - +; CHECK-LABEL: max_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fmax.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = tail call <2 x double> @llvm.mips.fmax.d(<2 x double> %1, <2 x double> %2) - ; CHECK-DAG: fmax.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x double> %3, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size max_v2f64 } define void @min_v4f32(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: min_v4f32: - +; CHECK-LABEL: min_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $w0, 0($6) +; CHECK-NEXT: ld.w $w1, 0($5) +; CHECK-NEXT: fmin.w $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.w $w0, 0($4) %1 = load <4 x float>, ptr %a - ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x float>, ptr %b - ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = tail call <4 x float> @llvm.mips.fmin.w(<4 x float> %1, <4 x float> %2) - ; CHECK-DAG: fmin.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <4 x float> %3, ptr %c - ; CHECK-DAG: st.w [[R3]], 0($4) - ret void - ; CHECK: .size min_v4f32 } define void @min_v2f64(ptr %c, ptr %a, ptr %b) nounwind { - ; CHECK: min_v2f64: - +; CHECK-LABEL: min_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $w0, 0($6) +; CHECK-NEXT: ld.d $w1, 0($5) +; CHECK-NEXT: fmin.d $w0, $w1, $w0 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: st.d $w0, 0($4) %1 = load <2 x double>, ptr %a - ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x double>, ptr %b - ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = tail call <2 x double> @llvm.mips.fmin.d(<2 x double> %1, <2 
x double> %2) - ; CHECK-DAG: fmin.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x double> %3, ptr %c - ; CHECK-DAG: st.d [[R3]], 0($4) - ret void - ; CHECK: .size min_v2f64 } diff --git a/llvm/test/CodeGen/NVPTX/alias.ll b/llvm/test/CodeGen/NVPTX/alias.ll index 8ae29b51290ef..01761c21ab103 100644 --- a/llvm/test/CodeGen/NVPTX/alias.ll +++ b/llvm/test/CodeGen/NVPTX/alias.ll @@ -56,8 +56,7 @@ attributes #0 = { noreturn } ; CHECK-NEXT: .noreturn ; CHECK: .visible .func (.param .b32 func_retval0) z() -; CHECK: call.uni (retval0), -; CHECK-NEXT: b, +; CHECK: call.uni (retval0), b, ; CHECK: .alias b, a; diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 6f115756a8ae7..01e4065a7baa7 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -216,12 +216,7 @@ define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-NEXT: .param .align 4 .b8 param1[4]; ; CHECK-NEXT: st.param.b32 [param1], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll index 2af1e6d7e185b..ad9e4b089e8d8 100644 --- a/llvm/test/CodeGen/NVPTX/byval-const-global.ll +++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll @@ -19,11 +19,7 @@ define void @foo() { ; CHECK-NEXT: .param .align 8 .b8 param0[16]; ; CHECK-NEXT: st.param.b64 [param0], %rd1; ; CHECK-NEXT: st.param.b64 [param0+8], %rd2; -; CHECK-NEXT: call.uni -; CHECK-NEXT: bar, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni bar, (param0); ; CHECK-NEXT: } // 
callseq 0 ; CHECK-NEXT: ret; call void @bar(ptr byval(%struct) @G) diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index a2175dd009f5f..0cd7058174d67 100644 --- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -48,8 +48,7 @@ entry: ; CHECK-NEXT: st.param.b64 [param0], %rd[[A_REG]] ; CHECK-NEXT: .param .b64 param1; ; CHECK-NEXT: st.param.b64 [param1], %rd[[SP_REG]] -; CHECK-NEXT: call.uni -; CHECK-NEXT: callee, +; CHECK-NEXT: call.uni callee, call void @callee(ptr %a, ptr %buf) #2 ret void diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index dc6d504c2c66c..2232810d02128 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -203,12 +203,7 @@ define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .param .b32 param1; ; CHECK-NEXT: st.param.b32 [param1], %r5; ; CHECK-NEXT: .param .b32 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: use, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), use, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r6, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; diff --git a/llvm/test/CodeGen/NVPTX/convergent-mir-call.ll b/llvm/test/CodeGen/NVPTX/convergent-mir-call.ll index 5e85bf4554546..39a2d7f9e1504 100644 --- a/llvm/test/CodeGen/NVPTX/convergent-mir-call.ll +++ b/llvm/test/CodeGen/NVPTX/convergent-mir-call.ll @@ -9,18 +9,16 @@ declare void @conv() convergent declare void @not_conv() define void @test(ptr %f) { - ; CHECK: ConvergentCallUniPrintCall - ; CHECK-NEXT: @conv + ; CHECK: CALL_UNI_conv @conv call void @conv() - ; CHECK: CallUniPrintCall - ; CHECK-NEXT: @not_conv + ; CHECK: CALL_UNI @not_conv call void @not_conv() - ; CHECK: ConvergentCallPrintCall + ; CHECK: CALL_conv %{{[0-9]+}} 
call void %f() convergent - ; CHECK: CallPrintCall + ; CHECK: CALL %{{[0-9]+}} call void %f() ret void diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll index 71a46fa6d4820..d1b478d341915 100644 --- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll +++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll @@ -9,12 +9,7 @@ define %struct.64 @test_return_type_mismatch(ptr %p) { ; CHECK-LABEL: test_return_type_mismatch( ; CHECK: .param .align 1 .b8 retval0[8]; ; CHECK-NEXT: prototype_0 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _); -; CHECK-NEXT: call (retval0), -; CHECK-NEXT: %rd -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ) -; CHECK-NEXT: , prototype_0; +; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_0; %ret = call %struct.64 @callee(ptr %p) ret %struct.64 %ret } @@ -23,12 +18,7 @@ define i64 @test_param_type_mismatch(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch( ; CHECK: .param .b64 retval0; ; CHECK-NEXT: prototype_1 : .callprototype (.param .b64 _) _ (.param .b64 _); -; CHECK-NEXT: call (retval0), -; CHECK-NEXT: %rd -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ) -; CHECK-NEXT: , prototype_1; +; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_1; %ret = call i64 @callee(i64 7) ret i64 %ret } @@ -37,13 +27,7 @@ define i64 @test_param_count_mismatch(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch( ; CHECK: .param .b64 retval0; ; CHECK-NEXT: prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b64 _); -; CHECK-NEXT: call (retval0), -; CHECK-NEXT: %rd -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ) -; CHECK-NEXT: , prototype_2; +; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0, param1), prototype_2; %ret = call i64 @callee(ptr %p, i64 7) ret i64 %ret } @@ -52,12 +36,7 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: 
test_return_type_mismatch_variadic( ; CHECK: .param .align 1 .b8 retval0[8]; ; CHECK-NEXT: prototype_3 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _); -; CHECK-NEXT: call (retval0), -; CHECK-NEXT: %rd -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ) -; CHECK-NEXT: , prototype_3; +; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_3; %ret = call %struct.64 (ptr, ...) @callee_variadic(ptr %p) ret %struct.64 %ret } @@ -65,12 +44,7 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { define i64 @test_param_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch_variadic( ; CHECK: .param .b64 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: callee_variadic -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ) +; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1); %ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7) ret i64 %ret } @@ -78,12 +52,7 @@ define i64 @test_param_type_mismatch_variadic(ptr %p) { define i64 @test_param_count_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch_variadic( ; CHECK: .param .b64 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: callee_variadic -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ) +; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1); %ret = call i64 (ptr, ...) 
@callee_variadic(ptr %p, i64 7) ret i64 %ret } diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index b73aea76a4528..4d2ba7d00f872 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -24,11 +24,7 @@ define i32 @test_dynamic_stackalloc(i64 %n) { ; CHECK-32-NEXT: .param .b32 param0; ; CHECK-32-NEXT: st.param.b32 [param0], %r5; ; CHECK-32-NEXT: .param .b32 retval0; -; CHECK-32-NEXT: call.uni (retval0), -; CHECK-32-NEXT: bar, -; CHECK-32-NEXT: ( -; CHECK-32-NEXT: param0 -; CHECK-32-NEXT: ); +; CHECK-32-NEXT: call.uni (retval0), bar, (param0); ; CHECK-32-NEXT: ld.param.b32 %r6, [retval0]; ; CHECK-32-NEXT: } // callseq 0 ; CHECK-32-NEXT: st.param.b32 [func_retval0], %r6; @@ -49,11 +45,7 @@ define i32 @test_dynamic_stackalloc(i64 %n) { ; CHECK-64-NEXT: .param .b64 param0; ; CHECK-64-NEXT: st.param.b64 [param0], %rd5; ; CHECK-64-NEXT: .param .b32 retval0; -; CHECK-64-NEXT: call.uni (retval0), -; CHECK-64-NEXT: bar, -; CHECK-64-NEXT: ( -; CHECK-64-NEXT: param0 -; CHECK-64-NEXT: ); +; CHECK-64-NEXT: call.uni (retval0), bar, (param0); ; CHECK-64-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-64-NEXT: } // callseq 0 ; CHECK-64-NEXT: st.param.b32 [func_retval0], %r1; diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index c905fc04ce780..252edf4b02c76 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -263,12 +263,7 @@ declare half @test_callee(half %a, half %b) #0 ; CHECK-DAG: st.param.b16 [param0], [[A]]; ; CHECK-DAG: st.param.b16 [param1], [[B]]; ; CHECK-DAG: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; 
CHECK-NEXT: } ; CHECK-NEXT: st.param.b16 [func_retval0], [[R]]; @@ -287,12 +282,7 @@ define half @test_call(half %a, half %b) #0 { ; CHECK-DAG: st.param.b16 [param0], [[B]]; ; CHECK-DAG: st.param.b16 [param1], [[A]]; ; CHECK-DAG: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK-NEXT: } ; CHECK-NEXT: st.param.b16 [func_retval0], [[R]]; @@ -311,12 +301,7 @@ define half @test_call_flipped(half %a, half %b) #0 { ; CHECK-DAG: st.param.b16 [param0], [[B]]; ; CHECK-DAG: st.param.b16 [param1], [[A]]; ; CHECK-DAG: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK-NEXT: } ; CHECK-NEXT: st.param.b16 [func_retval0], [[R]]; @@ -650,8 +635,7 @@ else: ; CHECK: ld.b16 [[AB:%rs[0-9]+]], [%[[P1]]]; ; CHECK: { ; CHECK: st.param.b64 [param0], %[[P1]]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_dummy +; CHECK: call.uni (retval0), test_dummy ; CHECK: } ; CHECK: setp.ne.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 0; ; CHECK: @[[PRED]] bra [[LOOP]]; diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index fc7f53c5fdca3..8da2c1d1ebac2 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -467,12 +467,7 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .param .align 4 .b8 param1[4]; ; CHECK-NEXT: st.param.b32 [param1], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: 
param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -495,12 +490,7 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .param .align 4 .b8 param1[4]; ; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -523,12 +513,7 @@ define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .param .align 4 .b8 param1[4]; ; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 2 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll index 327851725991e..b74e531adba3f 100644 --- a/llvm/test/CodeGen/NVPTX/fma.ll +++ b/llvm/test/CodeGen/NVPTX/fma.ll @@ -40,12 +40,7 @@ define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { ; CHECK-NEXT: .param .b32 param1; ; CHECK-NEXT: st.param.b32 [param1], %r6; ; CHECK-NEXT: .param .b32 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: dummy_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), dummy_f32, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r7, [retval0]; ; 
CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r7; @@ -92,12 +87,7 @@ define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { ; CHECK-NEXT: .param .b64 param1; ; CHECK-NEXT: st.param.b64 [param1], %rd6; ; CHECK-NEXT: .param .b64 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: dummy_f64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), dummy_f64, (param0, param1); ; CHECK-NEXT: ld.param.b64 %rd7, [retval0]; ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll index d253df5ed1b9c..ed8f6b4511079 100644 --- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll +++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll @@ -50,11 +50,7 @@ define void @test_ld_param_escaping(ptr byval(i32) %a) { ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: st.param.b64 [param0], %rd2; -; CHECK-NEXT: call.uni -; CHECK-NEXT: escape, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni escape, (param0); ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: ret; call void @escape(ptr %a) @@ -72,11 +68,7 @@ define void @test_ld_param_byval(ptr byval(i32) %a) { ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: call.uni -; CHECK-NEXT: byval_user, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni byval_user, (param0); ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: ret; call void @byval_user(ptr %a) diff --git a/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll b/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll index d40f514acd408..de69d02ded5e4 100644 --- a/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll +++ b/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll @@ -42,11 +42,7 @@ define void @call(fp128 %x) { ; 
CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, %rd2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call, (param0); ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: ret; call void @call(fp128 %x) diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index bf1fb06c44688..d5ddadf2b21c5 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -647,12 +647,7 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .param .align 4 .b8 param1[4]; ; COMMON-NEXT: st.param.b32 [param1], %r2; ; COMMON-NEXT: .param .align 4 .b8 retval0[4]; -; COMMON-NEXT: call.uni (retval0), -; COMMON-NEXT: test_callee, -; COMMON-NEXT: ( -; COMMON-NEXT: param0, -; COMMON-NEXT: param1 -; COMMON-NEXT: ); +; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1); ; COMMON-NEXT: ld.param.b32 %r3, [retval0]; ; COMMON-NEXT: } // callseq 0 ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; @@ -675,12 +670,7 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .param .align 4 .b8 param1[4]; ; COMMON-NEXT: st.param.b32 [param1], %r1; ; COMMON-NEXT: .param .align 4 .b8 retval0[4]; -; COMMON-NEXT: call.uni (retval0), -; COMMON-NEXT: test_callee, -; COMMON-NEXT: ( -; COMMON-NEXT: param0, -; COMMON-NEXT: param1 -; COMMON-NEXT: ); +; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1); ; COMMON-NEXT: ld.param.b32 %r3, [retval0]; ; COMMON-NEXT: } // callseq 1 ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; @@ -703,12 +693,7 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .param .align 4 .b8 param1[4]; ; COMMON-NEXT: st.param.b32 [param1], %r1; ; COMMON-NEXT: .param .align 4 .b8 retval0[4]; -; COMMON-NEXT: call.uni (retval0), -; 
COMMON-NEXT: test_callee, -; COMMON-NEXT: ( -; COMMON-NEXT: param0, -; COMMON-NEXT: param1 -; COMMON-NEXT: ); +; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1); ; COMMON-NEXT: ld.param.b32 %r3, [retval0]; ; COMMON-NEXT: } // callseq 2 ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 7cc7468bc7de7..72c279bee4268 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -833,12 +833,7 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .param .align 4 .b8 param1[4]; ; CHECK-NEXT: st.param.b32 [param1], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -861,12 +856,7 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .param .align 4 .b8 param1[4]; ; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -889,12 +879,7 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .param .align 4 .b8 param1[4]; ; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; 
CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 2 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll index 1341a04c939c6..eae0321433946 100644 --- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll +++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll @@ -33,13 +33,7 @@ define internal i32 @foo() { ; CHECK-NEXT: st.param.b64 [param1], %rd4; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _); -; CHECK-NEXT: call (retval0), -; CHECK-NEXT: %rd1, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ) -; CHECK-NEXT: , prototype_0; +; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_0; ; CHECK-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -76,13 +70,7 @@ define internal i32 @bar() { ; CHECK-NEXT: st.param.b64 [param1], %rd5; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _); -; CHECK-NEXT: call (retval0), -; CHECK-NEXT: %rd1, -; CHECK-NEXT: ( -; CHECK-NEXT: param0, -; CHECK-NEXT: param1 -; CHECK-NEXT: ) -; CHECK-NEXT: , prototype_1; +; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_1; ; CHECK-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index 419c780f7d82a..9e9705709f2bd 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -14,10 +14,7 @@ define void @foo(ptr %ptr) { ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 retval0[16]; -; 
CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: bar, -; CHECK-NEXT: ( -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), bar, (); ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll index 2bfd891a04a17..a9004d00e7807 100644 --- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll @@ -58,11 +58,7 @@ define ptx_kernel void @foo2(i32 %a) { ; PTX32-NEXT: { // callseq 0, 0 ; PTX32-NEXT: .param .b32 param0; ; PTX32-NEXT: st.param.b32 [param0], %r2; -; PTX32-NEXT: call.uni -; PTX32-NEXT: bar, -; PTX32-NEXT: ( -; PTX32-NEXT: param0 -; PTX32-NEXT: ); +; PTX32-NEXT: call.uni bar, (param0); ; PTX32-NEXT: } // callseq 0 ; PTX32-NEXT: ret; ; @@ -84,11 +80,7 @@ define ptx_kernel void @foo2(i32 %a) { ; PTX64-NEXT: { // callseq 0, 0 ; PTX64-NEXT: .param .b64 param0; ; PTX64-NEXT: st.param.b64 [param0], %rd1; -; PTX64-NEXT: call.uni -; PTX64-NEXT: bar, -; PTX64-NEXT: ( -; PTX64-NEXT: param0 -; PTX64-NEXT: ); +; PTX64-NEXT: call.uni bar, (param0); ; PTX64-NEXT: } // callseq 0 ; PTX64-NEXT: ret; %local = alloca i32, align 4 @@ -159,20 +151,12 @@ define void @foo4() { ; PTX32-NEXT: { // callseq 1, 0 ; PTX32-NEXT: .param .b32 param0; ; PTX32-NEXT: st.param.b32 [param0], %r1; -; PTX32-NEXT: call.uni -; PTX32-NEXT: bar, -; PTX32-NEXT: ( -; PTX32-NEXT: param0 -; PTX32-NEXT: ); +; PTX32-NEXT: call.uni bar, (param0); ; PTX32-NEXT: } // callseq 1 ; PTX32-NEXT: { // callseq 2, 0 ; PTX32-NEXT: .param .b32 param0; ; PTX32-NEXT: st.param.b32 [param0], %r3; -; PTX32-NEXT: call.uni -; PTX32-NEXT: bar, -; PTX32-NEXT: ( -; PTX32-NEXT: param0 -; PTX32-NEXT: ); +; PTX32-NEXT: call.uni bar, (param0); ; PTX32-NEXT: } // callseq 2 ; PTX32-NEXT: ret; ; @@ -197,20 +181,12 @@ define void @foo4() { ; PTX64-NEXT: { // callseq 1, 0 ; PTX64-NEXT: .param .b64 param0; ; 
PTX64-NEXT: st.param.b64 [param0], %rd1; -; PTX64-NEXT: call.uni -; PTX64-NEXT: bar, -; PTX64-NEXT: ( -; PTX64-NEXT: param0 -; PTX64-NEXT: ); +; PTX64-NEXT: call.uni bar, (param0); ; PTX64-NEXT: } // callseq 1 ; PTX64-NEXT: { // callseq 2, 0 ; PTX64-NEXT: .param .b64 param0; ; PTX64-NEXT: st.param.b64 [param0], %rd3; -; PTX64-NEXT: call.uni -; PTX64-NEXT: bar, -; PTX64-NEXT: ( -; PTX64-NEXT: param0 -; PTX64-NEXT: ); +; PTX64-NEXT: call.uni bar, (param0); ; PTX64-NEXT: } // callseq 2 ; PTX64-NEXT: ret; %A = alloca i32 diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index c3f94455b3038..0a2cd81ac904c 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -133,12 +133,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-NEXT: st.param.b64 [param0], %rd3; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _); -; PTX-NEXT: call (retval0), -; PTX-NEXT: %rd1, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ) -; PTX-NEXT: , prototype_0; +; PTX-NEXT: call (retval0), %rd1, (param0), prototype_0; ; PTX-NEXT: ld.param.b32 %r1, [retval0]; ; PTX-NEXT: } // callseq 0 ; PTX-NEXT: ret; @@ -182,14 +177,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: st.param.b64 [param2], %rd4; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _); -; PTX-NEXT: call (retval0), -; PTX-NEXT: %rd1, -; PTX-NEXT: ( -; PTX-NEXT: param0, -; PTX-NEXT: param1, -; PTX-NEXT: param2 -; PTX-NEXT: ) -; PTX-NEXT: , prototype_1; +; PTX-NEXT: call (retval0), %rd1, (param0, param1, param2), prototype_1; ; PTX-NEXT: ld.param.b32 %r2, [retval0]; ; PTX-NEXT: } // callseq 1 ; PTX-NEXT: ret; @@ -284,12 +272,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr 
byval(i32) %input, ptr %ou ; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _); -; PTX-NEXT: call (retval0), -; PTX-NEXT: %rd1, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ) -; PTX-NEXT: , prototype_2; +; PTX-NEXT: call (retval0), %rd1, (param0), prototype_2; ; PTX-NEXT: ld.param.b32 %r3, [retval0]; ; PTX-NEXT: } // callseq 2 ; PTX-NEXT: ret; @@ -330,12 +313,7 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _); -; PTX-NEXT: call (retval0), -; PTX-NEXT: %rd1, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ) -; PTX-NEXT: , prototype_3; +; PTX-NEXT: call (retval0), %rd1, (param0), prototype_3; ; PTX-NEXT: ld.param.b32 %r4, [retval0]; ; PTX-NEXT: } // callseq 3 ; PTX-NEXT: st.param.b32 [func_retval0], %r3; @@ -561,11 +539,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; PTX-NEXT: { // callseq 4, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; -; PTX-NEXT: call.uni -; PTX-NEXT: device_func, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ); +; PTX-NEXT: call.uni device_func, (param0); ; PTX-NEXT: } // callseq 4 ; PTX-NEXT: ret; call void @device_func(ptr byval(i32) align 4 %input) diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 246408ecf6a3a..6f334b075241b 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC ; RUN: 
opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC @@ -47,11 +47,7 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 % ; PTX-NEXT: .param .b64 param0; ; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b64 retval0; -; PTX-NEXT: call.uni (retval0), -; PTX-NEXT: escape, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ); +; PTX-NEXT: call.uni (retval0), escape, (param0); ; PTX-NEXT: ld.param.b64 %rd6, [retval0]; ; PTX-NEXT: } // callseq 0 ; PTX-NEXT: ret; @@ -89,11 +85,7 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { ; PTX-NEXT: .param .b64 param0; ; PTX-NEXT: st.param.b64 [param0], %rd2; ; PTX-NEXT: .param .b64 retval0; -; PTX-NEXT: call.uni (retval0), -; PTX-NEXT: escape, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ); +; PTX-NEXT: call.uni (retval0), escape, (param0); ; PTX-NEXT: ld.param.b64 %rd3, [retval0]; ; PTX-NEXT: } // callseq 1 ; PTX-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 54495cf0d61f3..d268562914755 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -153,11 +153,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out ; PTX-NEXT: { // callseq 0, 0 ; PTX-NEXT: .param .b64 param0; ; PTX-NEXT: st.param.b64 [param0], %rd1; -; PTX-NEXT: call.uni -; PTX-NEXT: _Z6escapePv, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ); +; PTX-NEXT: call.uni _Z6escapePv, (param0); ; PTX-NEXT: } // callseq 0 ; PTX-NEXT: ret; entry: @@ -198,11 +194,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; PTX-NEXT: { // callseq 1, 0 ; PTX-NEXT: .param .b64 param0; ; PTX-NEXT: st.param.b64 [param0], %rd3; -; PTX-NEXT: call.uni -; PTX-NEXT: _Z6escapePv, -; 
PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ); +; PTX-NEXT: call.uni _Z6escapePv, (param0); ; PTX-NEXT: } // callseq 1 ; PTX-NEXT: ret; entry: @@ -902,11 +894,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; PTX-NEXT: { // callseq 2, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; -; PTX-NEXT: call.uni -; PTX-NEXT: device_func, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ); +; PTX-NEXT: call.uni device_func, (param0); ; PTX-NEXT: } // callseq 2 ; PTX-NEXT: ret; call void @device_func(ptr byval(i32) align 4 %input) @@ -929,11 +917,7 @@ define void @device_func(ptr byval(i32) align 4 %input) { ; PTX-NEXT: { // callseq 3, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; -; PTX-NEXT: call.uni -; PTX-NEXT: device_func, -; PTX-NEXT: ( -; PTX-NEXT: param0 -; PTX-NEXT: ); +; PTX-NEXT: call.uni device_func, (param0); ; PTX-NEXT: } // callseq 3 ; PTX-NEXT: ret; call void @device_func(ptr byval(i32) align 4 %input) diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index 7e907990147a5..2e9eb6913ac0e 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -21,11 +21,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: st.param.b64 [param0], 0d0000000000000000; ; CHECK-NEXT: .param .b64 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: quux, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), quux, (param0); ; CHECK-NEXT: ld.param.b64 %rd1, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; diff --git a/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll index a1f0577c2218b..448960181ae42 100644 --- a/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll 
+++ b/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll @@ -11,10 +11,7 @@ define dso_local void @naked() naked "frame-pointer"="all" { ; CHECK-32-EMPTY: ; CHECK-32-NEXT: // %bb.0: ; CHECK-32-NEXT: { // callseq 0, 0 -; CHECK-32-NEXT: call.uni -; CHECK-32-NEXT: main, -; CHECK-32-NEXT: ( -; CHECK-32-NEXT: ); +; CHECK-32-NEXT: call.uni main, (); ; CHECK-32-NEXT: } // callseq 0 ; CHECK-32-NEXT: // begin inline asm ; CHECK-32-NEXT: exit; @@ -26,10 +23,7 @@ define dso_local void @naked() naked "frame-pointer"="all" { ; CHECK-64-EMPTY: ; CHECK-64-NEXT: // %bb.0: ; CHECK-64-NEXT: { // callseq 0, 0 -; CHECK-64-NEXT: call.uni -; CHECK-64-NEXT: main, -; CHECK-64-NEXT: ( -; CHECK-64-NEXT: ); +; CHECK-64-NEXT: call.uni main, (); ; CHECK-64-NEXT: } // callseq 0 ; CHECK-64-NEXT: // begin inline asm ; CHECK-64-NEXT: exit; @@ -45,10 +39,7 @@ define dso_local void @normal() "frame-pointer"="all" { ; CHECK-32-EMPTY: ; CHECK-32-NEXT: // %bb.0: ; CHECK-32-NEXT: { // callseq 1, 0 -; CHECK-32-NEXT: call.uni -; CHECK-32-NEXT: main, -; CHECK-32-NEXT: ( -; CHECK-32-NEXT: ); +; CHECK-32-NEXT: call.uni main, (); ; CHECK-32-NEXT: } // callseq 1 ; CHECK-32-NEXT: // begin inline asm ; CHECK-32-NEXT: exit; @@ -60,10 +51,7 @@ define dso_local void @normal() "frame-pointer"="all" { ; CHECK-64-EMPTY: ; CHECK-64-NEXT: // %bb.0: ; CHECK-64-NEXT: { // callseq 1, 0 -; CHECK-64-NEXT: call.uni -; CHECK-64-NEXT: main, -; CHECK-64-NEXT: ( -; CHECK-64-NEXT: ); +; CHECK-64-NEXT: call.uni main, (); ; CHECK-64-NEXT: } // callseq 1 ; CHECK-64-NEXT: // begin inline asm ; CHECK-64-NEXT: exit; diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll index 4fc8786c1e2fe..cd2664e913824 100644 --- a/llvm/test/CodeGen/NVPTX/param-add.ll +++ b/llvm/test/CodeGen/NVPTX/param-add.ll @@ -37,11 +37,7 @@ define i32 @test(%struct.1float alignstack(32) %data) { ; CHECK-NEXT: st.param.b8 [param0+2], %r12; ; CHECK-NEXT: st.param.b8 [param0+3], %r13; ; CHECK-NEXT: .param .b32 retval0; -; 
CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), callee, (param0); ; CHECK-NEXT: ld.param.b32 %r14, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r14; diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index 4bea710e6dd93..263477df1dbfe 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -32,8 +32,7 @@ ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[C]] ; CHECK: .param .b32 retval0; -; CHECK: call.uni -; CHECK-NEXT: test_i1, +; CHECK: call.uni (retval0), test_i1, ; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0]; ; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -76,8 +75,7 @@ define signext i1 @test_i1s(i1 signext %a) { ; CHECK-DAG: st.param.b8 [param0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; ; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i1, +; CHECK: call.uni (retval0), test_v3i1, ; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; ; CHECK-DAG: st.param.b8 [func_retval0], [[RE0]] @@ -95,8 +93,7 @@ define <3 x i1> @test_v3i1(<3 x i1> %a) { ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK: st.param.b8 [param0], [[E0]]; ; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni (retval0), -; CHECK: test_v4i1, +; CHECK: call.uni (retval0), test_v4i1, ; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; ; CHECK: ld.param.b8 [[RE1:%rs[0-9]+]], [retval0+1]; ; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; @@ -120,8 +117,7 @@ define <4 x i1> @test_v4i1(<4 x i1> %a) { ; CHECK-DAG: st.param.b8 [param0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; ; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i1, +; 
CHECK: call.uni (retval0), test_v5i1, ; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; ; CHECK-DAG: st.param.b8 [func_retval0], [[RE0]] @@ -139,8 +135,7 @@ define <5 x i1> @test_v5i1(<5 x i1> %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i2, +; CHECK: call.uni (retval0), test_i2, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; @@ -156,8 +151,7 @@ define i2 @test_i2(i2 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i3, +; CHECK: call.uni (retval0), test_i3, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; @@ -176,8 +170,7 @@ define i3 @test_i3(i3 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[A]]; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8, +; CHECK: call.uni (retval0), test_i8, ; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0]; ; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -196,8 +189,7 @@ define i8 @test_i8(i8 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[A]]; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8s, +; CHECK: call.uni (retval0), test_i8s, ; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0]; ; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? 
; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; @@ -216,8 +208,7 @@ define signext i8 @test_i8s(i8 signext %a) { ; CHECK: .param .align 4 .b8 param0[4]; ; CHECK: st.param.b32 [param0], [[R]] ; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i8, +; CHECK: call.uni (retval0), test_v3i8, ; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0]; ; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very ; interesting here, so it's skipped. @@ -235,8 +226,7 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) { ; CHECK: .param .align 4 .b8 param0[4]; ; CHECK: st.param.b32 [param0], [[R]]; ; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i8, +; CHECK: call.uni (retval0), test_v4i8, ; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[RET]]; ; CHECK-NEXT: ret; @@ -254,8 +244,7 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) { ; CHECK-DAG: st.param.v4.b8 [param0], ; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i8, +; CHECK: call.uni (retval0), test_v5i8, ; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; ; CHECK-DAG: st.param.v4.b8 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} @@ -272,8 +261,7 @@ define <5 x i8> @test_v5i8(<5 x i8> %a) { ; CHECK: ld.param.b16 {{%rs[0-9]+}}, [test_i11_param_0]; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i11, +; CHECK: call.uni (retval0), test_i11, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; @@ -290,8 +278,7 @@ define i11 @test_i11(i11 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[E32]]; ; CHECK: .param .b32 retval0; -; CHECK: call.uni 
(retval0), -; CHECK-NEXT: test_i16, +; CHECK: call.uni (retval0), test_i16, ; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0]; ; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -309,8 +296,7 @@ define i16 @test_i16(i16 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[E32]]; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i16s, +; CHECK: call.uni (retval0), test_i16s, ; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0]; ; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]]; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -329,8 +315,7 @@ define signext i16 @test_i16s(i16 signext %a) { ; CHECK: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b16 [param0+4], [[E2]]; ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i16, +; CHECK: call.uni (retval0), test_v3i16, ; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0]; ; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4]; ; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[RE0]], [[RE1]]}; @@ -348,8 +333,7 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) { ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i16, +; CHECK: call.uni (retval0), test_v4i16, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]} ; CHECK-NEXT: ret; @@ -367,8 +351,7 @@ define <4 x i16> @test_v4i16(<4 x i16> %a) { ; CHECK-DAG: st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; ; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i16, +; CHECK: call.uni (retval0), test_v5i16, ; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, 
[retval0]; ; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; ; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} @@ -386,8 +369,7 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) { ; CHECK: .param .align 2 .b8 param0[2]; ; CHECK: st.param.b16 [param0], [[E]]; ; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f16, +; CHECK: call.uni (retval0), test_f16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]] ; CHECK-NEXT: ret; @@ -403,8 +385,7 @@ define half @test_f16(half %a) { ; CHECK: .param .align 4 .b8 param0[4]; ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v2f16, +; CHECK: call.uni (retval0), test_v2f16, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]] ; CHECK-NEXT: ret; @@ -420,8 +401,7 @@ define <2 x half> @test_v2f16(<2 x half> %a) { ; CHECK: .param .align 2 .b8 param0[2]; ; CHECK: st.param.b16 [param0], [[E]]; ; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_bf16, +; CHECK: call.uni (retval0), test_bf16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]] ; CHECK-NEXT: ret; @@ -437,8 +417,7 @@ define bfloat @test_bf16(bfloat %a) { ; CHECK: .param .align 4 .b8 param0[4]; ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v2bf16, +; CHECK: call.uni (retval0), test_v2bf16, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]] ; CHECK-NEXT: ret; @@ -457,8 +436,7 @@ define <2 x bfloat> @test_v2bf16(<2 x bfloat> %a) { ; CHECK-DAG: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; ; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v3f16, +; CHECK: 
call.uni (retval0), test_v3f16, ; CHECK-DAG: ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b16 [[R2:%rs[0-9]+]], [retval0+4]; ; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[R0]], [[R1]]}; @@ -476,8 +454,7 @@ define <3 x half> @test_v3f16(<3 x half> %a) { ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]}; ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v4f16, +; CHECK: call.uni (retval0), test_v4f16, ; CHECK: ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v2.b32 [func_retval0], {[[RH01]], [[RH23]]}; ; CHECK: ret; @@ -495,8 +472,7 @@ define <4 x half> @test_v4f16(<4 x half> %a) { ; CHECK-DAG: st.param.v4.b16 [param0], ; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; ; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v5f16, +; CHECK: call.uni (retval0), test_v5f16, ; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b16 [[R4:%rs[0-9]+]], [retval0+8]; ; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]}; @@ -514,8 +490,7 @@ define <5 x half> @test_v5f16(<5 x half> %a) { ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]}; ; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v8f16, +; CHECK: call.uni (retval0), test_v8f16, ; CHECK: ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v4.b32 [func_retval0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; ; CHECK: ret; @@ -535,8 +510,7 @@ define <8 x half> @test_v8f16(<8 x half> %a) { ; CHECK-DAG: st.param.v4.b16 [param0+8], ; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; ; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_v9f16, 
+; CHECK: call.uni (retval0), test_v9f16, ; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.v4.b16 {[[R4:%rs[0-9]+]], [[R5:%rs[0-9]+]], [[R6:%rs[0-9]+]], [[R7:%rs[0-9]+]]}, [retval0+8]; ; CHECK-DAG: ld.param.b16 [[R8:%rs[0-9]+]], [retval0+16]; @@ -557,8 +531,7 @@ define <9 x half> @test_v9f16(<9 x half> %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i19, +; CHECK: call.uni (retval0), test_i19, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; @@ -575,8 +548,7 @@ define i19 @test_i19(i19 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i23, +; CHECK: call.uni (retval0), test_i23, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; @@ -593,8 +565,7 @@ define i23 @test_i23(i23 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i24, +; CHECK: call.uni (retval0), test_i24, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; @@ -610,8 +581,7 @@ define i24 @test_i24(i24 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i29, +; CHECK: call.uni (retval0), test_i29, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; @@ -627,8 +597,7 @@ define i29 @test_i29(i29 %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .b32 retval0; -; CHECK: call.uni 
(retval0), -; CHECK-NEXT: test_i32, +; CHECK: call.uni (retval0), test_i32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -646,8 +615,7 @@ define i32 @test_i32(i32 %a) { ; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b32 [param0+8], [[E2]]; ; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i32, +; CHECK: call.uni (retval0), test_v3i32, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; ; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; @@ -665,8 +633,7 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) { ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i32, +; CHECK: call.uni (retval0), test_v4i32, ; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} ; CHECK-NEXT: ret; @@ -684,8 +651,7 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) { ; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; ; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i32, +; CHECK: call.uni (retval0), test_v5i32, ; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} @@ -703,8 +669,7 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) { ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f32, +; 
CHECK: call.uni (retval0), test_f32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -721,8 +686,7 @@ define float @test_f32(float %a) { ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i40, +; CHECK: call.uni (retval0), test_i40, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; @@ -739,8 +703,7 @@ define i40 @test_i40(i40 %a) { ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i47, +; CHECK: call.uni (retval0), test_i47, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; @@ -757,8 +720,7 @@ define i47 @test_i47(i47 %a) { ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i48, +; CHECK: call.uni (retval0), test_i48, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; @@ -776,8 +738,7 @@ define i48 @test_i48(i48 %a) { ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i51, +; CHECK: call.uni (retval0), test_i51, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; @@ -795,8 +756,7 @@ define i51 @test_i51(i51 %a) { ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i56, +; CHECK: call.uni (retval0), test_i56, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; 
CHECK-NEXT: ret; @@ -812,8 +772,7 @@ define i56 @test_i56(i56 %a) { ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i57, +; CHECK: call.uni (retval0), test_i57, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; @@ -829,8 +788,7 @@ define i57 @test_i57(i57 %a) { ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i64, +; CHECK: call.uni (retval0), test_i64, ; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; ; CHECK: st.param.b64 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -848,8 +806,7 @@ define i64 @test_i64(i64 %a) { ; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b64 [param0+16], [[E2]]; ; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i64, +; CHECK: call.uni (retval0), test_v3i64, ; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; ; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; ; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[RE0]], [[RE1]]}; @@ -872,8 +829,7 @@ define <3 x i64> @test_v3i64(<3 x i64> %a) { ; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; ; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i64, +; CHECK: call.uni (retval0), test_v4i64, ; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; ; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; ; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; @@ -893,8 +849,7 @@ define <4 x i64> @test_v4i64(<4 x i64> %a) { ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i1, 
+; CHECK: call.uni (retval0), test_s_i1, ; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b8 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -910,8 +865,7 @@ define %s_i1 @test_s_i1(%s_i1 %a) { ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i8, +; CHECK: call.uni (retval0), test_s_i8, ; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b8 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -927,8 +881,7 @@ define %s_i8 @test_s_i8(%s_i8 %a) { ; CHECK: .param .align 2 .b8 param0[2]; ; CHECK: st.param.b16 [param0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i16, +; CHECK: call.uni (retval0), test_s_i16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -944,8 +897,7 @@ define %s_i16 @test_s_i16(%s_i16 %a) { ; CHECK: .param .align 2 .b8 param0[2]; ; CHECK: st.param.b16 [param0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_f16, +; CHECK: call.uni (retval0), test_s_f16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -961,8 +913,7 @@ define %s_f16 @test_s_f16(%s_f16 %a) { ; CHECK: .param .align 4 .b8 param0[4] ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32, +; CHECK: call.uni (retval0), test_s_i32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -978,8 +929,7 @@ define %s_i32 @test_s_i32(%s_i32 %a) { ; CHECK: .param .align 4 .b8 param0[4] ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_f32, +; CHECK: call.uni (retval0), test_s_f32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; 
CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -995,8 +945,7 @@ define %s_f32 @test_s_f32(%s_f32 %a) { ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i64, +; CHECK: call.uni (retval0), test_s_i64, ; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; ; CHECK: st.param.b64 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -1021,8 +970,7 @@ define %s_i64 @test_s_i64(%s_i64 %a) { ; CHECK-DAG: st.param.b32 [param0+12], [[E3]]; ; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32f32, +; CHECK: call.uni (retval0), test_s_i32f32, ; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b32 [[RE1:%r[0-9]+]], [retval0+4]; ; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; @@ -1051,8 +999,7 @@ define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { ; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; ; CHECK: st.param.b64 [param0+16], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32x4, +; CHECK: call.uni (retval0), test_s_i32x4, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; ; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; @@ -1081,8 +1028,7 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { ; CHECK: st.param.b32 [param0+16], [[E4]]; ; CHECK: st.param.b64 [param0+24], [[E5]]; ; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_s_i1i32x4, +; CHECK: call.uni (retval0), test_s_i1i32x4, ; CHECK: ( ; CHECK: param0 ; CHECK: ); @@ -1160,8 +1106,7 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: st.param.b8 [param0+23], ; CHECK-DAG: st.param.b8 [param0+24], ; CHECK: .param .align 1 .b8 retval0[25]; -; CHECK: call.uni (retval0), 
-; CHECK-NEXT: test_s_i1i32x4p, +; CHECK: call.uni (retval0), test_s_i1i32x4p, ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0]; ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1]; ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2]; @@ -1237,8 +1182,7 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { ; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; ; CHECK: st.param.b32 [param0+64], [[E15]]; ; CHECK: .param .align 16 .b8 retval0[80]; -; CHECK: call.uni (retval0), -; CHECK: test_s_crossfield, +; CHECK: call.uni (retval0), test_s_crossfield, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; ; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll index 22a648c7a9786..f490c5f73d425 100644 --- a/llvm/test/CodeGen/NVPTX/param-overalign.ll +++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx | FileCheck %s ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} @@ -18,27 +19,23 @@ target triple = "nvptx64-nvidia-cuda" ; CHECK-NEXT: ; define float @caller_md(float %a, float %b) { -; CHECK-LABEL: .visible .func (.param .b32 func_retval0) caller_md( -; CHECK-NEXT: .param .b32 caller_md_param_0, -; CHECK-NEXT: .param .b32 caller_md_param_1 -; CHECK-NEXT: ) -; CHECK-NEXT: { - -; CHECK: ld.param.b32 %r1, [caller_md_param_0]; +; CHECK-LABEL: caller_md( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [caller_md_param_0]; ; CHECK-NEXT: ld.param.b32 %r2, [caller_md_param_1]; -; CHECK-NEXT: { +; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 
param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: .param .b32 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: callee_md, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), callee_md, (param0); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } +; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; + %s1 = insertvalue %struct.float2 poison, float %a, 0 %s2 = insertvalue %struct.float2 %s1, float %b, 1 %r = call float @callee_md(%struct.float2 %s2) @@ -46,15 +43,16 @@ define float @caller_md(float %a, float %b) { } define float @callee_md(%struct.float2 alignstack(8) %a) { -; CHECK-LABEL: .visible .func (.param .b32 func_retval0) callee_md( -; CHECK-NEXT: .param .align 8 .b8 callee_md_param_0[8] -; CHECK-NEXT: ) -; CHECK-NEXT: { - -; CHECK: ld.param.v2.b32 {%r1, %r2}, [callee_md_param_0]; +; CHECK-LABEL: callee_md( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [callee_md_param_0]; ; CHECK-NEXT: add.rn.f32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; + %v0 = extractvalue %struct.float2 %a, 0 %v1 = extractvalue %struct.float2 %a, 1 %2 = fadd float %v0, %v1 @@ -62,27 +60,23 @@ define float @callee_md(%struct.float2 alignstack(8) %a) { } define float @caller(float %a, float %b) { -; CHECK-LABEL: .visible .func (.param .b32 func_retval0) caller( -; CHECK-NEXT: .param .b32 caller_param_0, -; CHECK-NEXT: .param .b32 caller_param_1 -; CHECK-NEXT: ) -; CHECK-NEXT: { - -; CHECK: ld.param.b32 %r1, [caller_param_0]; +; CHECK-LABEL: caller( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [caller_param_0]; ; CHECK-NEXT: ld.param.b32 %r2, [caller_param_1]; -; CHECK-NEXT: { +; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: 
st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: .param .b32 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: callee, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), callee, (param0); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } +; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; + %s1 = insertvalue %struct.float2 poison, float %a, 0 %s2 = insertvalue %struct.float2 %s1, float %b, 1 %r = call float @callee(%struct.float2 %s2) @@ -90,15 +84,16 @@ define float @caller(float %a, float %b) { } define float @callee(%struct.float2 alignstack(8) %a ) { -; CHECK-LABEL: .visible .func (.param .b32 func_retval0) callee( -; CHECK-NEXT: .param .align 8 .b8 callee_param_0[8] -; CHECK-NEXT: ) -; CHECK-NEXT: { - -; CHECK: ld.param.v2.b32 {%r1, %r2}, [callee_param_0]; +; CHECK-LABEL: callee( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [callee_param_0]; ; CHECK-NEXT: add.rn.f32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; + %v0 = extractvalue %struct.float2 %a, 0 %v1 = extractvalue %struct.float2 %a, 1 %2 = fadd float %v0, %v1 @@ -106,9 +101,15 @@ define float @callee(%struct.float2 alignstack(8) %a ) { } define alignstack(8) %struct.float2 @aligned_return(%struct.float2 %a ) { -; CHECK-LABEL: .visible .func (.param .align 8 .b8 func_retval0[8]) aligned_return( -; CHECK-NEXT: .param .align 4 .b8 aligned_return_param_0[8] -; CHECK-NEXT: ) -; CHECK-NEXT: { +; CHECK-LABEL: aligned_return( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [aligned_return_param_0+4]; +; CHECK-NEXT: ld.param.b32 %r2, [aligned_return_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0+4], %r1; +; CHECK-NEXT: ret; ret %struct.float2 %a } diff --git 
a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index abb1aff867754..892e49a5fe82a 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -86,11 +86,7 @@ define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[4]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x1, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x1, (param0); ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; %1 = load i32, ptr %in, align 4 %call = tail call fastcc [1 x i32] @callee_St4x1(i32 %1) #2 @@ -118,11 +114,7 @@ define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .align 16 .b8 param0[8]; ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[8]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x2, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x2, (param0); ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; %agg.tmp = alloca %struct.St4x2, align 8 %1 = load i64, ptr %in, align 4 @@ -160,11 +152,7 @@ define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[12]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x3, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x3, (param0); ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8]; %call = tail call fastcc [3 x i32] @callee_St4x3(ptr noundef nonnull byval(%struct.St4x3) align 
4 %in) #2 @@ -207,11 +195,7 @@ define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[16]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x4, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x4, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; %call = tail call fastcc [4 x i32] @callee_St4x4(ptr noundef nonnull byval(%struct.St4x4) align 4 %in) #2 %.fca.0.extract = extractvalue [4 x i32] %call, 0 @@ -258,11 +242,7 @@ define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[20]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x5, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x5, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16]; %call = tail call fastcc [5 x i32] @callee_St4x5(ptr noundef nonnull byval(%struct.St4x5) align 4 %in) #2 @@ -318,11 +298,7 @@ define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[24]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x6, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x6, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, 
[retval0]; ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; %call = tail call fastcc [6 x i32] @callee_St4x6(ptr noundef nonnull byval(%struct.St4x6) align 4 %in) #2 @@ -385,11 +361,7 @@ define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[28]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x7, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x7, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+24]; @@ -460,11 +432,7 @@ define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[32]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St4x8, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St4x8, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; %call = tail call fastcc [8 x i32] @callee_St4x8(ptr noundef nonnull byval(%struct.St4x8) align 4 %in) #2 @@ -537,11 +505,7 @@ define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[8]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x1, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: 
call.uni (retval0), callee_St8x1, (param0); ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; %1 = load i64, ptr %in, align 8 %call = tail call fastcc [1 x i64] @callee_St8x1(i64 %1) #2 @@ -569,11 +533,7 @@ define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[16]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x2, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St8x2, (param0); ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; %call = tail call fastcc [2 x i64] @callee_St8x2(ptr noundef nonnull byval(%struct.St8x2) align 8 %in) #2 %.fca.0.extract = extractvalue [2 x i64] %call, 0 @@ -608,11 +568,7 @@ define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[24]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x3, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St8x3, (param0); ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16]; %call = tail call fastcc [3 x i64] @callee_St8x3(ptr noundef nonnull byval(%struct.St8x3) align 8 %in) #2 @@ -656,11 +612,7 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[32]; - ; CHECK: call.uni (retval0), - ; CHECK-NEXT: callee_St8x4, - ; CHECK-NEXT: ( - ; CHECK-NEXT: param0 - ; CHECK-NEXT: ); + ; CHECK: call.uni (retval0), callee_St8x4, (param0); ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, 
{{%rd[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16]; %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2 diff --git a/llvm/test/CodeGen/NVPTX/shift-opt.ll b/llvm/test/CodeGen/NVPTX/shift-opt.ll index b165b4cb4b262..f0813609268e9 100644 --- a/llvm/test/CodeGen/NVPTX/shift-opt.ll +++ b/llvm/test/CodeGen/NVPTX/shift-opt.ll @@ -131,11 +131,7 @@ define i64 @test_negative_use_lop(i64 %x, i32 %y) { ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: st.param.b64 [param0], %rd3; -; CHECK-NEXT: call.uni -; CHECK-NEXT: use, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni use, (param0); ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; @@ -164,11 +160,7 @@ define i64 @test_negative_use_shl(i64 %x, i32 %y) { ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: st.param.b64 [param0], %rd2; -; CHECK-NEXT: call.uni -; CHECK-NEXT: use, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni use, (param0); ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll index bdab9958fe2b2..50d3e8049a947 100644 --- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll +++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll @@ -28,11 +28,7 @@ define void @st_param_i8_i16() { ; CHECK-NEXT: .param .align 2 .b8 param0[4]; ; CHECK-NEXT: st.param.b8 [param0], 1; ; CHECK-NEXT: st.param.b16 [param0+2], 2; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_i8_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_i8_i16, (param0); ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: ret; call void @call_i8_i16(%struct.A { i8 1, i16 2 }) @@ -48,11 +44,7 @@ define void @st_param_i32() { ; CHECK-NEXT: { // callseq 1, 
0 ; CHECK-NEXT: .param .b32 param0; ; CHECK-NEXT: st.param.b32 [param0], 3; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_i32, (param0); ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: ret; call void @call_i32(i32 3) @@ -68,11 +60,7 @@ define void @st_param_i64() { ; CHECK-NEXT: { // callseq 2, 0 ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: st.param.b64 [param0], 4; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_i64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_i64, (param0); ; CHECK-NEXT: } // callseq 2 ; CHECK-NEXT: ret; call void @call_i64(i64 4) @@ -88,11 +76,7 @@ define void @st_param_f32() { ; CHECK-NEXT: { // callseq 3, 0 ; CHECK-NEXT: .param .b32 param0; ; CHECK-NEXT: st.param.b32 [param0], 0f40A00000; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_f32, (param0); ; CHECK-NEXT: } // callseq 3 ; CHECK-NEXT: ret; call void @call_f32(float 5.0) @@ -108,11 +92,7 @@ define void @st_param_f64() { ; CHECK-NEXT: { // callseq 4, 0 ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: st.param.b64 [param0], 0d4018000000000000; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_f64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_f64, (param0); ; CHECK-NEXT: } // callseq 4 ; CHECK-NEXT: ret; call void @call_f64(double 6.0) @@ -134,11 +114,7 @@ define void @st_param_v2_i8_ii() { ; CHECK-NEXT: { // callseq 5, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[2]; ; CHECK-NEXT: st.param.v2.b8 [param0], {1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i8, (param0); ; CHECK-NEXT: } // callseq 5 ; CHECK-NEXT: ret; call void @call_v2_i8(%struct.char2 { i8 1, i8 2 }) @@ -154,11 +130,7 @@ define void @st_param_v2_i8_ir(i8 %val) { ; CHECK-NEXT: { // 
callseq 6, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[2]; ; CHECK-NEXT: st.param.v2.b8 [param0], {1, %rs1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i8, (param0); ; CHECK-NEXT: } // callseq 6 ; CHECK-NEXT: ret; %struct.ir0 = insertvalue %struct.char2 poison, i8 1, 0 @@ -176,11 +148,7 @@ define void @st_param_v2_i8_ri(i8 %val) { ; CHECK-NEXT: { // callseq 7, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[2]; ; CHECK-NEXT: st.param.v2.b8 [param0], {%rs1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i8, (param0); ; CHECK-NEXT: } // callseq 7 ; CHECK-NEXT: ret; %struct.ri0 = insertvalue %struct.char2 poison, i8 %val, 0 @@ -198,11 +166,7 @@ define void @st_param_v2_i16_ii() { ; CHECK-NEXT: { // callseq 8, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v2.b16 [param0], {1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i16, (param0); ; CHECK-NEXT: } // callseq 8 ; CHECK-NEXT: ret; call void @call_v2_i16(%struct.short2 { i16 1, i16 2 }) @@ -218,11 +182,7 @@ define void @st_param_v2_i16_ir(i16 %val) { ; CHECK-NEXT: { // callseq 9, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v2.b16 [param0], {1, %rs1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i16, (param0); ; CHECK-NEXT: } // callseq 9 ; CHECK-NEXT: ret; %struct.ir0 = insertvalue %struct.short2 poison, i16 1, 0 @@ -240,11 +200,7 @@ define void @st_param_v2_i16_ri(i16 %val) { ; CHECK-NEXT: { // callseq 10, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v2.b16 [param0], {%rs1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 
-; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i16, (param0); ; CHECK-NEXT: } // callseq 10 ; CHECK-NEXT: ret; %struct.ri0 = insertvalue %struct.short2 poison, i16 %val, 0 @@ -262,11 +218,7 @@ define void @st_param_v2_i32_ii() { ; CHECK-NEXT: { // callseq 11, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i32, (param0); ; CHECK-NEXT: } // callseq 11 ; CHECK-NEXT: ret; call void @call_v2_i32(%struct.int2 { i32 1, i32 2 }) @@ -282,11 +234,7 @@ define void @st_param_v2_i32_ir(i32 %val) { ; CHECK-NEXT: { // callseq 12, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {1, %r1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i32, (param0); ; CHECK-NEXT: } // callseq 12 ; CHECK-NEXT: ret; %struct.ir0 = insertvalue %struct.int2 poison, i32 1, 0 @@ -304,11 +252,7 @@ define void @st_param_v2_i32_ri(i32 %val) { ; CHECK-NEXT: { // callseq 13, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i32, (param0); ; CHECK-NEXT: } // callseq 13 ; CHECK-NEXT: ret; %struct.ri0 = insertvalue %struct.int2 poison, i32 %val, 0 @@ -326,11 +270,7 @@ define void @st_param_v2_i64_ii() { ; CHECK-NEXT: { // callseq 14, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i64, (param0); ; CHECK-NEXT: } // callseq 14 ; CHECK-NEXT: ret; call void @call_v2_i64(%struct.longlong2 { i64 1, i64 2 }) @@ -346,11 +286,7 @@ 
define void @st_param_v2_i64_ir(i64 %val) { ; CHECK-NEXT: { // callseq 15, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {1, %rd1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i64, (param0); ; CHECK-NEXT: } // callseq 15 ; CHECK-NEXT: ret; %struct.ir0 = insertvalue %struct.longlong2 poison, i64 1, 0 @@ -368,11 +304,7 @@ define void @st_param_v2_i64_ri(i64 %val) { ; CHECK-NEXT: { // callseq 16, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, 2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_i64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_i64, (param0); ; CHECK-NEXT: } // callseq 16 ; CHECK-NEXT: ret; %struct.ri0 = insertvalue %struct.longlong2 poison, i64 %val, 0 @@ -390,11 +322,7 @@ define void @st_param_v2_f32_ii(float %val) { ; CHECK-NEXT: { // callseq 17, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {0f3F800000, 0f40000000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_f32, (param0); ; CHECK-NEXT: } // callseq 17 ; CHECK-NEXT: ret; call void @call_v2_f32(%struct.float2 { float 1.0, float 2.0 }) @@ -410,11 +338,7 @@ define void @st_param_v2_f32_ir(float %val) { ; CHECK-NEXT: { // callseq 18, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {0f3F800000, %r1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_f32, (param0); ; CHECK-NEXT: } // callseq 18 ; CHECK-NEXT: ret; %struct.ir0 = insertvalue %struct.float2 poison, float 1.0, 0 @@ -432,11 +356,7 @@ define void @st_param_v2_f32_ri(float %val) { ; CHECK-NEXT: { // callseq 19, 0 ; CHECK-NEXT: .param .align 8 .b8 
param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 0f40000000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_f32, (param0); ; CHECK-NEXT: } // callseq 19 ; CHECK-NEXT: ret; %struct.ri0 = insertvalue %struct.float2 poison, float %val, 0 @@ -454,11 +374,7 @@ define void @st_param_v2_f64_ii(double %val) { ; CHECK-NEXT: { // callseq 20, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {0d3FF0000000000000, 0d4000000000000000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_f64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_f64, (param0); ; CHECK-NEXT: } // callseq 20 ; CHECK-NEXT: ret; call void @call_v2_f64(%struct.double2 { double 1.0, double 2.0 }) @@ -474,11 +390,7 @@ define void @st_param_v2_f64_ir(double %val) { ; CHECK-NEXT: { // callseq 21, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {0d3FF0000000000000, %rd1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_f64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_f64, (param0); ; CHECK-NEXT: } // callseq 21 ; CHECK-NEXT: ret; %struct.ir0 = insertvalue %struct.double2 poison, double 1.0, 0 @@ -496,11 +408,7 @@ define void @st_param_v2_f64_ri(double %val) { ; CHECK-NEXT: { // callseq 22, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, 0d4000000000000000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v2_f64, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v2_f64, (param0); ; CHECK-NEXT: } // callseq 22 ; CHECK-NEXT: ret; %struct.ri0 = insertvalue %struct.double2 poison, double %val, 0 @@ -525,11 +433,7 @@ define void @st_param_v4_i8_iiii() { ; CHECK-NEXT: { // callseq 23, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, 4}; 
-; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 23 ; CHECK-NEXT: ret; call void @call_v4_i8(%struct.char4 { i8 1, i8 2, i8 3, i8 4 }) @@ -547,11 +451,7 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) { ; CHECK-NEXT: { // callseq 24, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, %rs3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 24 ; CHECK-NEXT: ret; %struct.irrr0 = insertvalue %struct.char4 poison, i8 1, 0 @@ -573,11 +473,7 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) { ; CHECK-NEXT: { // callseq 25, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, %rs3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 25 ; CHECK-NEXT: ret; %struct.rirr0 = insertvalue %struct.char4 poison, i8 %a, 0 @@ -599,11 +495,7 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) { ; CHECK-NEXT: { // callseq 26, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, %rs3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 26 ; CHECK-NEXT: ret; %struct.rrir0 = insertvalue %struct.char4 poison, i8 %a, 0 @@ -625,11 +517,7 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) { ; CHECK-NEXT: { // callseq 27, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, %rs3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; 
CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 27 ; CHECK-NEXT: ret; %struct.rrri0 = insertvalue %struct.char4 poison, i8 %a, 0 @@ -650,11 +538,7 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) { ; CHECK-NEXT: { // callseq 28, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, %rs2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 28 ; CHECK-NEXT: ret; %struct.iirr0 = insertvalue %struct.char4 poison, i8 1, 0 @@ -675,11 +559,7 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) { ; CHECK-NEXT: { // callseq 29, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, %rs2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 29 ; CHECK-NEXT: ret; %struct.irir0 = insertvalue %struct.char4 poison, i8 1, 0 @@ -700,11 +580,7 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) { ; CHECK-NEXT: { // callseq 30, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 30 ; CHECK-NEXT: ret; %struct.irri0 = insertvalue %struct.char4 poison, i8 1, 0 @@ -725,11 +601,7 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) { ; CHECK-NEXT: { // callseq 31, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, %rs2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 31 ; CHECK-NEXT: 
ret; %struct.riir0 = insertvalue %struct.char4 poison, i8 %a, 0 @@ -750,11 +622,7 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) { ; CHECK-NEXT: { // callseq 32, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 32 ; CHECK-NEXT: ret; %struct.riri0 = insertvalue %struct.char4 poison, i8 %a, 0 @@ -775,11 +643,7 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) { ; CHECK-NEXT: { // callseq 33, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 33 ; CHECK-NEXT: ret; %struct.rrii0 = insertvalue %struct.char4 poison, i8 %a, 0 @@ -799,11 +663,7 @@ define void @st_param_v4_i8_iiir(i8 %d) { ; CHECK-NEXT: { // callseq 34, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, %rs1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 34 ; CHECK-NEXT: ret; %struct.iiir0 = insertvalue %struct.char4 poison, i8 1, 0 @@ -823,11 +683,7 @@ define void @st_param_v4_i8_iiri(i8 %c) { ; CHECK-NEXT: { // callseq 35, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 35 ; CHECK-NEXT: ret; %struct.iiri0 = insertvalue %struct.char4 poison, i8 1, 0 @@ -847,11 +703,7 @@ define void @st_param_v4_i8_irii(i8 %b) { ; CHECK-NEXT: { // 
callseq 36, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 36 ; CHECK-NEXT: ret; %struct.irii0 = insertvalue %struct.char4 poison, i8 1, 0 @@ -871,11 +723,7 @@ define void @st_param_v4_i8_riii(i8 %a) { ; CHECK-NEXT: { // callseq 37, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i8, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 37 ; CHECK-NEXT: ret; %struct.riii0 = insertvalue %struct.char4 poison, i8 %a, 0 @@ -895,11 +743,7 @@ define void @st_param_v4_i16_iiii() { ; CHECK-NEXT: { // callseq 38, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 38 ; CHECK-NEXT: ret; call void @call_v4_i16(%struct.short4 { i16 1, i16 2, i16 3, i16 4 }) @@ -917,11 +761,7 @@ define void @st_param_v4_i16_irrr(i16 %b, i16 %c, i16 %d) { ; CHECK-NEXT: { // callseq 39, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, %rs2, %rs3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 39 ; CHECK-NEXT: ret; %struct.irrr0 = insertvalue %struct.short4 poison, i16 1, 0 @@ -943,11 +783,7 @@ define void @st_param_v4_i16_rirr(i16 %a, i16 %c, i16 %d) { ; CHECK-NEXT: { // callseq 40, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 
%rs2, %rs3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 40 ; CHECK-NEXT: ret; %struct.rirr0 = insertvalue %struct.short4 poison, i16 %a, 0 @@ -969,11 +805,7 @@ define void @st_param_v4_i16_rrir(i16 %a, i16 %b, i16 %d) { ; CHECK-NEXT: { // callseq 41, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, %rs3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 41 ; CHECK-NEXT: ret; %struct.rrir0 = insertvalue %struct.short4 poison, i16 %a, 0 @@ -995,11 +827,7 @@ define void @st_param_v4_i16_rrri(i16 %a, i16 %b, i16 %c) { ; CHECK-NEXT: { // callseq 42, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, %rs3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 42 ; CHECK-NEXT: ret; %struct.rrri0 = insertvalue %struct.short4 poison, i16 %a, 0 @@ -1020,11 +848,7 @@ define void @st_param_v4_i16_iirr(i16 %c, i16 %d) { ; CHECK-NEXT: { // callseq 43, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, %rs2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 43 ; CHECK-NEXT: ret; %struct.iirr0 = insertvalue %struct.short4 poison, i16 1, 0 @@ -1045,11 +869,7 @@ define void @st_param_v4_i16_irir(i16 %b, i16 %d) { ; CHECK-NEXT: { // callseq 44, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, %rs2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; 
CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 44 ; CHECK-NEXT: ret; %struct.irir0 = insertvalue %struct.short4 poison, i16 1, 0 @@ -1070,11 +890,7 @@ define void @st_param_v4_i16_irri(i16 %b, i16 %c) { ; CHECK-NEXT: { // callseq 45, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, %rs2, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 45 ; CHECK-NEXT: ret; %struct.irri0 = insertvalue %struct.short4 poison, i16 1, 0 @@ -1095,11 +911,7 @@ define void @st_param_v4_i16_riir(i16 %a, i16 %d) { ; CHECK-NEXT: { // callseq 46, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, %rs2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 46 ; CHECK-NEXT: ret; %struct.riir0 = insertvalue %struct.short4 poison, i16 %a, 0 @@ -1120,11 +932,7 @@ define void @st_param_v4_i16_riri(i16 %a, i16 %c) { ; CHECK-NEXT: { // callseq 47, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, %rs2, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 47 ; CHECK-NEXT: ret; %struct.riri0 = insertvalue %struct.short4 poison, i16 %a, 0 @@ -1145,11 +953,7 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) { ; CHECK-NEXT: { // callseq 48, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni 
call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 48 ; CHECK-NEXT: ret; %struct.rrii0 = insertvalue %struct.short4 poison, i16 %a, 0 @@ -1169,11 +973,7 @@ define void @st_param_v4_i16_iiir(i16 %d) { ; CHECK-NEXT: { // callseq 49, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, %rs1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 49 ; CHECK-NEXT: ret; %struct.iiir0 = insertvalue %struct.short4 poison, i16 1, 0 @@ -1193,11 +993,7 @@ define void @st_param_v4_i16_iiri(i16 %c) { ; CHECK-NEXT: { // callseq 50, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 50 ; CHECK-NEXT: ret; %struct.iiri0 = insertvalue %struct.short4 poison, i16 1, 0 @@ -1217,11 +1013,7 @@ define void @st_param_v4_i16_irii(i16 %b) { ; CHECK-NEXT: { // callseq 51, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 51 ; CHECK-NEXT: ret; %struct.irii0 = insertvalue %struct.short4 poison, i16 1, 0 @@ -1241,11 +1033,7 @@ define void @st_param_v4_i16_riii(i16 %a) { ; CHECK-NEXT: { // callseq 52, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i16, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 52 ; CHECK-NEXT: ret; %struct.riii0 = insertvalue %struct.short4 poison, i16 %a, 0 
@@ -1265,11 +1053,7 @@ define void @st_param_v4_i32_iiii() { ; CHECK-NEXT: { // callseq 53, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 53 ; CHECK-NEXT: ret; call void @call_v4_i32(%struct.int4 { i32 1, i32 2, i32 3, i32 4 }) @@ -1287,11 +1071,7 @@ define void @st_param_v4_i32_irrr(i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: { // callseq 54, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, %r2, %r3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 54 ; CHECK-NEXT: ret; %struct.irrr0 = insertvalue %struct.int4 poison, i32 1, 0 @@ -1313,11 +1093,7 @@ define void @st_param_v4_i32_rirr(i32 %a, i32 %c, i32 %d) { ; CHECK-NEXT: { // callseq 55, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, %r2, %r3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 55 ; CHECK-NEXT: ret; %struct.rirr0 = insertvalue %struct.int4 poison, i32 %a, 0 @@ -1339,11 +1115,7 @@ define void @st_param_v4_i32_rrir(i32 %a, i32 %b, i32 %d) { ; CHECK-NEXT: { // callseq 56, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 3, %r3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 56 ; CHECK-NEXT: ret; %struct.rrir0 = insertvalue %struct.int4 poison, i32 %a, 0 @@ -1365,11 +1137,7 @@ define void @st_param_v4_i32_rrri(i32 %a, i32 
%b, i32 %c) { ; CHECK-NEXT: { // callseq 57, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, %r3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 57 ; CHECK-NEXT: ret; %struct.rrri0 = insertvalue %struct.int4 poison, i32 %a, 0 @@ -1390,11 +1158,7 @@ define void @st_param_v4_i32_iirr(i32 %c, i32 %d) { ; CHECK-NEXT: { // callseq 58, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, %r1, %r2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 58 ; CHECK-NEXT: ret; %struct.iirr0 = insertvalue %struct.int4 poison, i32 1, 0 @@ -1415,11 +1179,7 @@ define void @st_param_v4_i32_irir(i32 %b, i32 %d) { ; CHECK-NEXT: { // callseq 59, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, 3, %r2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 59 ; CHECK-NEXT: ret; %struct.irir0 = insertvalue %struct.int4 poison, i32 1, 0 @@ -1440,11 +1200,7 @@ define void @st_param_v4_i32_irri(i32 %b, i32 %c) { ; CHECK-NEXT: { // callseq 60, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, %r2, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 60 ; CHECK-NEXT: ret; %struct.irri0 = insertvalue %struct.int4 poison, i32 1, 0 @@ -1465,11 +1221,7 @@ define void @st_param_v4_i32_riir(i32 %a, i32 %d) { ; CHECK-NEXT: { // callseq 61, 0 ; CHECK-NEXT: .param .align 16 .b8 
param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, 3, %r2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 61 ; CHECK-NEXT: ret; %struct.riir0 = insertvalue %struct.int4 poison, i32 %a, 0 @@ -1490,11 +1242,7 @@ define void @st_param_v4_i32_riri(i32 %a, i32 %c) { ; CHECK-NEXT: { // callseq 62, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, %r2, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 62 ; CHECK-NEXT: ret; %struct.riri0 = insertvalue %struct.int4 poison, i32 %a, 0 @@ -1515,11 +1263,7 @@ define void @st_param_v4_i32_rrii(i32 %a, i32 %b) { ; CHECK-NEXT: { // callseq 63, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 63 ; CHECK-NEXT: ret; %struct.rrii0 = insertvalue %struct.int4 poison, i32 %a, 0 @@ -1539,11 +1283,7 @@ define void @st_param_v4_i32_iiir(i32 %d) { ; CHECK-NEXT: { // callseq 64, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, 3, %r1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 64 ; CHECK-NEXT: ret; %struct.iiir0 = insertvalue %struct.int4 poison, i32 1, 0 @@ -1563,11 +1303,7 @@ define void @st_param_v4_i32_iiri(i32 %c) { ; CHECK-NEXT: { // callseq 65, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, %r1, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: 
call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 65 ; CHECK-NEXT: ret; %struct.iiri0 = insertvalue %struct.int4 poison, i32 1, 0 @@ -1587,11 +1323,7 @@ define void @st_param_v4_i32_irii(i32 %b) { ; CHECK-NEXT: { // callseq 66, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 66 ; CHECK-NEXT: ret; %struct.irii0 = insertvalue %struct.int4 poison, i32 1, 0 @@ -1611,11 +1343,7 @@ define void @st_param_v4_i32_riii(i32 %a) { ; CHECK-NEXT: { // callseq 67, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, 3, 4}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_i32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_i32, (param0); ; CHECK-NEXT: } // callseq 67 ; CHECK-NEXT: ret; %struct.riii0 = insertvalue %struct.int4 poison, i32 %a, 0 @@ -1635,11 +1363,7 @@ define void @st_param_v4_f32_iiii() { ; CHECK-NEXT: { // callseq 68, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 68 ; CHECK-NEXT: ret; call void @call_v4_f32(%struct.float4 { float 1.0, float 2.0, float 3.0, float 4.0 }) @@ -1657,11 +1381,7 @@ define void @st_param_v4_f32_irrr(float %b, float %c, float %d) { ; CHECK-NEXT: { // callseq 69, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %r1, %r2, %r3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; 
CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 69 ; CHECK-NEXT: ret; %struct.irrr0 = insertvalue %struct.float4 poison, float 1.0, 0 @@ -1683,11 +1403,7 @@ define void @st_param_v4_f32_rirr(float %a, float %c, float %d) { ; CHECK-NEXT: { // callseq 70, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 0f40000000, %r2, %r3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 70 ; CHECK-NEXT: ret; %struct.rirr0 = insertvalue %struct.float4 poison, float %a, 0 @@ -1709,11 +1425,7 @@ define void @st_param_v4_f32_rrir(float %a, float %b, float %d) { ; CHECK-NEXT: { // callseq 71, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 0f40400000, %r3}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 71 ; CHECK-NEXT: ret; %struct.rrir0 = insertvalue %struct.float4 poison, float %a, 0 @@ -1735,11 +1447,7 @@ define void @st_param_v4_f32_rrri(float %a, float %b, float %c) { ; CHECK-NEXT: { // callseq 72, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, %r3, 0f40800000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 72 ; CHECK-NEXT: ret; %struct.rrri0 = insertvalue %struct.float4 poison, float %a, 0 @@ -1760,11 +1468,7 @@ define void @st_param_v4_f32_iirr(float %c, float %d) { ; CHECK-NEXT: { // callseq 73, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %r1, %r2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: 
call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 73 ; CHECK-NEXT: ret; %struct.iirr0 = insertvalue %struct.float4 poison, float 1.0, 0 @@ -1785,11 +1489,7 @@ define void @st_param_v4_f32_irir(float %b, float %d) { ; CHECK-NEXT: { // callseq 74, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %r1, 0f40400000, %r2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 74 ; CHECK-NEXT: ret; %struct.irir0 = insertvalue %struct.float4 poison, float 1.0, 0 @@ -1810,11 +1510,7 @@ define void @st_param_v4_f32_irri(float %b, float %c) { ; CHECK-NEXT: { // callseq 75, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %r1, %r2, 0f40800000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 75 ; CHECK-NEXT: ret; %struct.irri0 = insertvalue %struct.float4 poison, float 1.0, 0 @@ -1835,11 +1531,7 @@ define void @st_param_v4_f32_riir(float %a, float %d) { ; CHECK-NEXT: { // callseq 76, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 0f40000000, 0f40400000, %r2}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 76 ; CHECK-NEXT: ret; %struct.riir0 = insertvalue %struct.float4 poison, float %a, 0 @@ -1860,11 +1552,7 @@ define void @st_param_v4_f32_riri(float %a, float %c) { ; CHECK-NEXT: { // callseq 77, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 0f40000000, %r2, 0f40800000}; -; CHECK-NEXT: call.uni 
-; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 77 ; CHECK-NEXT: ret; %struct.riri0 = insertvalue %struct.float4 poison, float %a, 0 @@ -1885,11 +1573,7 @@ define void @st_param_v4_f32_rrii(float %a, float %b) { ; CHECK-NEXT: { // callseq 78, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 0f40400000, 0f40800000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 78 ; CHECK-NEXT: ret; %struct.rrii0 = insertvalue %struct.float4 poison, float %a, 0 @@ -1909,11 +1593,7 @@ define void @st_param_v4_f32_iiir(float %d) { ; CHECK-NEXT: { // callseq 79, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, %r1}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 79 ; CHECK-NEXT: ret; %struct.iiir0 = insertvalue %struct.float4 poison, float 1.0, 0 @@ -1933,11 +1613,7 @@ define void @st_param_v4_f32_iiri(float %c) { ; CHECK-NEXT: { // callseq 80, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %r1, 0f40800000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 80 ; CHECK-NEXT: ret; %struct.iiri0 = insertvalue %struct.float4 poison, float 1.0, 0 @@ -1957,11 +1633,7 @@ define void @st_param_v4_f32_irii(float %b) { ; CHECK-NEXT: { // callseq 81, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %r1, 0f40400000, 0f40800000}; -; CHECK-NEXT: 
call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 81 ; CHECK-NEXT: ret; %struct.irii0 = insertvalue %struct.float4 poison, float 1.0, 0 @@ -1981,11 +1653,7 @@ define void @st_param_v4_f32_riii(float %a) { ; CHECK-NEXT: { // callseq 82, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 0f40000000, 0f40400000, 0f40800000}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_v4_f32, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_v4_f32, (param0); ; CHECK-NEXT: } // callseq 82 ; CHECK-NEXT: ret; %struct.riii0 = insertvalue %struct.float4 poison, float %a, 0 @@ -2011,11 +1679,7 @@ define void @st_param_bfloat() { ; CHECK-NEXT: { // callseq 83, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[2]; ; CHECK-NEXT: st.param.b16 [param0], %rs1; -; CHECK-NEXT: call.uni -; CHECK-NEXT: call_bfloat, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni call_bfloat, (param0); ; CHECK-NEXT: } // callseq 83 ; CHECK-NEXT: ret; %five = bitcast i16 16640 to bfloat diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll index 52415b05e03d0..5b31b5e24bc68 100644 --- a/llvm/test/CodeGen/NVPTX/store-undef.ll +++ b/llvm/test/CodeGen/NVPTX/store-undef.ll @@ -16,11 +16,7 @@ define void @test_store_param_undef() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[32]; -; CHECK-NEXT: call.uni -; CHECK-NEXT: test_call, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni test_call, (param0); ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: ret; call void @test_call(%struct.T undef) @@ -41,11 +37,7 @@ define void @test_store_param_def(i64 %param0, i32 %param1) { ; CHECK-NEXT: st.param.b64 [param0], %rd1; ; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r2, %r1}; ; CHECK-NEXT: 
st.param.v4.b32 [param0+16], {%r3, %r1, %r4, %r5}; -; CHECK-NEXT: call.uni -; CHECK-NEXT: test_call, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni test_call, (param0); ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: ret; %V2 = insertelement <2 x i32> undef, i32 %param1, i32 1 diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll index a97a8b5822f99..d6961a9541776 100644 --- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -71,11 +71,7 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) { ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: st.param.b64 [param0], %rd3; ; CHECK-NEXT: .param .b32 retval0; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: texfunc, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), texfunc, (param0); ; CHECK-NEXT: ld.param.b32 %r6, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: add.rn.f32 %r8, %r2, %r6; diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll index efbac868dba38..178ee7ff6db18 100644 --- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll @@ -33,11 +33,7 @@ ; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; ; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; ; CHECK: .param .align 8 .b8 retval0[16]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_s_i8i16p, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_s_i8i16p, (param0); ; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+3]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+4]; @@ -80,11 +76,7 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { ; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; ; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; ; CHECK: .param .align 8 .b8 
retval0[24]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_s_i8i32p, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_s_i8i32p, (param0); ; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; @@ -147,11 +139,7 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { ; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; ; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; ; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_s_i8i64p, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_s_i8i64p, (param0); ; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; @@ -192,11 +180,7 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { ; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; ; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; ; CHECK: .param .align 8 .b8 retval0[16]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_s_i8f16p, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_s_i8f16p, (param0); ; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2I_0:%rs[0-9]+]], [retval0+3]; ; CHECK-DAG: ld.param.b8 [[R2I_1:%rs[0-9]+]], [retval0+4]; @@ -239,11 +223,7 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { ; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; ; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; ; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_s_i8f16x2p, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_s_i8f16x2p, (param0); ; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], 
[retval0+5]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; @@ -286,11 +266,7 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; ; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; ; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_s_i8f32p, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_s_i8f32p, (param0); ; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; @@ -353,11 +329,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; ; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; ; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: test_s_i8f64p, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 -; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), test_s_i8f64p, (param0); ; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll index 80cf938d48b53..618c7ed0c4997 100644 --- a/llvm/test/CodeGen/NVPTX/unreachable.ll +++ b/llvm/test/CodeGen/NVPTX/unreachable.ll @@ -28,10 +28,7 @@ define ptx_kernel void @kernel_func() { ; NO-TRAP-UNREACHABLE-EMPTY: ; NO-TRAP-UNREACHABLE-NEXT: // %bb.0: ; NO-TRAP-UNREACHABLE-NEXT: { // callseq 0, 0 -; NO-TRAP-UNREACHABLE-NEXT: call.uni -; NO-TRAP-UNREACHABLE-NEXT: throw, -; NO-TRAP-UNREACHABLE-NEXT: ( -; NO-TRAP-UNREACHABLE-NEXT: ); +; NO-TRAP-UNREACHABLE-NEXT: call.uni throw, (); ; NO-TRAP-UNREACHABLE-NEXT: } // callseq 0 ; NO-TRAP-UNREACHABLE-NEXT: // begin inline asm ; NO-TRAP-UNREACHABLE-NEXT: exit; @@ -43,10 +40,7 @@ define ptx_kernel void @kernel_func() { ; 
NO-TRAP-AFTER-NORETURN-EMPTY: ; NO-TRAP-AFTER-NORETURN-NEXT: // %bb.0: ; NO-TRAP-AFTER-NORETURN-NEXT: { // callseq 0, 0 -; NO-TRAP-AFTER-NORETURN-NEXT: call.uni -; NO-TRAP-AFTER-NORETURN-NEXT: throw, -; NO-TRAP-AFTER-NORETURN-NEXT: ( -; NO-TRAP-AFTER-NORETURN-NEXT: ); +; NO-TRAP-AFTER-NORETURN-NEXT: call.uni throw, (); ; NO-TRAP-AFTER-NORETURN-NEXT: } // callseq 0 ; NO-TRAP-AFTER-NORETURN-NEXT: // begin inline asm ; NO-TRAP-AFTER-NORETURN-NEXT: exit; @@ -59,10 +53,7 @@ define ptx_kernel void @kernel_func() { ; TRAP-EMPTY: ; TRAP-NEXT: // %bb.0: ; TRAP-NEXT: { // callseq 0, 0 -; TRAP-NEXT: call.uni -; TRAP-NEXT: throw, -; TRAP-NEXT: ( -; TRAP-NEXT: ); +; TRAP-NEXT: call.uni throw, (); ; TRAP-NEXT: } // callseq 0 ; TRAP-NEXT: trap; exit; ; @@ -72,10 +63,7 @@ define ptx_kernel void @kernel_func() { ; BUG-FIXED-EMPTY: ; BUG-FIXED-NEXT: // %bb.0: ; BUG-FIXED-NEXT: { // callseq 0, 0 -; BUG-FIXED-NEXT: call.uni -; BUG-FIXED-NEXT: throw, -; BUG-FIXED-NEXT: ( -; BUG-FIXED-NEXT: ); +; BUG-FIXED-NEXT: call.uni throw, (); ; BUG-FIXED-NEXT: } // callseq 0 ; BUG-FIXED-NEXT: trap; call void @throw() diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index ddaa9fd831af7..ca1b722527a89 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -126,12 +126,7 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: .param .b64 param1; ; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4; ; CHECK-PTX-NEXT: .param .b32 retval0; -; CHECK-PTX-NEXT: call.uni (retval0), -; CHECK-PTX-NEXT: variadics1, -; CHECK-PTX-NEXT: ( -; CHECK-PTX-NEXT: param0, -; CHECK-PTX-NEXT: param1 -; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: call.uni (retval0), variadics1, (param0, param1); ; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0]; ; CHECK-PTX-NEXT: } // callseq 0 ; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2; @@ -238,12 +233,7 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: .param .b64 param1; ; CHECK-PTX-NEXT: 
st.param.b64 [param1], %rd4; ; CHECK-PTX-NEXT: .param .b32 retval0; -; CHECK-PTX-NEXT: call.uni (retval0), -; CHECK-PTX-NEXT: variadics2, -; CHECK-PTX-NEXT: ( -; CHECK-PTX-NEXT: param0, -; CHECK-PTX-NEXT: param1 -; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1); ; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0]; ; CHECK-PTX-NEXT: } // callseq 1 ; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2; @@ -315,12 +305,7 @@ define dso_local i32 @baz() { ; CHECK-PTX-NEXT: .param .b64 param1; ; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1; ; CHECK-PTX-NEXT: .param .b32 retval0; -; CHECK-PTX-NEXT: call.uni (retval0), -; CHECK-PTX-NEXT: variadics3, -; CHECK-PTX-NEXT: ( -; CHECK-PTX-NEXT: param0, -; CHECK-PTX-NEXT: param1 -; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: call.uni (retval0), variadics3, (param0, param1); ; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0]; ; CHECK-PTX-NEXT: } // callseq 2 ; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2; @@ -397,12 +382,7 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: .param .b64 param1; ; CHECK-PTX-NEXT: st.param.b64 [param1], %rd8; ; CHECK-PTX-NEXT: .param .b32 retval0; -; CHECK-PTX-NEXT: call.uni (retval0), -; CHECK-PTX-NEXT: variadics4, -; CHECK-PTX-NEXT: ( -; CHECK-PTX-NEXT: param0, -; CHECK-PTX-NEXT: param1 -; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1); ; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-PTX-NEXT: } // callseq 3 ; CHECK-PTX-NEXT: ret; diff --git a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll index dac17dc3225ee..b7852c3c3e6e0 100644 --- a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll +++ b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll @@ -18,7 +18,6 @@ define signext i32 @main() nounwind { ; CHECK-NEXT: sth 3, 46(1) ; CHECK-NEXT: addi 3, 1, 46 ; CHECK-NEXT: lharx 4, 0, 3 -; CHECK-NEXT: clrlwi 4, 4, 16 ; CHECK-NEXT: cmplwi 4, 33059 ; CHECK-NEXT: bne 0, .LBB0_4 ; CHECK-NEXT: 
# %bb.1: # %cmpxchg.fencedstore @@ -32,7 +31,6 @@ define signext i32 @main() nounwind { ; CHECK-NEXT: # %bb.3: # %cmpxchg.releasedload ; CHECK-NEXT: # ; CHECK-NEXT: lharx 5, 0, 3 -; CHECK-NEXT: clrlwi 5, 5, 16 ; CHECK-NEXT: cmplwi 5, 33059 ; CHECK-NEXT: beq 0, .LBB0_2 ; CHECK-NEXT: .LBB0_4: # %cmpxchg.nostore diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll index 5e14fbbb6ad61..07afea75aec67 100644 --- a/llvm/test/CodeGen/PowerPC/all-atomics.ll +++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll @@ -4346,8 +4346,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: lbz 7, uc@toc@l(3) ; CHECK-NEXT: lbz 8, sc@toc@l(4) ; CHECK-NEXT: lbarx 5, 0, 6 -; CHECK-NEXT: clrlwi 9, 5, 24 -; CHECK-NEXT: cmplw 9, 7 +; CHECK-NEXT: cmplw 5, 7 ; CHECK-NEXT: bne 0, .LBB3_4 ; CHECK-NEXT: # %bb.1: # %cmpxchg.fencedstore276 ; CHECK-NEXT: sync @@ -4359,8 +4358,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: # %bb.3: # %cmpxchg.releasedload274 ; CHECK-NEXT: # ; CHECK-NEXT: lbarx 5, 0, 6 -; CHECK-NEXT: clrlwi 9, 5, 24 -; CHECK-NEXT: cmplw 9, 7 +; CHECK-NEXT: cmplw 5, 7 ; CHECK-NEXT: beq 0, .LBB3_2 ; CHECK-NEXT: .LBB3_4: # %cmpxchg.nostore272 ; CHECK-NEXT: addi 7, 3, uc@toc@l @@ -4368,8 +4366,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: stb 5, sc@toc@l(4) ; CHECK-NEXT: lbz 9, uc@toc@l(3) ; CHECK-NEXT: lbarx 8, 0, 7 -; CHECK-NEXT: clrlwi 10, 8, 24 -; CHECK-NEXT: cmplw 10, 9 +; CHECK-NEXT: cmplw 8, 9 ; CHECK-NEXT: bne 0, .LBB3_8 ; CHECK-NEXT: # %bb.5: # %cmpxchg.fencedstore257 ; CHECK-NEXT: sync @@ -4382,8 +4379,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: # %bb.7: # %cmpxchg.releasedload255 ; CHECK-NEXT: # ; CHECK-NEXT: lbarx 8, 0, 7 -; CHECK-NEXT: clrlwi 10, 8, 24 -; CHECK-NEXT: cmplw 10, 9 +; CHECK-NEXT: cmplw 8, 9 ; CHECK-NEXT: beq 0, .LBB3_6 ; CHECK-NEXT: .LBB3_8: # 
%cmpxchg.nostore253 ; CHECK-NEXT: addis 5, 2, ss@toc@ha @@ -4393,8 +4389,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: lbz 11, sc@toc@l(4) ; CHECK-NEXT: addi 8, 5, ss@toc@l ; CHECK-NEXT: lharx 9, 0, 8 -; CHECK-NEXT: clrlwi 12, 9, 16 -; CHECK-NEXT: cmplw 12, 10 +; CHECK-NEXT: cmplw 9, 10 ; CHECK-NEXT: bne 0, .LBB3_12 ; CHECK-NEXT: # %bb.9: # %cmpxchg.fencedstore238 ; CHECK-NEXT: extsb 11, 11 @@ -4408,8 +4403,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: # %bb.11: # %cmpxchg.releasedload236 ; CHECK-NEXT: # ; CHECK-NEXT: lharx 9, 0, 8 -; CHECK-NEXT: clrlwi 12, 9, 16 -; CHECK-NEXT: cmplw 12, 10 +; CHECK-NEXT: cmplw 9, 10 ; CHECK-NEXT: beq 0, .LBB3_10 ; CHECK-NEXT: .LBB3_12: # %cmpxchg.nostore234 ; CHECK-NEXT: lwsync @@ -4419,8 +4413,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: lbz 12, sc@toc@l(4) ; CHECK-NEXT: addi 9, 5, us@toc@l ; CHECK-NEXT: lharx 10, 0, 9 -; CHECK-NEXT: clrlwi 0, 10, 16 -; CHECK-NEXT: cmplw 0, 11 +; CHECK-NEXT: cmplw 10, 11 ; CHECK-NEXT: bne 0, .LBB3_16 ; CHECK-NEXT: # %bb.13: # %cmpxchg.fencedstore219 ; CHECK-NEXT: extsb 12, 12 @@ -4434,8 +4427,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: # %bb.15: # %cmpxchg.releasedload217 ; CHECK-NEXT: # ; CHECK-NEXT: lharx 10, 0, 9 -; CHECK-NEXT: clrlwi 0, 10, 16 -; CHECK-NEXT: cmplw 0, 11 +; CHECK-NEXT: cmplw 10, 11 ; CHECK-NEXT: beq 0, .LBB3_14 ; CHECK-NEXT: .LBB3_16: # %cmpxchg.nostore215 ; CHECK-NEXT: lwsync @@ -4535,7 +4527,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: lbz 30, uc@toc@l(3) ; CHECK-NEXT: lbz 29, sc@toc@l(4) ; CHECK-NEXT: lbarx 28, 0, 6 -; CHECK-NEXT: clrlwi 28, 28, 24 ; CHECK-NEXT: cmplw 28, 30 ; CHECK-NEXT: bne 0, .LBB3_36 ; CHECK-NEXT: # %bb.33: # %cmpxchg.fencedstore124 @@ -4548,7 +4539,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; 
CHECK-NEXT: # %bb.35: # %cmpxchg.releasedload122 ; CHECK-NEXT: # ; CHECK-NEXT: lbarx 28, 0, 6 -; CHECK-NEXT: clrlwi 28, 28, 24 ; CHECK-NEXT: cmplw 28, 30 ; CHECK-NEXT: beq 0, .LBB3_34 ; CHECK-NEXT: .LBB3_36: # %cmpxchg.nostore120 @@ -4566,7 +4556,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: stw 6, ui@toc@l(5) ; CHECK-NEXT: lbz 6, uc@toc@l(3) ; CHECK-NEXT: lbarx 29, 0, 7 -; CHECK-NEXT: clrlwi 29, 29, 24 ; CHECK-NEXT: cmplw 29, 6 ; CHECK-NEXT: bne 0, .LBB3_42 ; CHECK-NEXT: # %bb.39: # %cmpxchg.fencedstore105 @@ -4579,7 +4568,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: # %bb.41: # %cmpxchg.releasedload103 ; CHECK-NEXT: # ; CHECK-NEXT: lbarx 29, 0, 7 -; CHECK-NEXT: clrlwi 29, 29, 24 ; CHECK-NEXT: cmplw 29, 6 ; CHECK-NEXT: beq 0, .LBB3_40 ; CHECK-NEXT: .LBB3_42: # %cmpxchg.nostore101 @@ -4597,7 +4585,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: stw 6, ui@toc@l(5) ; CHECK-NEXT: lbz 6, uc@toc@l(3) ; CHECK-NEXT: lharx 30, 0, 8 -; CHECK-NEXT: clrlwi 30, 30, 16 ; CHECK-NEXT: cmplw 30, 6 ; CHECK-NEXT: bne 0, .LBB3_48 ; CHECK-NEXT: # %bb.45: # %cmpxchg.fencedstore86 @@ -4612,7 +4599,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: # %bb.47: # %cmpxchg.releasedload84 ; CHECK-NEXT: # ; CHECK-NEXT: lharx 30, 0, 8 -; CHECK-NEXT: clrlwi 30, 30, 16 ; CHECK-NEXT: cmplw 30, 6 ; CHECK-NEXT: beq 0, .LBB3_46 ; CHECK-NEXT: .LBB3_48: # %cmpxchg.nostore82 @@ -4630,7 +4616,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: stw 6, ui@toc@l(5) ; CHECK-NEXT: lbz 6, uc@toc@l(3) ; CHECK-NEXT: lharx 8, 0, 9 -; CHECK-NEXT: clrlwi 8, 8, 16 ; CHECK-NEXT: cmplw 8, 6 ; CHECK-NEXT: bne 0, .LBB3_54 ; CHECK-NEXT: # %bb.51: # %cmpxchg.fencedstore67 @@ -4645,7 +4630,6 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; CHECK-NEXT: # %bb.53: # %cmpxchg.releasedload65 ; 
CHECK-NEXT: # ; CHECK-NEXT: lharx 8, 0, 9 -; CHECK-NEXT: clrlwi 8, 8, 16 ; CHECK-NEXT: cmplw 8, 6 ; CHECK-NEXT: beq 0, .LBB3_52 ; CHECK-NEXT: .LBB3_54: # %cmpxchg.nostore63 diff --git a/llvm/test/CodeGen/PowerPC/atomics-regression.ll b/llvm/test/CodeGen/PowerPC/atomics-regression.ll index 280c4299c30b7..0474a479a1fef 100644 --- a/llvm/test/CodeGen/PowerPC/atomics-regression.ll +++ b/llvm/test/CodeGen/PowerPC/atomics-regression.ll @@ -406,7 +406,6 @@ define void @test40(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: .LBB40_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -428,7 +427,6 @@ define void @test41(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: .LBB41_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -451,7 +449,6 @@ define void @test42(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: .LBB42_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB42_3 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -470,7 +467,6 @@ define void @test43(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -484,7 +480,6 @@ define void @test43(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB43_2 ; PPC64LE-NEXT: blr @@ -497,7 +492,6 @@ define void @test44(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # 
%cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB44_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -511,7 +505,6 @@ define void @test44(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB44_2 ; PPC64LE-NEXT: .LBB44_4: # %cmpxchg.nostore @@ -526,7 +519,6 @@ define void @test45(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -540,7 +532,6 @@ define void @test45(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB45_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -557,7 +548,6 @@ define void @test46(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB46_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -571,7 +561,6 @@ define void @test46(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB46_2 ; PPC64LE-NEXT: .LBB46_4: # %cmpxchg.nostore @@ -586,7 +575,6 @@ define void @test47(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; 
PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -600,7 +588,6 @@ define void @test47(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB47_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -617,7 +604,6 @@ define void @test48(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB48_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -631,7 +617,6 @@ define void @test48(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB48_2 ; PPC64LE-NEXT: .LBB48_4: # %cmpxchg.nostore @@ -646,7 +631,6 @@ define void @test49(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB49_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -660,7 +644,6 @@ define void @test49(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB49_2 ; PPC64LE-NEXT: .LBB49_4: # %cmpxchg.nostore @@ -679,7 +662,6 @@ define void @test50(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: .LBB50_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -701,7 +683,6 @@ define void @test51(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: .LBB51_1: # %cmpxchg.start ; PPC64LE-NEXT: # 
; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -724,7 +705,6 @@ define void @test52(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: .LBB52_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB52_3 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -743,7 +723,6 @@ define void @test53(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -757,7 +736,6 @@ define void @test53(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB53_2 ; PPC64LE-NEXT: blr @@ -770,7 +748,6 @@ define void @test54(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB54_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -784,7 +761,6 @@ define void @test54(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB54_2 ; PPC64LE-NEXT: .LBB54_4: # %cmpxchg.nostore @@ -799,7 +775,6 @@ define void @test55(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -813,7 +788,6 @@ define 
void @test55(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB55_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -830,7 +804,6 @@ define void @test56(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB56_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -844,7 +817,6 @@ define void @test56(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB56_2 ; PPC64LE-NEXT: .LBB56_4: # %cmpxchg.nostore @@ -859,7 +831,6 @@ define void @test57(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -873,7 +844,6 @@ define void @test57(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB57_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -890,7 +860,6 @@ define void @test58(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB58_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -904,7 +873,6 @@ define void @test58(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 
6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB58_2 ; PPC64LE-NEXT: .LBB58_4: # %cmpxchg.nostore @@ -919,7 +887,6 @@ define void @test59(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB59_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -933,7 +900,6 @@ define void @test59(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB59_2 ; PPC64LE-NEXT: .LBB59_4: # %cmpxchg.nostore @@ -1424,7 +1390,6 @@ define void @test80(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: .LBB80_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -1446,7 +1411,6 @@ define void @test81(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: .LBB81_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -1469,7 +1433,6 @@ define void @test82(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: .LBB82_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB82_3 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -1488,7 +1451,6 @@ define void @test83(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1502,7 +1464,6 @@ define void @test83(ptr %ptr, i8 %cmp, i8 %val) { ; 
PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB83_2 ; PPC64LE-NEXT: blr @@ -1515,7 +1476,6 @@ define void @test84(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB84_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1529,7 +1489,6 @@ define void @test84(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB84_2 ; PPC64LE-NEXT: .LBB84_4: # %cmpxchg.nostore @@ -1544,7 +1503,6 @@ define void @test85(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1558,7 +1516,6 @@ define void @test85(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB85_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -1575,7 +1532,6 @@ define void @test86(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB86_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1589,7 +1545,6 @@ define void @test86(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB86_2 
; PPC64LE-NEXT: .LBB86_4: # %cmpxchg.nostore @@ -1604,7 +1559,6 @@ define void @test87(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1618,7 +1572,6 @@ define void @test87(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB87_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -1635,7 +1588,6 @@ define void @test88(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB88_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1649,7 +1601,6 @@ define void @test88(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB88_2 ; PPC64LE-NEXT: .LBB88_4: # %cmpxchg.nostore @@ -1664,7 +1615,6 @@ define void @test89(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lbarx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 24 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB89_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1678,7 +1628,6 @@ define void @test89(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB89_2 ; PPC64LE-NEXT: .LBB89_4: # %cmpxchg.nostore @@ -1697,7 +1646,6 @@ define void @test90(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: .LBB90_1: # %cmpxchg.start 
; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -1719,7 +1667,6 @@ define void @test91(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: .LBB91_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -1742,7 +1689,6 @@ define void @test92(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: .LBB92_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB92_3 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore @@ -1761,7 +1707,6 @@ define void @test93(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1775,7 +1720,6 @@ define void @test93(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB93_2 ; PPC64LE-NEXT: blr @@ -1788,7 +1732,6 @@ define void @test94(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB94_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1802,7 +1745,6 @@ define void @test94(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB94_2 ; PPC64LE-NEXT: .LBB94_4: # %cmpxchg.nostore @@ 
-1817,7 +1759,6 @@ define void @test95(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1831,7 +1772,6 @@ define void @test95(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB95_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -1848,7 +1788,6 @@ define void @test96(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB96_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1862,7 +1801,6 @@ define void @test96(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB96_2 ; PPC64LE-NEXT: .LBB96_4: # %cmpxchg.nostore @@ -1877,7 +1815,6 @@ define void @test97(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1891,7 +1828,6 @@ define void @test97(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB97_2 ; PPC64LE-NEXT: # %bb.4: # %cmpxchg.end @@ -1908,7 +1844,6 @@ define void @test98(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 
4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB98_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1922,7 +1857,6 @@ define void @test98(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB98_2 ; PPC64LE-NEXT: .LBB98_4: # %cmpxchg.nostore @@ -1937,7 +1871,6 @@ define void @test99(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE: # %bb.0: # %cmpxchg.start ; PPC64LE-NEXT: lharx 6, 0, 3 ; PPC64LE-NEXT: clrlwi 4, 4, 16 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bne 0, .LBB99_4 ; PPC64LE-NEXT: # %bb.1: # %cmpxchg.fencedstore @@ -1951,7 +1884,6 @@ define void @test99(ptr %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # %bb.3: # %cmpxchg.releasedload ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lharx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 16 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: beq 0, .LBB99_2 ; PPC64LE-NEXT: .LBB99_4: # %cmpxchg.nostore diff --git a/llvm/test/CodeGen/PowerPC/builtins-bcd-transform.ll b/llvm/test/CodeGen/PowerPC/builtins-bcd-transform.ll new file mode 100644 index 0000000000000..449beeb18c2de --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/builtins-bcd-transform.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Testfile that verifies positive case (0 or 1 only) for BCD builtins national2packed, packed2zoned and zoned2packed. 
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s + +declare <16 x i8> @llvm.ppc.national2packed(<16 x i8>, i32 immarg) + +define <16 x i8> @tBcd_National2packed_imm0(<16 x i8> %a) { +; CHECK-LABEL: tBcd_National2packed_imm0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: bcdcfn. v2, v2, 0 +; CHECK-NEXT: blr +entry: + %0 = call <16 x i8> @llvm.ppc.national2packed(<16 x i8> %a, i32 0) + ret <16 x i8> %0 +} + +define <16 x i8> @tBcd_National2packed_imm1(<16 x i8> %a) { +; CHECK-LABEL: tBcd_National2packed_imm1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: bcdcfn. v2, v2, 1 +; CHECK-NEXT: blr +entry: + %0 = call <16 x i8> @llvm.ppc.national2packed(<16 x i8> %a, i32 1) + ret <16 x i8> %0 +} + +declare <16 x i8> @llvm.ppc.packed2national(<16 x i8>) + +define <16 x i8> @tBcd_Packed2national(<16 x i8> %a) { +; CHECK-LABEL: tBcd_Packed2national: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: bcdctn. v2, v2 +; CHECK-NEXT: blr +entry: + %0 = call <16 x i8> @llvm.ppc.packed2national(<16 x i8> %a) + ret <16 x i8> %0 +} + +declare <16 x i8> @llvm.ppc.packed2zoned(<16 x i8>, i32 immarg) + +define <16 x i8> @tBcd_Packed2zoned_imm0(<16 x i8> %a) { +; CHECK-LABEL: tBcd_Packed2zoned_imm0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: bcdctz. v2, v2, 0 +; CHECK-NEXT: blr +entry: + %0 = call <16 x i8> @llvm.ppc.packed2zoned(<16 x i8> %a, i32 0) + ret <16 x i8> %0 +} + +define <16 x i8> @tBcd_Packed2zoned_imm1(<16 x i8> %a) { +; CHECK-LABEL: tBcd_Packed2zoned_imm1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: bcdctz. 
v2, v2, 1 +; CHECK-NEXT: blr +entry: + %0 = call <16 x i8> @llvm.ppc.packed2zoned(<16 x i8> %a, i32 1) + ret <16 x i8> %0 +} + +declare <16 x i8> @llvm.ppc.zoned2packed(<16 x i8>, i32 immarg) + +define <16 x i8> @tBcd_Zoned2packed_imm0(<16 x i8> %a) { +; CHECK-LABEL: tBcd_Zoned2packed_imm0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: bcdcfz. v2, v2, 0 +; CHECK-NEXT: blr +entry: + %0 = call <16 x i8> @llvm.ppc.zoned2packed(<16 x i8> %a, i32 0) + ret <16 x i8> %0 +} + +define <16 x i8> @tBcd_Zoned2packed_imm1(<16 x i8> %a) { +; CHECK-LABEL: tBcd_Zoned2packed_imm1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: bcdcfz. v2, v2, 1 +; CHECK-NEXT: blr +entry: + %0 = call <16 x i8> @llvm.ppc.zoned2packed(<16 x i8> %a, i32 1) + ret <16 x i8> %0 +} diff --git a/llvm/test/CodeGen/PowerPC/loop-comment.ll b/llvm/test/CodeGen/PowerPC/loop-comment.ll index 34b29cbe901e9..530e67b4804fb 100644 --- a/llvm/test/CodeGen/PowerPC/loop-comment.ll +++ b/llvm/test/CodeGen/PowerPC/loop-comment.ll @@ -10,7 +10,6 @@ define void @test(ptr %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: .LBB0_1: # %cmpxchg.start ; PPC64LE-NEXT: # ; PPC64LE-NEXT: lbarx 6, 0, 3 -; PPC64LE-NEXT: clrlwi 6, 6, 24 ; PPC64LE-NEXT: cmplw 6, 4 ; PPC64LE-NEXT: bnelr 0 ; PPC64LE-NEXT: # %bb.2: # %cmpxchg.fencedstore diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 8b931f70aa5cc..999ecba7f1b9c 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -61,7 +61,6 @@ ; CHECK-NEXT: m - 'M' (Integer Multiplication and Division). ; CHECK-NEXT: mips-p8700 - MIPS p8700 processor. ; CHECK-NEXT: no-default-unroll - Disable default unroll preference.. -; CHECK-NEXT: no-rvc-hints - Disable RVC Hint Instructions.. ; CHECK-NEXT: no-sink-splat-operands - Disable sink splat operands to enable .vx, .vf,.wx, and .wf instructions. ; CHECK-NEXT: no-trailing-seq-cst-fence - Disable trailing fence for seq-cst store.. 
; CHECK-NEXT: optimized-nf2-segment-load-store - vlseg2eN.v and vsseg2eN.v are implemented as a wide memory op and shuffle. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index c2ae1ce491389..3e822d357b667 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -293,31 +293,6 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 } -; TODO: Remove once recursive deinterleaving support is removed -define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4_recursive(ptr %p) { -; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: ret - %vec = load <32 x i8>, ptr %p - %d0 = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec) - %d0.0 = extractvalue { <16 x i8>, <16 x i8> } %d0, 0 - %d0.1 = extractvalue { <16 x i8>, <16 x i8> } %d0, 1 - %d1 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.0) - %t0 = extractvalue { <8 x i8>, <8 x i8> } %d1, 0 - %t2 = extractvalue { <8 x i8>, <8 x i8> } %d1, 1 - %d2 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.1) - %t1 = extractvalue { <8 x i8>, <8 x i8> } %d2, 0 - %t3 = extractvalue { <8 x i8>, <8 x i8> } %d2, 1 - - %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } undef, <8 x i8> %t0, 0 - %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 - %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 - %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 - ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 -} - define { <8 x i8>, <8 x i8>, <8 
x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: @@ -414,45 +389,3 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, < %res7 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6, <8 x i8> %t6, 7 ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res7 } - -; TODO: Remove once recursive deinterleaving support is removed -define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8_recursive(ptr %ptr) { -; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %vec = load <16 x i32>, ptr %ptr - %d0 = call { <8 x i32>, <8 x i32> } @llvm.vector.deinterleave2.v16i32(<16 x i32> %vec) - %d0.0 = extractvalue { <8 x i32>, <8 x i32> } %d0, 0 - %d0.1 = extractvalue { <8 x i32>, <8 x i32> } %d0, 1 - %d1 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.0) - %d1.0 = extractvalue { <4 x i32>, <4 x i32> } %d1, 0 - %d1.1 = extractvalue { <4 x i32>, <4 x i32> } %d1, 1 - %d2 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.1) - %d2.0 = extractvalue { <4 x i32>, <4 x i32> } %d2, 0 - %d2.1 = extractvalue { <4 x i32>, <4 x i32> } %d2, 1 - - %d3 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d3, 0 - %t4 = extractvalue { <2 x i32>, <2 x i32> } %d3, 1 - %d4 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.1) - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d4, 0 - %t6 = extractvalue { <2 x i32>, <2 x i32> } %d4, 1 - %d5 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.0) - %t1 = extractvalue { <2 
x i32>, <2 x i32> } %d5, 0 - %t5 = extractvalue { <2 x i32>, <2 x i32> } %d5, 1 - %d6 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.1) - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d6, 0 - %t7 = extractvalue { <2 x i32>, <2 x i32> } %d6, 1 - - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 - %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 - %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 - %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 - %res4 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3, <2 x i32> %t4, 4 - %res5 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res4, <2 x i32> %t5, 5 - %res6 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res5, <2 x i32> %t6, 6 - %res7 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res6, <2 x i32> %t7, 7 - ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res7 -} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index c394e7aa2e3e8..a49eeed3605c5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -203,20 +203,6 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3 ret void } -; TODO: Remove once recursive interleaving support is removed -define void 
@vector_interleave_store_factor4_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %c) - %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %d) - %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) - store <16 x i32> %v2, ptr %p - ret void -} - define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor5: ; CHECK: # %bb.0: @@ -260,23 +246,3 @@ define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i3 store <32 x i32> %v, ptr %p ret void } - -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor8_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %e) - %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %c, <4 x i32> %g) - %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) - - %v3 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %f) - %v4 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %d, <4 x i32> %h) - %v5 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v3, <8 x i32> %v4) - - %v6 = call <32 x i32> @llvm.vector.interleave2.v32i32(<16 x i32> %v2, <16 x i32> %v5) - store <32 x i32> %v6, ptr %p - ret void -} diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 8ac4c7447c7d4..5e3ae2faf1a53 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -302,15 +302,11 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vpload_factor4_intrinsics(p ; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) - %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) - %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 - %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 - %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 - %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + %d = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.vector.deinterleave4.v8i32(<8 x i32> %wide.masked.load) + %t0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 0 + %t1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 1 + %t2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 2 + %t3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 3 %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-float.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverse-float.ll similarity index 71% rename from 
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-float.ll rename to llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverse-float.ll index 1d21cb5586984..ad84aaccc2171 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-float.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverse-float.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+m,+f,+d,+v,+zvfh -verify-machineinstrs -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin -verify-machineinstrs -riscv-v-vector-bits-min=128 \ +; RUN: < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs -riscv-v-vector-bits-min=128 \ ; RUN: < %s | FileCheck %s define <2 x double> @test_vp_reverse_v2f64_masked(<2 x double> %src, <2 x i1> %mask, i32 zeroext %evl) { @@ -88,3 +90,32 @@ define <4 x half> @test_vp_reverse_v4f16(<4 x half> %src, i32 zeroext %evl) { %dst = call <4 x half> @llvm.experimental.vp.reverse.v4f16(<4 x half> %src, <4 x i1> splat (i1 1), i32 %evl) ret <4 x half> %dst } + +define <4 x bfloat> @test_vp_reverse_v4bf16_masked(<4 x bfloat> %src, <4 x i1> %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_reverse_v4bf16_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v9, v0.t +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vrsub.vx v10, v9, a0, v0.t +; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %dst = call <4 x bfloat> @llvm.experimental.vp.reverse.v4bf16(<4 x bfloat> %src, <4 x i1> %mask, i32 %evl) + ret <4 x bfloat> %dst +} + +define <4 x bfloat> @test_vp_reverse_v4bf16(<4 x bfloat> %src, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_reverse_v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, a0, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vrsub.vx v10, v9, a1 +; CHECK-NEXT: 
vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + + %dst = call <4 x bfloat> @llvm.experimental.vp.reverse.v4bf16(<4 x bfloat> %src, <4 x i1> splat (i1 1), i32 %evl) + ret <4 x bfloat> %dst +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverse-int.ll similarity index 100% rename from llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-int.ll rename to llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverse-int.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splice.ll index 7bf22247093f7..8160e62a43106 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splice.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zvfh -verify-machineinstrs -riscv-v-vector-bits-min=128 \ -; RUN: < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zvfh,+zvfbfmin -verify-machineinstrs -riscv-v-vector-bits-min=128 \ +; RUN: < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs -riscv-v-vector-bits-min=128 \ +; RUN: < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define <2 x i64> @test_vp_splice_v2i64(<2 x i64> %va, <2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) { ; CHECK-LABEL: test_vp_splice_v2i64: @@ -299,3 +301,103 @@ define <8 x half> @test_vp_splice_v8f16_masked(<8 x half> %va, <8 x half> %vb, < %v = call <8 x half> @llvm.experimental.vp.splice.v8f16(<8 x half> %va, <8 x half> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb) ret <8 x half> %v } + +define <4 x i32> @test_vp_splice_v4i32_with_firstelt(i32 %first, <4 x i32> %vb, <4 x i1> %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_splice_v4i32_with_firstelt: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli 
zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslide1up.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %va = insertelement <4 x i32> poison, i32 %first, i32 0 + %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 0, <4 x i1> %mask, i32 1, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @test_vp_splice_v4i32_with_splat_firstelt(i32 %first, <4 x i32> %vb, <4 x i1> %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_splice_v4i32_with_splat_firstelt: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslide1up.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %ins = insertelement <4 x i32> poison, i32 %first, i32 0 + %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %splat, <4 x i32> %vb, i32 0, <4 x i1> %mask, i32 1, i32 %evl) + ret <4 x i32> %v +} + +define <4 x float> @test_vp_splice_nxv2f32_with_firstelt(float %first, <4 x float> %vb, <4 x i1> %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_splice_nxv2f32_with_firstelt: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfslide1up.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %va = insertelement <4 x float> poison, float %first, i32 0 + %v = call <4 x float> @llvm.experimental.vp.splice.nxv2f32(<4 x float> %va, <4 x float> %vb, i32 0, <4 x i1> %mask, i32 1, i32 %evl) + ret <4 x float> %v +} + +define <4 x half> @test_vp_splice_nxv2f16_with_firstelt(half %first, <4 x half> %vb, <4 x i1> %mask, i32 zeroext %evl) { +; ZVFH-LABEL: test_vp_splice_nxv2f16_with_firstelt: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vfslide1up.vf v9, v8, fa0, v0.t +; ZVFH-NEXT: vmv1r.v v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: test_vp_splice_nxv2f16_with_firstelt: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.w a1, fa0 +; 
ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.s.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vslideup.vi v9, v8, 1, v0.t +; ZVFHMIN-NEXT: vmv1r.v v8, v9 +; ZVFHMIN-NEXT: ret + %va = insertelement <4 x half> poison, half %first, i32 0 + %v = call <4 x half> @llvm.experimental.vp.splice.nxv2f16(<4 x half> %va, <4 x half> %vb, i32 0, <4 x i1> %mask, i32 1, i32 %evl) + ret <4 x half> %v +} + +define <8 x bfloat> @test_vp_splice_v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + + %v = call <8 x bfloat> @llvm.experimental.vp.splice.v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, i32 5, <8 x i1> splat (i1 1), i32 %evla, i32 %evlb) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @test_vp_splice_v8bf16_negative_offset(<8 x bfloat> %va, <8 x bfloat> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8bf16_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + + %v = call <8 x bfloat> @llvm.experimental.vp.splice.v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, i32 -5, <8 x i1> splat (i1 1), i32 %evla, i32 %evlb) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @test_vp_splice_v8bf16_masked(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8bf16_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli 
zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <8 x bfloat> @llvm.experimental.vp.splice.v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb) + ret <8 x bfloat> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 9344c52098684..b11db3d61f693 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -380,31 +380,6 @@ define { , , , , , , } %res3 } -; TODO: Remove once recursive deinterleaving support is removed -define { , , , } @vector_deinterleave_load_factor4_recursive(ptr %p) { -; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: ret - %vec = load , ptr %p - %d0 = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } undef, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - define { , , , , } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: @@ -500,45 +475,3 @@ define { , , , , , , , , , , } %res6, %t7, 7 ret { , , , , , , , } %res7 } - -; TODO: Remove once recursive deinterleaving support is removed -define {, , , , , , , } @vector_deinterleave_load_factor8_recursive(ptr %ptr) { -; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: -; CHECK: # %bb.0: 
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t4 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t6 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t1 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t3 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 - - %res0 = insertvalue { , , , , , , , } undef, %t0, 0 - %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 - %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 - %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 - %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 - %res7 = insertvalue { , , , , , , , } %res6, %t7, 7 - ret { , , , , , , , } %res7 -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index a5811e697634a..af55aaa8fce86 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -260,20 +260,6 @@ define void @vector_interleave_store_factor4( %a, %a, %b, %c, %d, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetvli a1, zero, e32, m2, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call @llvm.vector.interleave2.nxv8i32( %a, %c) - %v1 = call @llvm.vector.interleave2.nxv8i32( %b, %d) - %v2 = call @llvm.vector.interleave2.nxv16i32( %v0, %v1) - store %v2, ptr %p - ret void -} - define void @vector_interleave_store_factor5( %a, %b, %c, %d, %e, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor5: ; CHECK: # %bb.0: @@ -317,23 +303,3 @@ define void @vector_interleave_store_factor8( %a, %v, ptr %p ret void } - -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor8_recursive( %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call @llvm.vector.interleave2.nxv4i32( %a, %e) - %v1 = call @llvm.vector.interleave2.nxv4i32( %c, %g) - %v2 = call @llvm.vector.interleave2.nxv8i32( %v0, %v1) - - %v3 = call @llvm.vector.interleave2.nxv4i32( %b, %f) - %v4 = call @llvm.vector.interleave2.nxv4i32( %d, %h) - %v5 = call @llvm.vector.interleave2.nxv8i32( %v3, %v4) - - %v6 = call @llvm.vector.interleave2.nxv16i32( %v2, %v5) - store %v6, ptr %p - ret void -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll index 6008ea43e9158..9c8c5da75ff7c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zvfh,+zvfbfmin -verify-machineinstrs \ -; RUN: < %s | FileCheck %s -; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs \ -; RUN: < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zfh,+zfbfmin,+zvfh,+zvfbfmin -verify-machineinstrs \ +; RUN: < %s | FileCheck %s 
--check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zfh,+zfbfmin,+zvfhmin,+zvfbfmin -verify-machineinstrs \ +; RUN: < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define @test_vp_splice_nxv2i64( %va, %vb, i32 zeroext %evla, i32 zeroext %evlb) { ; CHECK-LABEL: test_vp_splice_nxv2i64: @@ -505,3 +505,73 @@ define @test_vp_splice_nxv2bf16_masked( @llvm.experimental.vp.splice.nxv2bf16( %va, %vb, i32 5, %mask, i32 %evla, i32 %evlb) ret %v } + +define @test_vp_splice_nxv2i32_with_firstelt(i32 %first, %vb, %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_splice_nxv2i32_with_firstelt: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslide1up.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %va = insertelement poison, i32 %first, i32 0 + %v = call @llvm.experimental.vp.splice.nxv2i32( %va, %vb, i32 0, %mask, i32 1, i32 %evl) + ret %v +} + +define @test_vp_splice_nxv2i32_with_splat_firstelt(i32 %first, %vb, %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_splice_nxv2i32_with_splat_firstelt: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslide1up.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %ins = insertelement poison, i32 %first, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + %v = call @llvm.experimental.vp.splice.nxv2i32( %splat, %vb, i32 0, %mask, i32 1, i32 %evl) + ret %v +} + +define @test_vp_splice_nxv2f32_with_firstelt(float %first, %vb, %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_splice_nxv2f32_with_firstelt: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfslide1up.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %va = insertelement poison, float %first, i32 0 + %v = call @llvm.experimental.vp.splice.nxv2f32( %va, %vb, i32 0, %mask, i32 1, i32 %evl) + ret %v +} + +define @test_vp_splice_nxv2f16_with_firstelt(half %first, %vb, %mask, i32 
zeroext %evl) { +; ZVFH-LABEL: test_vp_splice_nxv2f16_with_firstelt: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vfslide1up.vf v9, v8, fa0, v0.t +; ZVFH-NEXT: vmv1r.v v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: test_vp_splice_nxv2f16_with_firstelt: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vslide1up.vx v9, v8, a1, v0.t +; ZVFHMIN-NEXT: vmv1r.v v8, v9 +; ZVFHMIN-NEXT: ret + %va = insertelement poison, half %first, i32 0 + %v = call @llvm.experimental.vp.splice.nxv2f16( %va, %vb, i32 0, %mask, i32 1, i32 %evl) + ret %v +} + +define @test_vp_splice_nxv2bf16_with_firstelt(bfloat %first, %vb, %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_vp_splice_nxv2bf16_with_firstelt: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vslide1up.vx v9, v8, a1, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %va = insertelement poison, bfloat %first, i32 0 + %v = call @llvm.experimental.vp.splice.nxv2bf16( %va, %vb, i32 0, %mask, i32 1, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 1868154052272..35f01f608b56e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -84,15 +84,11 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %d = call { , , , } 
@llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 @@ -214,28 +210,15 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t4 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t6 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t1 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t3 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 + %d = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , , , , , , , } %d, 0 + %t1 = extractvalue { , , , , , , , } %d, 1 + %t2 = extractvalue { , , , , , , , } %d, 2 + %t3 = extractvalue { , , , , , , , } %d, 3 + %t4 = extractvalue { , , , , , , , } %d, 4 + %t5 = extractvalue { , , , , , , , } %d, 5 + %t6 = extractvalue { , , , , , , , } %d, 6 + %t7 = extractvalue { , , , , , , , } %d, 7 %res0 = insertvalue { , , , , , , , } poison, %t0, 0 %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 @@ -323,10 +306,8 @@ define void @store_factor4_v2( %v0, %v1, pt ; RV64-NEXT: vsseg4e32.v v8, (a0) ; RV64-NEXT: ret %rvl 
= mul i32 %evl, 8 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) + %interleaved.vec = call @llvm.vector.interleave4.nxv4i32( %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -430,14 +411,8 @@ define void @store_factor8_v2( %v0, %v1, pt ; RV64-NEXT: vsseg8e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 8 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - %interleaved.vec3 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) - %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, splat (i1 true), i32 %rvl) + %interleaved.vec = call @llvm.vector.interleave8.nxv8i32( %v0, %v1, %v0, %v1, %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -485,19 +460,13 @@ define {, , , @llvm.vector.interleave2.nxv4i1( %mask, %mask) - %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) - %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask2, i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = 
extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %interleaved.mask = call @llvm.vector.interleave4.nxv8i1( %mask, %mask, %mask, %mask) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 @@ -674,175 +643,14 @@ define void @masked_store_factor4_v2( %mask, ; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t ; RV64-NEXT: ret %rvl = mul i32 %evl, 4 - %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) - %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) - %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, %interleaved.mask2, i32 %rvl) + %interleaved.mask = call @llvm.vector.interleave4.nxv4i1( %mask, %mask, %mask, %mask) + %interleaved.vec = call @llvm.vector.interleave4.nxv2i32( %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) ret void } ; Negative tests -; We should not transform this function because the deinterleave tree is not in a desired form. 
-define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %evl) { -; RV32-LABEL: incorrect_extract_value_index: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wi v12, v8, 0 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v9, v12, a0 -; RV32-NEXT: vnsrl.wi v8, v12, 0 -; RV32-NEXT: vmv.v.v v10, v9 -; RV32-NEXT: vmv.v.v v11, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: incorrect_extract_value_index: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wi v12, v8, 0 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v9, v12, a0 -; RV64-NEXT: vnsrl.wi v8, v12, 0 -; RV64-NEXT: vmv.v.v v10, v9 -; RV64-NEXT: vmv.v.v v11, v9 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 0 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 1 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } poison, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - -; We should not transform this function because the expression is not a balanced tree. 
-define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %evl) { -; RV32-LABEL: not_balanced_load_tree: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wx v8, v12, a0 -; RV32-NEXT: vnsrl.wi v16, v12, 0 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wi v10, v16, 0 -; RV32-NEXT: vnsrl.wx v11, v16, a0 -; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV32-NEXT: vnsrl.wx v12, v11, a0 -; RV32-NEXT: vnsrl.wi v11, v11, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: not_balanced_load_tree: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wx v8, v12, a0 -; RV64-NEXT: vnsrl.wi v16, v12, 0 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wi v10, v16, 0 -; RV64-NEXT: vnsrl.wx v11, v16, a0 -; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV64-NEXT: vnsrl.wx v12, v11, a0 -; RV64-NEXT: vnsrl.wi v11, v11, 0 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %t0 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t1 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } poison, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - -define void @not_balanced_store_tree( 
%v0, %v1, %v2, ptr %ptr, i32 %evl) { -; RV32-LABEL: not_balanced_store_tree: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v12, v8, v8 -; RV32-NEXT: li a2, -1 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: vwmaccu.vx v12, a2, v8 -; RV32-NEXT: srli a3, a3, 3 -; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vx v8, v12, a3 -; RV32-NEXT: vslideup.vx v12, v8, a3 -; RV32-NEXT: vwaddu.vv v16, v12, v9 -; RV32-NEXT: vwmaccu.vx v16, a2, v9 -; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma -; RV32-NEXT: vwaddu.vv v12, v16, v10 -; RV32-NEXT: vwmaccu.vx v12, a2, v10 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vse32.v v12, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: not_balanced_store_tree: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v12, v8, v8 -; RV64-NEXT: li a2, -1 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: vwmaccu.vx v12, a2, v8 -; RV64-NEXT: srli a3, a3, 3 -; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vx v8, v12, a3 -; RV64-NEXT: vslideup.vx v12, v8, a3 -; RV64-NEXT: vwaddu.vv v16, v12, v9 -; RV64-NEXT: vwmaccu.vx v16, a2, v9 -; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma -; RV64-NEXT: vwaddu.vv v12, v16, v10 -; RV64-NEXT: vwmaccu.vx v12, a2, v10 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vse32.v v12, (a0) -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) - ret void -} - define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: @@ -931,48 
+739,58 @@ define {, } @not_same_mask( define {, , , } @invalid_evl(ptr %ptr, i32 %evl) { ; RV32-LABEL: invalid_evl: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV32-NEXT: ori a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wx v12, v8, a0 -; RV32-NEXT: vnsrl.wi v14, v8, 0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs4r.v v8, (a0) ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v10, v14, a0 -; RV32-NEXT: vnsrl.wi v8, v14, 0 -; RV32-NEXT: vnsrl.wx v11, v12, a0 -; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: vlseg4e32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: invalid_evl: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV64-NEXT: ori a1, a1, 1 ; RV64-NEXT: slli a1, a1, 32 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wx v12, v8, a0 -; RV64-NEXT: vnsrl.wi v14, v8, 0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs4r.v v8, (a0) ; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v10, v14, a0 -; RV64-NEXT: vnsrl.wi v8, v14, 0 -; RV64-NEXT: vnsrl.wx 
v11, v12, a0 -; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: vlseg4e32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %rvl = or i32 %evl, 1 %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 diff --git a/llvm/test/CodeGen/SystemZ/vec-max-min-zerosplat.ll b/llvm/test/CodeGen/SystemZ/vec-max-min-zerosplat.ll index e8d4b2828c84b..2125a0b8912b1 100644 --- a/llvm/test/CodeGen/SystemZ/vec-max-min-zerosplat.ll +++ b/llvm/test/CodeGen/SystemZ/vec-max-min-zerosplat.ll @@ -1,12 +1,15 @@ -; Test vector maximum/minimum with a zero splat on z14. -; +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; Test vector maximum/minimum with a zero splat on z14. 
+ define <2 x double> @f1(<2 x double> %val) { ; CHECK-LABEL: f1: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfmaxdb %v24, %v24, %v0, 4 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfmaxdb %v24, %v24, %v0, 4 +; CHECK-NEXT: br %r14 %cmp = fcmp ogt <2 x double> %val, zeroinitializer %ret = select <2 x i1> %cmp, <2 x double> %val, <2 x double> zeroinitializer ret <2 x double> %ret @@ -14,9 +17,10 @@ define <2 x double> @f1(<2 x double> %val) { define <2 x double> @f2(<2 x double> %val) { ; CHECK-LABEL: f2: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfmindb %v24, %v24, %v0, 4 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfmindb %v24, %v24, %v0, 4 +; CHECK-NEXT: br %r14 %cmp = fcmp olt <2 x double> %val, zeroinitializer %ret = select <2 x i1> %cmp, <2 x double> %val, <2 x double> zeroinitializer ret <2 x double> %ret @@ -24,9 +28,10 @@ define <2 x double> @f2(<2 x double> %val) { define <4 x float> @f3(<4 x float> %val) { ; CHECK-LABEL: f3: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfmaxsb %v24, %v24, %v0, 4 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfmaxsb %v24, %v24, %v0, 4 +; CHECK-NEXT: br %r14 %cmp = fcmp ogt <4 x float> %val, zeroinitializer %ret = select <4 x i1> %cmp, <4 x float> %val, <4 x float> zeroinitializer ret <4 x float> %ret @@ -34,9 +39,10 @@ define <4 x float> @f3(<4 x float> %val) { define <4 x float> @f4(<4 x float> %val) { ; CHECK-LABEL: f4: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfminsb %v24, %v24, %v0, 4 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfminsb %v24, %v24, %v0, 4 +; CHECK-NEXT: br %r14 %cmp = fcmp olt <4 x float> %val, zeroinitializer %ret = select <4 x i1> %cmp, <4 x float> %val, <4 x float> zeroinitializer ret <4 x float> %ret @@ -44,10 +50,11 @@ define <4 x float> @f4(<4 x float> %val) { define <2 x double> @f5(<2 x double> %val) { ; CHECK-LABEL: f5: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfchedb 
%v1, %v0, %v24 -; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfchedb %v1, %v0, %v24 +; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 +; CHECK-NEXT: br %r14 %cmp = fcmp ugt <2 x double> %val, zeroinitializer %ret = select <2 x i1> %cmp, <2 x double> %val, <2 x double> zeroinitializer ret <2 x double> %ret @@ -55,10 +62,11 @@ define <2 x double> @f5(<2 x double> %val) { define <2 x double> @f6(<2 x double> %val) { ; CHECK-LABEL: f6: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfchedb %v1, %v24, %v0 -; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfchedb %v1, %v24, %v0 +; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 +; CHECK-NEXT: br %r14 %cmp = fcmp ult <2 x double> %val, zeroinitializer %ret = select <2 x i1> %cmp, <2 x double> %val, <2 x double> zeroinitializer ret <2 x double> %ret @@ -66,10 +74,11 @@ define <2 x double> @f6(<2 x double> %val) { define <4 x float> @f7(<4 x float> %val) { ; CHECK-LABEL: f7: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfchesb %v1, %v0, %v24 -; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfchesb %v1, %v0, %v24 +; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 +; CHECK-NEXT: br %r14 %cmp = fcmp ugt <4 x float> %val, zeroinitializer %ret = select <4 x i1> %cmp, <4 x float> %val, <4 x float> zeroinitializer ret <4 x float> %ret @@ -77,10 +86,11 @@ define <4 x float> @f7(<4 x float> %val) { define <4 x float> @f8(<4 x float> %val) { ; CHECK-LABEL: f8: -; CHECK: vgbm %v0, 0 -; CHECK-NEXT: vfchesb %v1, %v24, %v0 -; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 -; CHECK-NEXT: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vfchesb %v1, %v24, %v0 +; CHECK-NEXT: vsel %v24, %v0, %v24, %v1 +; CHECK-NEXT: br %r14 %cmp = fcmp ult <4 x float> %val, zeroinitializer %ret = select <4 x i1> %cmp, <4 x float> %val, <4 x float> zeroinitializer ret 
<4 x float> %ret diff --git a/llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir b/llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir new file mode 100644 index 0000000000000..f8ce9ea8be650 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir @@ -0,0 +1,189 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=i686-linux-gnu -run-pass=regbankselect,instruction-select %s -o - | FileCheck %s --check-prefixes GISEL-I686 + +--- +name: test_sincos_f32 +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: test_sincos_f32 + ; GISEL-I686: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (invariant load (s32) from %fixed-stack.0, align 16) + ; GISEL-I686-NEXT: [[LEA32r:%[0-9]+]]:gr32 = LEA32r %stack.0, 1, $noreg, 0, $noreg + ; GISEL-I686-NEXT: [[LEA32r1:%[0-9]+]]:gr32 = LEA32r %stack.1, 1, $noreg, 0, $noreg + ; GISEL-I686-NEXT: ADJCALLSTACKDOWN32 12, 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: MOV32mr [[COPY]], 1, $noreg, 0, $noreg, [[MOV32rm]] :: (store (s32) into stack, align 1) + ; 
GISEL-I686-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: MOV32mr [[COPY1]], 1, $noreg, 4, $noreg, [[LEA32r]] :: (store (s32) into stack + 4, align 1) + ; GISEL-I686-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: MOV32mr [[COPY2]], 1, $noreg, 8, $noreg, [[LEA32r1]] :: (store (s32) into stack + 8, align 1) + ; GISEL-I686-NEXT: CALLpcrel32 &sincosf, csr_32, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: ADJCALLSTACKUP32 12, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %stack.0) + ; GISEL-I686-NEXT: [[LD_Fp32m1:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %stack.1) + ; GISEL-I686-NEXT: $fp0 = COPY [[LD_Fp32m]] + ; GISEL-I686-NEXT: $fp1 = COPY [[LD_Fp32m1]] + ; GISEL-I686-NEXT: RET 0, implicit $fp0, implicit $fp1 + %1:_(p0) = G_FRAME_INDEX %fixed-stack.0 + %0:_(s32) = G_LOAD %1(p0) :: (invariant load (s32) from %fixed-stack.0, align 16) + %4:_(p0) = G_FRAME_INDEX %stack.0 + %5:_(p0) = G_FRAME_INDEX %stack.1 + ADJCALLSTACKDOWN32 12, 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + %6:_(p0) = COPY $esp + %7:_(s32) = G_CONSTANT i32 0 + %8:_(p0) = G_PTR_ADD %6, %7(s32) + G_STORE %0(s32), %8(p0) :: (store (s32) into stack, align 1) + %9:_(p0) = COPY $esp + %10:_(s32) = G_CONSTANT i32 4 + %11:_(p0) = G_PTR_ADD %9, %10(s32) + G_STORE %4(p0), %11(p0) :: (store (s32) into stack + 4, align 1) + %12:_(p0) = COPY $esp + %13:_(s32) = G_CONSTANT i32 8 + %14:_(p0) = G_PTR_ADD %12, %13(s32) + G_STORE %5(p0), %14(p0) :: (store (s32) into stack + 8, align 1) + CALLpcrel32 &sincosf, csr_32, implicit $esp, implicit $ssp + ADJCALLSTACKUP32 12, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, 
implicit $ssp + %2:_(s32) = G_LOAD %4(p0) :: (load (s32) from %stack.0) + %3:_(s32) = G_LOAD %5(p0) :: (load (s32) from %stack.1) + $fp0 = COPY %2(s32) + $fp1 = COPY %3(s32) + RET 0, implicit $fp0, implicit $fp1 +... +--- +name: test_sincos_f64 +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 0, size: 8, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: test_sincos_f64 + ; GISEL-I686: [[LEA32r:%[0-9]+]]:gr32 = LEA32r %fixed-stack.0, 1, $noreg, 0, $noreg + ; GISEL-I686-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (invariant load (s32) from %fixed-stack.0, align 16) + ; GISEL-I686-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[LEA32r]], 1, $noreg, 4, $noreg :: (invariant load (s32) from %fixed-stack.0 + 4, basealign 16) + ; GISEL-I686-NEXT: [[LEA32r1:%[0-9]+]]:gr32 = LEA32r %stack.0, 1, $noreg, 0, $noreg + ; GISEL-I686-NEXT: [[LEA32r2:%[0-9]+]]:gr32 = LEA32r %stack.1, 1, $noreg, 0, $noreg + ; GISEL-I686-NEXT: ADJCALLSTACKDOWN32 16, 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: [[MOV32r0_:%[0-9]+]]:gr32_nosp = MOV32r0 implicit-def dead $eflags + ; GISEL-I686-NEXT: [[LEA32r3:%[0-9]+]]:gr32 = LEA32r [[COPY]], 1, [[MOV32r0_]], 
0, $noreg + ; GISEL-I686-NEXT: MOV32mr [[COPY]], 1, $noreg, 0, $noreg, [[MOV32rm]] :: (store (s32) into stack, align 1) + ; GISEL-I686-NEXT: MOV32mr [[LEA32r3]], 1, $noreg, 4, $noreg, [[MOV32rm1]] :: (store (s32) into stack + 4, align 1) + ; GISEL-I686-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: MOV32mr [[COPY1]], 1, $noreg, 8, $noreg, [[LEA32r1]] :: (store (s32) into stack + 8, align 1) + ; GISEL-I686-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: MOV32mr [[COPY2]], 1, $noreg, 12, $noreg, [[LEA32r2]] :: (store (s32) into stack + 12, align 1) + ; GISEL-I686-NEXT: CALLpcrel32 &sincos, csr_32, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: ADJCALLSTACKUP32 16, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: $fp0 = IMPLICIT_DEF + ; GISEL-I686-NEXT: $fp1 = IMPLICIT_DEF + ; GISEL-I686-NEXT: RET 0, implicit $fp0, implicit $fp1 + %1:_(p0) = G_FRAME_INDEX %fixed-stack.0 + %25:_(s32) = G_LOAD %1(p0) :: (invariant load (s32) from %fixed-stack.0, align 16) + %17:_(s32) = G_CONSTANT i32 4 + %26:_(p0) = G_PTR_ADD %1, %17(s32) + %27:_(s32) = G_LOAD %26(p0) :: (invariant load (s32) from %fixed-stack.0 + 4, basealign 16) + %4:_(p0) = G_FRAME_INDEX %stack.0 + %5:_(p0) = G_FRAME_INDEX %stack.1 + ADJCALLSTACKDOWN32 16, 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + %6:_(p0) = COPY $esp + %7:_(s32) = G_CONSTANT i32 0 + %8:_(p0) = G_PTR_ADD %6, %7(s32) + G_STORE %25(s32), %8(p0) :: (store (s32) into stack, align 1) + %24:_(p0) = G_PTR_ADD %8, %17(s32) + G_STORE %27(s32), %24(p0) :: (store (s32) into stack + 4, align 1) + %9:_(p0) = COPY $esp + %10:_(s32) = G_CONSTANT i32 8 + %11:_(p0) = G_PTR_ADD %9, %10(s32) + G_STORE %4(p0), %11(p0) :: (store (s32) into stack + 8, align 1) + %12:_(p0) = COPY $esp + %13:_(s32) = G_CONSTANT i32 12 + %14:_(p0) = G_PTR_ADD %12, %13(s32) + G_STORE %5(p0), %14(p0) :: (store (s32) into stack + 12, align 1) + 
CALLpcrel32 &sincos, csr_32, implicit $esp, implicit $ssp + ADJCALLSTACKUP32 16, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + $fp0 = IMPLICIT_DEF + $fp1 = IMPLICIT_DEF + RET 0, implicit $fp0, implicit $fp1 +... +--- +name: test_sincos_f80 +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 0, size: 10, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: default, offset: 0, size: 10, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 0, size: 10, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: test_sincos_f80 + ; GISEL-I686: [[LD_Fp80m:%[0-9]+]]:rfp80 = nofpexcept LD_Fp80m %fixed-stack.0, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (invariant load (s80) from %fixed-stack.0, align 16) + ; GISEL-I686-NEXT: [[LEA32r:%[0-9]+]]:gr32 = LEA32r %stack.0, 1, $noreg, 0, $noreg + ; GISEL-I686-NEXT: [[LEA32r1:%[0-9]+]]:gr32 = LEA32r %stack.1, 1, $noreg, 0, $noreg + ; GISEL-I686-NEXT: ADJCALLSTACKDOWN32 20, 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: nofpexcept ST_FpP80m [[COPY]], 1, $noreg, 0, $noreg, [[LD_Fp80m]], implicit-def dead $fpsw, implicit $fpcw :: (store (s80) into stack, align 1) + ; GISEL-I686-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: MOV32mr [[COPY1]], 1, $noreg, 12, $noreg, [[LEA32r]] :: (store (s32) 
into stack + 12, align 1) + ; GISEL-I686-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $esp + ; GISEL-I686-NEXT: MOV32mr [[COPY2]], 1, $noreg, 16, $noreg, [[LEA32r1]] :: (store (s32) into stack + 16, align 1) + ; GISEL-I686-NEXT: CALLpcrel32 &sincosl, csr_32, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: ADJCALLSTACKUP32 20, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + ; GISEL-I686-NEXT: [[LD_Fp80m1:%[0-9]+]]:rfp80 = nofpexcept LD_Fp80m %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s80) from %stack.0, align 16) + ; GISEL-I686-NEXT: [[LD_Fp80m2:%[0-9]+]]:rfp80 = nofpexcept LD_Fp80m %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s80) from %stack.1, align 16) + ; GISEL-I686-NEXT: $fp0 = COPY [[LD_Fp80m1]] + ; GISEL-I686-NEXT: $fp1 = COPY [[LD_Fp80m2]] + ; GISEL-I686-NEXT: RET 0, implicit $fp0, implicit $fp1 + %1:_(p0) = G_FRAME_INDEX %fixed-stack.0 + %0:_(s80) = G_LOAD %1(p0) :: (invariant load (s80) from %fixed-stack.0, align 16) + %4:_(p0) = G_FRAME_INDEX %stack.0 + %5:_(p0) = G_FRAME_INDEX %stack.1 + ADJCALLSTACKDOWN32 20, 0, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + %6:_(p0) = COPY $esp + %7:_(s32) = G_CONSTANT i32 0 + %8:_(p0) = G_PTR_ADD %6, %7(s32) + G_STORE %0(s80), %8(p0) :: (store (s80) into stack, align 1) + %9:_(p0) = COPY $esp + %10:_(s32) = G_CONSTANT i32 12 + %11:_(p0) = G_PTR_ADD %9, %10(s32) + G_STORE %4(p0), %11(p0) :: (store (s32) into stack + 12, align 1) + %12:_(p0) = COPY $esp + %13:_(s32) = G_CONSTANT i32 16 + %14:_(p0) = G_PTR_ADD %12, %13(s32) + G_STORE %5(p0), %14(p0) :: (store (s32) into stack + 16, align 1) + CALLpcrel32 &sincosl, csr_32, implicit $esp, implicit $ssp + ADJCALLSTACKUP32 20, 0, implicit-def $esp, implicit-def $eflags, implicit-def $ssp, implicit $esp, implicit $ssp + %2:_(s80) = G_LOAD %4(p0) :: (load (s80) from %stack.0, align 16) + %3:_(s80) = G_LOAD %5(p0) :: 
(load (s80) from %stack.1, align 16) + $fp0 = COPY %2(s80) + $fp1 = COPY %3(s80) + RET 0, implicit $fp0, implicit $fp1 +... diff --git a/llvm/test/CodeGen/X86/isel-fabs-x87.ll b/llvm/test/CodeGen/X86/isel-fabs-x87.ll index 8b846499946cc..a0534e6a1a82e 100644 --- a/llvm/test/CodeGen/X86/isel-fabs-x87.ll +++ b/llvm/test/CodeGen/X86/isel-fabs-x87.ll @@ -1,8 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+x87,-sse2,-sse | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+x87,-sse2,-sse -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=i686-- -mattr=+x87,-sse2,-sse | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=i686-- -mattr=+x87,-sse2,-sse -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+x87,-sse2,-sse -fast-isel | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+x87,-sse2,-sse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-- -mattr=+x87,-sse2,-sse | FileCheck %s --check-prefixes=X86,SDAG-ISEL +; RUN: llc < %s -mtriple=i686-- -mattr=+x87,-sse2,-sse -fast-isel | FileCheck %s --check-prefixes=X86,Fast-ISEL +; RUN: llc < %s -mtriple=i686-- -mattr=+x87,-sse2,-sse -global-isel -global-isel-abort=0 | FileCheck %s --check-prefixes=X86,GISEL-ISEL + +define void @test_float_abs(ptr %argptr) { +; SDAG-ISEL-LABEL: test_float_abs: +; SDAG-ISEL: # %bb.0: +; SDAG-ISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; SDAG-ISEL-NEXT: andb $127, 3(%eax) +; SDAG-ISEL-NEXT: retl +; +; Fast-ISEL-LABEL: test_float_abs: +; Fast-ISEL: # %bb.0: +; Fast-ISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; Fast-ISEL-NEXT: andb $127, 3(%eax) +; Fast-ISEL-NEXT: retl +; +; GISEL-ISEL-LABEL: test_float_abs: +; GISEL-ISEL: # %bb.0: +; GISEL-ISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-ISEL-NEXT: 
andl $2147483647, (%eax) # imm = 0x7FFFFFFF +; GISEL-ISEL-NEXT: retl + %arg = load float, float* %argptr + %abs = tail call float @llvm.fabs.f32(float %arg) + store float %abs, ptr %argptr + ret void + } + +define void @test_double_abs(ptr %argptr) { +; X86-LABEL: test_double_abs: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: fldl (%eax) +; X86-NEXT: fabs +; X86-NEXT: fstpl (%eax) +; X86-NEXT: retl + %arg = load double, double* %argptr + %abs = tail call double @llvm.fabs.f64(double %arg) + store double %abs, double* %argptr + ret void +} define x86_fp80 @test_x86_fp80_abs(x86_fp80 %arg) { ; X64-LABEL: test_x86_fp80_abs: diff --git a/llvm/test/CodeGen/X86/isel-fabs.ll b/llvm/test/CodeGen/X86/isel-fabs.ll index 10bd5799280ad..c2d29248e49ba 100644 --- a/llvm/test/CodeGen/X86/isel-fabs.ll +++ b/llvm/test/CodeGen/X86/isel-fabs.ll @@ -1,37 +1,61 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87 | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -mtriple=i686-- -mattr=-x87 | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=i686-- -mattr=-x87 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=FASTISEL-X86 +; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,+sse,+sse2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,+sse,+sse2 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,+sse,+sse2 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-- -mattr=-x87,+sse,+sse2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-- -mattr=-x87,+sse,+sse2 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-- 
-mattr=-x87,+sse,+sse2 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 - -define float @test_float_abs(float %arg) { +define float @test_float_abs(float %arg) nounwind { ; X64-LABEL: test_float_abs: ; X64: # %bb.0: ; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq ; +; GISEL-X64-LABEL: test_float_abs: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movd %xmm0, %eax +; GISEL-X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; GISEL-X64-NEXT: movd %eax, %xmm0 +; GISEL-X64-NEXT: retq +; ; X86-LABEL: test_float_abs: ; X86: # %bb.0: -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: retl ; ; FASTISEL-X86-LABEL: test_float_abs: ; FASTISEL-X86: # %bb.0: -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; FASTISEL-X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FASTISEL-X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; FASTISEL-X86-NEXT: movd %xmm0, %eax ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: test_float_abs: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; GISEL-X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: retl %abs = tail call float @llvm.fabs.f32(float %arg) ret float %abs } -define double @test_double_abs(double %arg) { +define double @test_double_abs(double %arg) nounwind { ; X64-LABEL: test_double_abs: ; X64: # %bb.0: ; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq ; +; GISEL-X64-LABEL: test_double_abs: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; GISEL-X64-NEXT: movq %xmm0, %rcx +; GISEL-X64-NEXT: andq %rax, %rcx +; GISEL-X64-NEXT: movq %rcx, %xmm0 +; GISEL-X64-NEXT: retq +; ; X86-LABEL: test_double_abs: ; X86: # 
%bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -41,10 +65,26 @@ define double @test_double_abs(double %arg) { ; ; FASTISEL-X86-LABEL: test_double_abs: ; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: pushl %ebp +; FASTISEL-X86-NEXT: movl %esp, %ebp +; FASTISEL-X86-NEXT: andl $-8, %esp +; FASTISEL-X86-NEXT: subl $8, %esp +; FASTISEL-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; FASTISEL-X86-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; FASTISEL-X86-NEXT: movlps %xmm0, (%esp) +; FASTISEL-X86-NEXT: movl (%esp), %eax ; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; FASTISEL-X86-NEXT: andl $2147483647, %edx # imm = 0x7FFFFFFF +; FASTISEL-X86-NEXT: movl %ebp, %esp +; FASTISEL-X86-NEXT: popl %ebp ; FASTISEL-X86-NEXT: retl +; +; GISEL-X86-LABEL: test_double_abs: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movl $-1, %eax +; GISEL-X86-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF +; GISEL-X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: retl %abs = tail call double @llvm.fabs.f64(double %arg) ret double %abs } diff --git a/llvm/test/CodeGen/X86/llvm.sincos.ll b/llvm/test/CodeGen/X86/llvm.sincos.ll index 5734729a2c507..065710f91457b 100644 --- a/llvm/test/CodeGen/X86/llvm.sincos.ll +++ b/llvm/test/CodeGen/X86/llvm.sincos.ll @@ -3,8 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64 +; TODO: The below RUN line will fails GISEL selection and 
will fallback to DAG selection due to lack of support for loads/stores in i686 mode, support is expected soon enough, for this reason the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test is added for now because of the lack of support for i686 in GlobalISel. +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 define { float, float } @test_sincos_f32(float %Val) nounwind { ; X86-LABEL: test_sincos_f32: @@ -32,6 +33,35 @@ define { float, float } @test_sincos_f32(float %Val) nounwind { ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: popq %rax ; X64-NEXT: retq +; +; GISEL-X86-LABEL: test_sincos_f32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $28, %esp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: movl %eax, (%esp) +; GISEL-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: calll sincosf +; GISEL-X86-NEXT: flds {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: flds {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fxch %st(1) +; GISEL-X86-NEXT: addl $28, %esp +; GISEL-X86-NEXT: retl +; +; GISEL-X64-LABEL: test_sincos_f32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; GISEL-X64-NEXT: movq %rsp, %rsi +; GISEL-X64-NEXT: callq sincosf +; GISEL-X64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; GISEL-X64-NEXT: movl (%rsp), %ecx +; GISEL-X64-NEXT: movd %eax, %xmm0 +; GISEL-X64-NEXT: movd %ecx, %xmm1 +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq %res = call { float, float } @llvm.sincos.f32(float %Val) ret { float, float } %res } @@ -62,6 +92,34 @@ define { double, double } @test_sincos_f64(double %Val) nounwind { ; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: 
addq $24, %rsp ; X64-NEXT: retq +; +; GISEL-X86-LABEL: test_sincos_f64: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $44, %esp +; GISEL-X86-NEXT: fldl {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstpl (%esp) +; GISEL-X86-NEXT: calll sincos +; GISEL-X86-NEXT: fldl {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fldl {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: addl $44, %esp +; GISEL-X86-NEXT: retl +; +; GISEL-X64-LABEL: test_sincos_f64: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: subq $24, %rsp +; GISEL-X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; GISEL-X64-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; GISEL-X64-NEXT: callq sincos +; GISEL-X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; GISEL-X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; GISEL-X64-NEXT: movq %rax, %xmm0 +; GISEL-X64-NEXT: movq %rcx, %xmm1 +; GISEL-X64-NEXT: addq $24, %rsp +; GISEL-X64-NEXT: retq %res = call { double, double } @llvm.sincos.f64(double %Val) ret { double, double } %res } @@ -94,6 +152,36 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind { ; X64-NEXT: fldt {{[0-9]+}}(%rsp) ; X64-NEXT: addq $56, %rsp ; X64-NEXT: retq +; +; GISEL-X86-LABEL: test_sincos_f80: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $60, %esp +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: fstpt (%esp) +; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: calll sincosl +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fxch %st(1) +; GISEL-X86-NEXT: addl $60, %esp +; GISEL-X86-NEXT: retl +; +; GISEL-X64-LABEL: test_sincos_f80: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: subq $56, %rsp +; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: leaq 
{{[0-9]+}}(%rsp), %rdi +; GISEL-X64-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; GISEL-X64-NEXT: fstpt (%rsp) +; GISEL-X64-NEXT: callq sincosl +; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fxch %st(1) +; GISEL-X64-NEXT: addq $56, %rsp +; GISEL-X64-NEXT: retq %res = call { x86_fp80, x86_fp80 } @llvm.sincos.f80(x86_fp80 %Val) ret { x86_fp80, x86_fp80 } %res } diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll index c7da04171e6a1..756019d0e98a0 100644 --- a/llvm/test/CodeGen/X86/shift-i512.ll +++ b/llvm/test/CodeGen/X86/shift-i512.ll @@ -48,46 +48,20 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; ; ZNVER4-LABEL: shl_i512_1: ; ZNVER4: # %bb.0: -; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; ZNVER4-NEXT: vmovq %xmm0, %rdx -; ZNVER4-NEXT: vpextrq $1, %xmm0, %r9 -; ZNVER4-NEXT: vpextrq $1, %xmm1, %rax -; ZNVER4-NEXT: vmovq %xmm1, %rcx ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; ZNVER4-NEXT: shrq $63, %rdx -; ZNVER4-NEXT: vpextrq $1, %xmm1, %rsi -; ZNVER4-NEXT: vmovq %xmm1, %rdi -; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1 -; ZNVER4-NEXT: leaq (%rdx,%r9,2), %rdx -; ZNVER4-NEXT: shrq $63, %r9 -; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm0 -; ZNVER4-NEXT: vmovq %xmm1, %r10 -; ZNVER4-NEXT: vpextrq $1, %xmm1, %r8 -; ZNVER4-NEXT: leaq (%r9,%r10,2), %r9 -; ZNVER4-NEXT: shrq $63, %r10 -; ZNVER4-NEXT: vmovq %rdx, %xmm4 -; ZNVER4-NEXT: leaq (%r10,%r8,2), %r10 -; ZNVER4-NEXT: shrq $63, %r8 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; ZNVER4-NEXT: leaq (%r8,%rdi,2), %r8 -; ZNVER4-NEXT: shrq $63, %rdi -; ZNVER4-NEXT: leaq (%rdi,%rsi,2), %rdi -; ZNVER4-NEXT: shrq $63, %rsi -; ZNVER4-NEXT: leaq (%rsi,%rcx,2), %rsi -; ZNVER4-NEXT: shrq $63, %rcx -; ZNVER4-NEXT: vmovq %r8, %xmm3 -; ZNVER4-NEXT: leaq (%rcx,%rax,2), %rax -; ZNVER4-NEXT: vmovq %rsi, %xmm2 -; ZNVER4-NEXT: vmovq %rax, %xmm1 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ZNVER4-NEXT: vmovq %rdi, %xmm2 -; 
ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; ZNVER4-NEXT: vmovq %r10, %xmm3 +; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2 +; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm4 ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; ZNVER4-NEXT: vmovq %r9, %xmm2 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 +; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ZNVER4-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; ZNVER4-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1 +; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; ZNVER4-NEXT: vpshldq $1, %zmm0, %zmm3, %zmm0 +; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] ; ZNVER4-NEXT: retq %d = bitcast <8 x i64> %a to i512 %s = shl i512 %d, 1 @@ -142,65 +116,21 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) { ; ; ZNVER4-LABEL: lshr_i512_1: ; ZNVER4: # %bb.0: -; ZNVER4-NEXT: pushq %rbx -; ZNVER4-NEXT: .cfi_def_cfa_offset 16 -; ZNVER4-NEXT: .cfi_offset %rbx, -16 +; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1 -; ZNVER4-NEXT: vmovq %xmm0, %r10 -; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi -; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx -; ZNVER4-NEXT: vmovq %xmm1, %r9 -; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; ZNVER4-NEXT: shrq %r10 -; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax -; ZNVER4-NEXT: vmovq %xmm0, %rdx -; ZNVER4-NEXT: vmovq %xmm1, %rdi -; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; ZNVER4-NEXT: movq %rdx, %r8 -; ZNVER4-NEXT: shrq %r8 -; ZNVER4-NEXT: shlq $63, %rax -; ZNVER4-NEXT: movq 
%rdi, %rbx -; ZNVER4-NEXT: shrq %rbx -; ZNVER4-NEXT: shlq $63, %rdx -; ZNVER4-NEXT: shlq $63, %rdi -; ZNVER4-NEXT: vpsrlq $1, %xmm0, %xmm0 -; ZNVER4-NEXT: orq %r8, %rax -; ZNVER4-NEXT: movq %r11, %r8 -; ZNVER4-NEXT: shlq $63, %r8 -; ZNVER4-NEXT: shrq %r11 -; ZNVER4-NEXT: orq %rbx, %r8 -; ZNVER4-NEXT: movq %r9, %rbx -; ZNVER4-NEXT: orq %r11, %rdx -; ZNVER4-NEXT: movq %rsi, %r11 -; ZNVER4-NEXT: shrq %r11 -; ZNVER4-NEXT: shlq $63, %rbx -; ZNVER4-NEXT: shrq %r9 -; ZNVER4-NEXT: shlq $63, %rsi -; ZNVER4-NEXT: vmovq %rax, %xmm4 -; ZNVER4-NEXT: orq %r11, %rbx -; ZNVER4-NEXT: movq %rcx, %r11 -; ZNVER4-NEXT: shlq $63, %r11 -; ZNVER4-NEXT: shrq %rcx -; ZNVER4-NEXT: orq %r10, %rsi -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; ZNVER4-NEXT: orq %r9, %r11 -; ZNVER4-NEXT: orq %rdi, %rcx -; ZNVER4-NEXT: vmovq %rbx, %xmm3 -; ZNVER4-NEXT: vmovq %rcx, %xmm1 -; ZNVER4-NEXT: vmovq %r11, %xmm2 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ZNVER4-NEXT: vmovq %rsi, %xmm2 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; ZNVER4-NEXT: vmovq %r8, %xmm3 -; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; ZNVER4-NEXT: vmovq %rdx, %xmm2 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; ZNVER4-NEXT: popq %rbx -; ZNVER4-NEXT: .cfi_def_cfa_offset 8 +; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] +; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 +; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 +; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; ZNVER4-NEXT: vpsrlq $1, %xmm2, %xmm2 +; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0 +; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; 
ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; ZNVER4-NEXT: retq %d = bitcast <8 x i64> %a to i512 %s = lshr i512 %d, 1 @@ -255,65 +185,21 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) { ; ; ZNVER4-LABEL: ashr_i512_1: ; ZNVER4: # %bb.0: -; ZNVER4-NEXT: pushq %rbx -; ZNVER4-NEXT: .cfi_def_cfa_offset 16 -; ZNVER4-NEXT: .cfi_offset %rbx, -16 +; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1 -; ZNVER4-NEXT: vmovq %xmm0, %r10 -; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi -; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx -; ZNVER4-NEXT: vmovq %xmm1, %r9 -; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; ZNVER4-NEXT: shrq %r10 -; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax -; ZNVER4-NEXT: vmovq %xmm0, %rdx -; ZNVER4-NEXT: vmovq %xmm1, %rdi -; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; ZNVER4-NEXT: movq %rdx, %r8 -; ZNVER4-NEXT: shrq %r8 -; ZNVER4-NEXT: shlq $63, %rax -; ZNVER4-NEXT: movq %rdi, %rbx -; ZNVER4-NEXT: shrq %rbx -; ZNVER4-NEXT: shlq $63, %rdx -; ZNVER4-NEXT: shlq $63, %rdi -; ZNVER4-NEXT: vpsraq $1, %xmm0, %xmm0 -; ZNVER4-NEXT: orq %r8, %rax -; ZNVER4-NEXT: movq %r11, %r8 -; ZNVER4-NEXT: shlq $63, %r8 -; ZNVER4-NEXT: shrq %r11 -; ZNVER4-NEXT: orq %rbx, %r8 -; ZNVER4-NEXT: movq %r9, %rbx -; ZNVER4-NEXT: orq %r11, %rdx -; ZNVER4-NEXT: movq %rsi, %r11 -; ZNVER4-NEXT: shrq %r11 -; ZNVER4-NEXT: shlq $63, %rbx -; ZNVER4-NEXT: shrq %r9 -; ZNVER4-NEXT: shlq $63, %rsi -; ZNVER4-NEXT: vmovq %rax, %xmm4 -; ZNVER4-NEXT: orq %r11, %rbx -; ZNVER4-NEXT: movq %rcx, %r11 -; ZNVER4-NEXT: shlq $63, %r11 -; ZNVER4-NEXT: shrq %rcx -; ZNVER4-NEXT: orq %r10, %rsi -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; ZNVER4-NEXT: orq %r9, %r11 -; ZNVER4-NEXT: orq %rdi, %rcx -; ZNVER4-NEXT: vmovq %rbx, %xmm3 -; ZNVER4-NEXT: vmovq %rcx, %xmm1 -; 
ZNVER4-NEXT: vmovq %r11, %xmm2 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ZNVER4-NEXT: vmovq %rsi, %xmm2 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; ZNVER4-NEXT: vmovq %r8, %xmm3 -; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; ZNVER4-NEXT: vmovq %rdx, %xmm2 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; ZNVER4-NEXT: popq %rbx -; ZNVER4-NEXT: .cfi_def_cfa_offset 8 +; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] +; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 +; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 +; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; ZNVER4-NEXT: vpsraq $1, %xmm2, %xmm2 +; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0 +; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; ZNVER4-NEXT: retq %d = bitcast <8 x i64> %a to i512 %s = ashr i512 %d, 1 diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index 7296cc27894c3..283c6a303a581 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -2,12 +2,12 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s 
--check-prefixes=INT256,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=INT256,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=INT256,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=INT256,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLF +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLF ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLBW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512VL,VLVBMI @@ -393,17 +393,17 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq ; -; AVX512VLDQ-LABEL: var_shuffle_v16i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # 
[514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] -; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm3 ^ ymm2)) -; AVX512VLDQ-NEXT: retq +; AVX512VLF-LABEL: var_shuffle_v16i16: +; AVX512VLF: # %bb.0: +; AVX512VLF-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] +; AVX512VLF-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512VLF-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512VLF-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm3 ^ ymm2)) +; AVX512VLF-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v16i16: ; AVX512VLBW: # %bb.0: @@ -533,21 +533,57 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VLDQ-LABEL: var_shuffle_zero_v16i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm2 -; AVX512VLDQ-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] -; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[2,3,2,3] -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) -; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0 -; AVX512VLDQ-NEXT: retq +; AVX512F-LABEL: var_shuffle_zero_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] +; AVX512F-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX512F-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: var_shuffle_zero_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] +; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, 
%ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLF-LABEL: var_shuffle_zero_v16i16: +; AVX512VLF: # %bb.0: +; AVX512VLF-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512VLF-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm2 +; AVX512VLF-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VLF-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] +; AVX512VLF-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512VLF-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512VLF-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) +; AVX512VLF-NEXT: vpandn %ymm1, %ymm2, %ymm0 +; AVX512VLF-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_zero_v16i16: ; AVX512VLBW: # %bb.0: @@ -668,15 +704,15 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq ; -; AVX512VLDQ-LABEL: var_shuffle_v32i8: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3)) -; AVX512VLDQ-NEXT: retq +; AVX512VLF-LABEL: var_shuffle_v32i8: +; AVX512VLF: # %bb.0: +; AVX512VLF-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VLF-NEXT: vpshufb 
%ymm1, %ymm0, %ymm3 +; AVX512VLF-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3)) +; AVX512VLF-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v32i8: ; AVX512VLBW: # %bb.0: @@ -847,19 +883,51 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VLDQ-LABEL: var_shuffle_zero_v32i8: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm2 -; AVX512VLDQ-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) -; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0 -; AVX512VLDQ-NEXT: retq +; AVX512F-LABEL: var_shuffle_zero_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX512F-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: var_shuffle_zero_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512BW-NEXT: vpcmpnleub 
%zmm2, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLF-LABEL: var_shuffle_zero_v32i8: +; AVX512VLF: # %bb.0: +; AVX512VLF-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512VLF-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm2 +; AVX512VLF-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512VLF-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512VLF-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) +; AVX512VLF-NEXT: vpandn %ymm1, %ymm2, %ymm0 +; AVX512VLF-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_zero_v32i8: ; AVX512VLBW: # %bb.0: @@ -1493,17 +1561,17 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq ; -; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] -; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512VLDQ-NEXT: 
vpshufb %ymm1, %ymm0, %ymm3 -; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm3 ^ ymm2)) -; AVX512VLDQ-NEXT: retq +; AVX512VLF-LABEL: var_shuffle_v16i16_from_v8i16: +; AVX512VLF: # %bb.0: +; AVX512VLF-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLF-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] +; AVX512VLF-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512VLF-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm3 ^ ymm2)) +; AVX512VLF-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16: ; AVX512VLBW: # %bb.0: @@ -1611,15 +1679,15 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq ; -; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3)) -; AVX512VLDQ-NEXT: retq +; AVX512VLF-LABEL: var_shuffle_v32i8_from_v16i8: +; AVX512VLF: # %bb.0: +; AVX512VLF-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512VLF-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ 
(ymm0 & (ymm2 ^ ymm3)) +; AVX512VLF-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8: ; AVX512VLBW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 62ab5d82bfbb6..910dd1ee6c419 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2099,21 +2099,19 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 ; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: xorps %xmm3, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3] -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: addps %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_4i64_to_4f32_undef: diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll index 58f6a66aeff79..c5e879c0135f4 100644 --- a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll @@ -12,12 +12,12 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s 
--check-prefixes=BMI ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=BMI ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=BMI -; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2-FAST +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2-FAST +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2-FAST ; Verify that for the X86_64 processors that are known to have poor latency ; double precision shift instructions we do not generate 'shld' or 'shrd' @@ -53,15 +53,23 @@ define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone { ; BMI-NEXT: orq %rdi, %rax ; BMI-NEXT: retq ; -; BMI2-LABEL: lshift: -; BMI2: # %bb.0: # %entry -; BMI2-NEXT: # kill: def $edx killed $edx def $rdx -; BMI2-NEXT: shlxq %rdx, %rdi, %rcx -; BMI2-NEXT: notb %dl -; BMI2-NEXT: shrq %rsi -; BMI2-NEXT: shrxq %rdx, %rsi, %rax -; BMI2-NEXT: orq %rcx, %rax -; BMI2-NEXT: retq +; BMI2-SLOW-LABEL: lshift: +; BMI2-SLOW: # %bb.0: # %entry +; BMI2-SLOW-NEXT: # kill: def $edx killed $edx def $rdx +; BMI2-SLOW-NEXT: shlxq %rdx, %rdi, %rcx +; BMI2-SLOW-NEXT: notb %dl +; BMI2-SLOW-NEXT: shrq 
%rsi +; BMI2-SLOW-NEXT: shrxq %rdx, %rsi, %rax +; BMI2-SLOW-NEXT: orq %rcx, %rax +; BMI2-SLOW-NEXT: retq +; +; BMI2-FAST-LABEL: lshift: +; BMI2-FAST: # %bb.0: # %entry +; BMI2-FAST-NEXT: movl %edx, %ecx +; BMI2-FAST-NEXT: movq %rdi, %rax +; BMI2-FAST-NEXT: # kill: def $cl killed $cl killed $ecx +; BMI2-FAST-NEXT: shldq %cl, %rsi, %rax +; BMI2-FAST-NEXT: retq entry: %sh_prom = zext i32 %c to i64 %shl = shl i64 %a, %sh_prom @@ -100,15 +108,23 @@ define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone { ; BMI-NEXT: orq %rdi, %rax ; BMI-NEXT: retq ; -; BMI2-LABEL: rshift: -; BMI2: # %bb.0: # %entry -; BMI2-NEXT: # kill: def $edx killed $edx def $rdx -; BMI2-NEXT: shrxq %rdx, %rdi, %rcx -; BMI2-NEXT: notb %dl -; BMI2-NEXT: addq %rsi, %rsi -; BMI2-NEXT: shlxq %rdx, %rsi, %rax -; BMI2-NEXT: orq %rcx, %rax -; BMI2-NEXT: retq +; BMI2-SLOW-LABEL: rshift: +; BMI2-SLOW: # %bb.0: # %entry +; BMI2-SLOW-NEXT: # kill: def $edx killed $edx def $rdx +; BMI2-SLOW-NEXT: shrxq %rdx, %rdi, %rcx +; BMI2-SLOW-NEXT: notb %dl +; BMI2-SLOW-NEXT: addq %rsi, %rsi +; BMI2-SLOW-NEXT: shlxq %rdx, %rsi, %rax +; BMI2-SLOW-NEXT: orq %rcx, %rax +; BMI2-SLOW-NEXT: retq +; +; BMI2-FAST-LABEL: rshift: +; BMI2-FAST: # %bb.0: # %entry +; BMI2-FAST-NEXT: movl %edx, %ecx +; BMI2-FAST-NEXT: movq %rdi, %rax +; BMI2-FAST-NEXT: # kill: def $cl killed $cl killed $ecx +; BMI2-FAST-NEXT: shrdq %cl, %rsi, %rax +; BMI2-FAST-NEXT: retq entry: %sh_prom = zext i32 %c to i64 %shr = lshr i64 %a, %sh_prom diff --git a/llvm/test/DebugInfo/dynamic-bitfield.ll b/llvm/test/DebugInfo/dynamic-bitfield.ll new file mode 100644 index 0000000000000..1a5ed81774538 --- /dev/null +++ b/llvm/test/DebugInfo/dynamic-bitfield.ll @@ -0,0 +1,62 @@ +; RUN: llc -O0 -filetype=obj -o - %s | llvm-dwarfdump -v -debug-info - | FileCheck %s + +; A basic test of using a DIExpression for DW_AT_data_bit_offset and +; DW_AT_bit_size. 
+ +source_filename = "bitfield.c" + +%struct.PackedBits = type <{ i8, i32 }> + +@s = common global %struct.PackedBits zeroinitializer, align 1, !dbg !2 +@value = common global i32 zeroinitializer, align 4, !dbg !0 + +!llvm.dbg.cu = !{!4} +!llvm.module.flags = !{!17, !18, !19} +!llvm.ident = !{!20} + +!0 = distinct !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = !DIGlobalVariable(name: "value", scope: !4, file: !5, line: 8, type: !15, isLocal: false, isDefinition: true) +!2 = distinct !DIGlobalVariableExpression(var: !3, expr: !DIExpression()) +!3 = !DIGlobalVariable(name: "s", scope: !4, file: !5, line: 8, type: !8, isLocal: false, isDefinition: true) + + +!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !5, producer: "clang version 3.9.0 (trunk 267633)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !6, globals: !7) +!5 = !DIFile(filename: "bitfield.c", directory: "/Volumes/Data/llvm") +!6 = !{} +!7 = !{!0, !2} +!8 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "PackedBits", file: !5, line: 3, size: 40, elements: !9) +!9 = !{!10, !12, !16} +!10 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !8, file: !5, line: 5, baseType: !11, size: 8) +; CHECK: DW_TAG_member +; CHECK-NEXT: DW_AT_name{{.*}}"a" +; CHECK-NOT: DW_TAG +; CHECK-NOT: DW_AT_bit_offset +; CHECK-NOT: DW_AT_data_bit_offset +; CHECK: DW_AT_data_member_location [DW_FORM_data1] (0x00) +!11 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !8, file: !5, line: 6, baseType: !13, size: !3, offset: !3, flags: DIFlagBitField) +!13 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint32_t", file: !14, line: 183, baseType: !15) +!14 = !DIFile(filename: "/Volumes/Data/llvm/_build.ninja.release/bin/../lib/clang/3.9.0/include/stdint.h", directory: "/Volumes/Data/llvm") +!15 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +; CHECK: DW_TAG_member +; 
CHECK-NEXT: DW_AT_name{{.*}}"b" +; CHECK-NOT: DW_TAG +; CHECK-NOT: DW_AT_bit_offset +; CHECK-NOT: DW_AT_byte_size +; CHECK: DW_AT_bit_size [DW_FORM_ref4] ({{.*}}) +; CHECK-NEXT: DW_AT_data_bit_offset [DW_FORM_ref4] ({{.*}}) +; CHECK-NOT: DW_AT_data_member_location +!16 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !8, file: !5, line: 7, baseType: !13, size: !DIExpression(DW_OP_constu, 27), offset: !DIExpression(DW_OP_constu, 13), flags: DIFlagBitField) +!17 = !{i32 2, !"Dwarf Version", i32 4} +!18 = !{i32 2, !"Debug Info Version", i32 3} +!19 = !{i32 1, !"PIC Level", i32 2} +; CHECK: DW_TAG_member +; CHECK-NEXT: DW_AT_name{{.*}}"c" +; CHECK-NOT: DW_TAG +; CHECK-NOT: DW_AT_bit_offset +; CHECK-NOT: DW_AT_byte_size +; CHECK: DW_AT_bit_size [DW_FORM_exprloc] (DW_OP_lit27) +; CHECK-NEXT: DW_AT_data_bit_offset [DW_FORM_exprloc] (DW_OP_lit13) +; CHECK-NOT: DW_AT_data_member_location +; CHECK: DW_TAG +!20 = !{!"clang version 3.9.0 (trunk 267633)"} diff --git a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll index 6c1acf6d13775..2006a6db2ef40 100644 --- a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll +++ b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll @@ -205,7 +205,7 @@ define void @f1(i64 %x) nounwind { ; TR-GUARD: attributes #[[ATTR3]] = { nomerge noreturn nounwind } ;. ; RT-GUARD: attributes #[[ATTR0]] = { nounwind } -; RT-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; RT-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } ; RT-GUARD: attributes #[[ATTR2]] = { nomerge nounwind } ;. 
; TR: [[META0]] = !{} diff --git a/llvm/test/Instrumentation/MemorySanitizer/count-zeroes.ll b/llvm/test/Instrumentation/MemorySanitizer/count-zeroes.ll index 73e047e68ddc6..c51dc1a373629 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/count-zeroes.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/count-zeroes.ll @@ -9,10 +9,14 @@ define i64 @test_ctlz_i64_zeropoison(i64 %v) #0 { ; CHECK-LABEL: @test_ctlz_i64_zeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq i64 [[V:%.*]], 0 -; CHECK-NEXT: [[_MSCZ_BS1:%.*]] = or i1 [[_MSCZ_BS]], [[_MSCZ_BZP]] -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_BS1]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and i1 [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq i64 [[V]], 0 +; CHECK-NEXT: [[_MSCZ_BS:%.*]] = or i1 [[_MSCZ_MAIN]], [[_MSCZ_BZP]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_BS]] to i64 ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V]], i1 true) ; CHECK-NEXT: store i64 [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[RES]] @@ -24,9 +28,13 @@ define i64 @test_ctlz_i64_nozeropoison(i64 %v) #0 { ; CHECK-LABEL: @test_ctlz_i64_nozeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_BS]] to i64 -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] 
= call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and i1 [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_MAIN]] to i64 +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V]], i1 false) ; CHECK-NEXT: store i64 [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[RES]] ; @@ -39,10 +47,14 @@ define <2 x i64> @test_ctlz_v2i64_zeropoison(<2 x i64> %v) #0 { ; CHECK-LABEL: @test_ctlz_v2i64_zeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq <2 x i64> [[V:%.*]], zeroinitializer -; CHECK-NEXT: [[_MSCZ_BS1:%.*]] = or <2 x i1> [[_MSCZ_BS]], [[_MSCZ_BZP]] -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x i1> [[_MSCZ_BS1]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge <2 x i64> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and <2 x i1> [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq <2 x i64> [[V]], zeroinitializer +; CHECK-NEXT: [[_MSCZ_BS:%.*]] = or <2 x i1> [[_MSCZ_MAIN]], [[_MSCZ_BZP]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x i1> [[_MSCZ_BS]] to <2 x i64> ; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[V]], i1 true) ; CHECK-NEXT: store <2 x i64> [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <2 x i64> [[RES]] @@ -54,9 +66,13 @@ define <2 x i64> @test_ctlz_v2i64_nozeropoison(<2 x i64> %v) #0 { ; CHECK-LABEL: @test_ctlz_v2i64_nozeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x i1> [[_MSCZ_BS]] to <2 x i64> -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge <2 x i64> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and <2 x i1> [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x i1> [[_MSCZ_MAIN]] to <2 x i64> +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[V]], i1 false) ; CHECK-NEXT: store <2 x i64> [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; @@ -69,10 +85,14 @@ define i64 @test_cttz_i64_zeropoison(i64 %v) #0 { ; CHECK-LABEL: @test_cttz_i64_zeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq i64 [[V:%.*]], 0 -; CHECK-NEXT: [[_MSCZ_BS1:%.*]] = or i1 [[_MSCZ_BS]], [[_MSCZ_BZP]] -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_BS1]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.cttz.i64(i64 [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: 
[[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and i1 [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq i64 [[V]], 0 +; CHECK-NEXT: [[_MSCZ_BS:%.*]] = or i1 [[_MSCZ_MAIN]], [[_MSCZ_BZP]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_BS]] to i64 ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.cttz.i64(i64 [[V]], i1 true) ; CHECK-NEXT: store i64 [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[RES]] @@ -84,9 +104,13 @@ define i64 @test_cttz_i64_nozeropoison(i64 %v) #0 { ; CHECK-LABEL: @test_cttz_i64_nozeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_BS]] to i64 -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.cttz.i64(i64 [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.cttz.i64(i64 [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and i1 [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext i1 [[_MSCZ_MAIN]] to i64 +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.cttz.i64(i64 [[V]], i1 false) ; CHECK-NEXT: store i64 [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[RES]] ; @@ -99,10 +123,14 @@ define <2 x i64> @test_cttz_v2i64_zeropoison(<2 x i64> %v) #0 { ; CHECK-LABEL: @test_cttz_v2i64_zeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq <2 x i64> [[V:%.*]], zeroinitializer -; CHECK-NEXT: 
[[_MSCZ_BS1:%.*]] = or <2 x i1> [[_MSCZ_BS]], [[_MSCZ_BZP]] -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x i1> [[_MSCZ_BS1]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge <2 x i64> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and <2 x i1> [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_BZP:%.*]] = icmp eq <2 x i64> [[V]], zeroinitializer +; CHECK-NEXT: [[_MSCZ_BS:%.*]] = or <2 x i1> [[_MSCZ_MAIN]], [[_MSCZ_BZP]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x i1> [[_MSCZ_BS]] to <2 x i64> ; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[V]], i1 true) ; CHECK-NEXT: store <2 x i64> [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] @@ -114,9 +142,13 @@ define <2 x i64> @test_cttz_v2i64_nozeropoison(<2 x i64> %v) #0 { ; CHECK-LABEL: @test_cttz_v2i64_nozeropoison( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCZ_BS:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x i1> [[_MSCZ_BS]] to <2 x i64> -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP1]], i1 false) +; CHECK-NEXT: [[_MSCZ_CMP_ZEROS:%.*]] = icmp uge <2 x i64> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSCZ_SHADOW_NOT_NULL:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[_MSCZ_MAIN:%.*]] = and <2 x i1> [[_MSCZ_CMP_ZEROS]], [[_MSCZ_SHADOW_NOT_NULL]] +; CHECK-NEXT: [[_MSCZ_OS:%.*]] = sext <2 x 
i1> [[_MSCZ_MAIN]] to <2 x i64> +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[V]], i1 false) ; CHECK-NEXT: store <2 x i64> [[_MSCZ_OS]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_unsupported.s b/llvm/test/MC/AMDGPU/gfx1250_asm_unsupported.s index c2785553030fc..2111e91cd5ef2 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_unsupported.s @@ -1,5 +1,99 @@ ; RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s +;; DOT4_F32_*, DOT2_F32_*, DOT2_F16 and DOT2_BF16 + +v_dot4_f32_fp8_fp8 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_fp8_fp8 v0, v1, v2, v3 row_mirror +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_fp8_fp8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_fp8_bf8 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_fp8_bf8 v0, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_fp8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_bf8_fp8 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_bf8_fp8 v0, v1, v2, v3 row_shl:15 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_bf8_fp8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_bf8_bf8 v0, v1, v2, v3 +// GFX1250-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_bf8_bf8 v0, v1, v2, v3 row_share:15 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot4_f32_bf8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_f16_f16 v5, v1, v2, s3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_bf16_bf16 v5, v1, v2, s3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_f32_bf16 v5, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_dot2_f32_f16 v5, v1, v2, s3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +;; LDS-direct and parameter-load, VINTERP + +ds_direct_load v1 wait_va_vdst:15 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_param_load v1, attr0.x wait_va_vdst:15 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_direct_load v1 wait_va_vdst:15 wait_vm_vsrc:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_param_load v1, attr0.x wait_va_vdst:15 
wait_vm_vsrc:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_interp_p10_f32 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_interp_p2_f32 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_interp_p10_f16_f32 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_interp_p2_f16_f32 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + ;; Export, S_WAIT_EXPCNT and S_WAIT_EVENT export mrt0 off, off, off, off diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s new file mode 100644 index 0000000000000..e62eb6fbb723c --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -0,0 +1,65 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s + +v_cvt_f32_bf16 v5, v1 +// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] + +v_cvt_f32_bf16 v5, v127 +// GFX1250: v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xe5,0x0a,0x7e] + +v_cvt_f32_bf16 v5, s1 +// GFX1250: v_cvt_f32_bf16_e32 v5, s1 ; encoding: [0x01,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, s105 +// GFX1250: v_cvt_f32_bf16_e32 v5, s105 ; encoding: [0x69,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, vcc_lo +// GFX1250: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, vcc_hi +// GFX1250: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: 
[0x6b,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, ttmp15 +// GFX1250: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, m0 +// GFX1250: v_cvt_f32_bf16_e32 v5, m0 ; encoding: [0x7d,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, exec_lo +// GFX1250: v_cvt_f32_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, exec_hi +// GFX1250: v_cvt_f32_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, null +// GFX1250: v_cvt_f32_bf16_e32 v5, null ; encoding: [0x7c,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, -1 +// GFX1250: v_cvt_f32_bf16_e32 v5, -1 ; encoding: [0xc1,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, 0.5 +// GFX1250: v_cvt_f32_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, src_scc +// GFX1250: v_cvt_f32_bf16_e32 v5, src_scc ; encoding: [0xfd,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v127, 0x8000 +// GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, v2 +// GFX1250: v_cvt_pk_f16_bf8 v1, v2 ; encoding: [0x02,0xed,0x02,0x7e] + +v_cvt_pk_f16_bf8 v1, s2 +// GFX1250: v_cvt_pk_f16_bf8 v1, s2 ; encoding: [0x02,0xec,0x02,0x7e] + +v_cvt_pk_f16_bf8 v1, 100 +// GFX1250: v_cvt_pk_f16_bf8 v1, 0x64 ; encoding: [0xff,0xec,0x02,0x7e,0x64,0x00,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, v2 +// GFX1250: v_cvt_pk_f16_fp8 v1, v2 ; encoding: [0x02,0xeb,0x02,0x7e] + +v_cvt_pk_f16_fp8 v1, s2 +// GFX1250: v_cvt_pk_f16_fp8 v1, s2 ; encoding: [0x02,0xea,0x02,0x7e] + +v_cvt_pk_f16_fp8 v1, 100 +// GFX1250: v_cvt_pk_f16_fp8 v1, 0x64 ; encoding: [0xff,0xea,0x02,0x7e,0x64,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s new file mode 100644 index 0000000000000..37f39546ae13d --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 
-mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s + +v_cvt_f32_bf16 v5, v1 +// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] + +v_cvt_f32_bf16 v5, v127 +// GFX1250: v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xe5,0x0a,0x7e] + +v_cvt_f32_bf16 v5, s1 +// GFX1250: v_cvt_f32_bf16_e32 v5, s1 ; encoding: [0x01,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, s105 +// GFX1250: v_cvt_f32_bf16_e32 v5, s105 ; encoding: [0x69,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, vcc_lo +// GFX1250: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, vcc_hi +// GFX1250: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, ttmp15 +// GFX1250: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, m0 +// GFX1250: v_cvt_f32_bf16_e32 v5, m0 ; encoding: [0x7d,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, exec_lo +// GFX1250: v_cvt_f32_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, exec_hi +// GFX1250: v_cvt_f32_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, null +// GFX1250: v_cvt_f32_bf16_e32 v5, null ; encoding: [0x7c,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, -1 +// GFX1250: v_cvt_f32_bf16_e32 v5, -1 ; encoding: [0xc1,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, 0.5 +// GFX1250: v_cvt_f32_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v5, src_scc +// GFX1250: v_cvt_f32_bf16_e32 v5, src_scc ; encoding: [0xfd,0xe4,0x0a,0x7e] + +v_cvt_f32_bf16 v127, 0x8000 +// GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_cvt_f32_bf16 v5, v1.h +// GFX1250: v_cvt_f32_bf16_e32 v5, v1.h ; encoding: [0x81,0xe5,0x0a,0x7e] + +v_cvt_pk_f16_bf8 v1, v2 +// GFX1250: v_cvt_pk_f16_bf8 v1, v2 ; encoding: [0x02,0xed,0x02,0x7e] + +v_cvt_pk_f16_bf8 v1, s2 +// GFX1250: v_cvt_pk_f16_bf8 v1, s2 ; encoding: [0x02,0xec,0x02,0x7e] + +v_cvt_pk_f16_bf8 v1, 100 +// GFX1250: 
v_cvt_pk_f16_bf8 v1, 0x64 ; encoding: [0xff,0xec,0x02,0x7e,0x64,0x00,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, v2 +// GFX1250: v_cvt_pk_f16_fp8 v1, v2 ; encoding: [0x02,0xeb,0x02,0x7e] + +v_cvt_pk_f16_fp8 v1, s2 +// GFX1250: v_cvt_pk_f16_fp8 v1, s2 ; encoding: [0x02,0xea,0x02,0x7e] + +v_cvt_pk_f16_fp8 v1, 100 +// GFX1250: v_cvt_pk_f16_fp8 v1, 0x64 ; encoding: [0xff,0xea,0x02,0x7e,0x64,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s new file mode 100644 index 0000000000000..1ec54d137b335 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_mirror +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_half_mirror +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shl:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shl:15 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shr:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shr:15 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_ror:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_ror:15 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cvt_f32_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xea,0x02,0x7e,0x02,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_bf8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xec,0x02,0x7e,0x02,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s new file mode 100644 index 0000000000000..d674a9ea06843 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -0,0 +1,79 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + 
+v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_mirror +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_half_mirror +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shl:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shl:15 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shr:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_shr:15 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_cvt_f32_bf16 v5, v1 row_ror:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_ror:15 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cvt_f32_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// 
GFX1250: v_cvt_pk_f16_bf8_dpp v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xec,0x02,0x7e,0x02,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v2.h quad_perm:[0,1,2,3] +// GFX1250: v_cvt_pk_f16_bf8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xec,0x02,0x7e,0x82,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xea,0x02,0x7e,0x02,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2.h quad_perm:[0,1,2,3] +// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s new file mode 100644 index 0000000000000..9ab3a8adfa511 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -0,0 +1,23 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cvt_f32_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xea,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xec,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s new file mode 100644 index 0000000000000..6904624471801 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -0,0 +1,35 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; 
encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cvt_f32_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xea,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_bf8_dpp v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xec,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_pk_f16_bf8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xec,0x02,0x7e,0x82,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s new file mode 100644 index 0000000000000..c393d3e819880 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s @@ -0,0 +1,36 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR 
--implicit-check-not=error: --strict-whitespace %s + +v_cvt_pk_f16_bf8 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cvt_pk_f16_bf8 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cvt_pk_f16_bf8 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cvt_pk_f16_bf8 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cvt_pk_f16_fp8 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cvt_pk_f16_fp8 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cvt_pk_f16_fp8 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cvt_pk_f16_fp8 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cvt_f32_bf16 v5, v1 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cvt_f32_bf16 v5, v1 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cvt_f32_bf16 v5, v1 div:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 div:2 +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s new file mode 100644 index 0000000000000..f6c7cf8006508 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -0,0 +1,101 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s + +v_cvt_f32_bf16_e64 v5, v1 +// GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, v255 +// GFX1250: v_cvt_f32_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf2,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s1 +// GFX1250: v_cvt_f32_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s105 +// GFX1250: v_cvt_f32_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf2,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf2,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf2,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, m0 +// GFX1250: v_cvt_f32_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf2,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_lo +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_hi +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf2,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, null +// GFX1250: v_cvt_f32_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf2,0xd5,0x7c,0x00,0x00,0x00] + 
+v_cvt_f32_bf16_e64 v5, -1 +// GFX1250: v_cvt_f32_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf2,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, v1 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, v1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, v255 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, v255 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s1 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, s1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s105 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, s105 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_lo op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, vcc_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_hi op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, vcc_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, ttmp15 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, ttmp15 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, m0 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, m0 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_lo op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_hi op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, null op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, null op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, -1 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, -1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, src_scc op_sel:[1] +// GFX1250: 
v_cvt_f32_bf16_e64 v5, src_scc op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, v150 +// GFX1250: v_cvt_pk_f16_bf8 v1, v150 ; encoding: [0x01,0x00,0xf6,0xd5,0x96,0x01,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, v2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_bf8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, v150 op_sel:[1] +// GFX1250: v_cvt_pk_f16_bf8 v1, v150 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x96,0x01,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, s2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_bf8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, v150 +// GFX1250: v_cvt_pk_f16_fp8 v1, v150 ; encoding: [0x01,0x00,0xf5,0xd5,0x96,0x01,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_fp8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] +// GFX1250: v_cvt_pk_f16_fp8 v1, v150 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x96,0x01,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s new file mode 100644 index 0000000000000..531d734a0683d --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -0,0 +1,104 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s + +v_cvt_f32_bf16_e64 v5, v1 +// GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, v255 +// GFX1250: v_cvt_f32_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf2,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s1 +// GFX1250: v_cvt_f32_bf16_e64 v5, s1 ; 
encoding: [0x05,0x00,0xf2,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s105 +// GFX1250: v_cvt_f32_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf2,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf2,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf2,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, m0 +// GFX1250: v_cvt_f32_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf2,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_lo +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_hi +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf2,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, null +// GFX1250: v_cvt_f32_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf2,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, -1 +// GFX1250: v_cvt_f32_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf2,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, v1 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, v1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, v255 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, v255 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s1 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, s1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, s105 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, s105 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_lo op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, vcc_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, vcc_hi op_sel:[1] +// 
GFX1250: v_cvt_f32_bf16_e64 v5, vcc_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, ttmp15 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, ttmp15 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, m0 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, m0 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_lo op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, exec_hi op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, exec_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, null op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, null op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, -1 op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, -1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, src_scc op_sel:[1] +// GFX1250: v_cvt_f32_bf16_e64 v5, src_scc op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_f32_bf16_e64 v5, v128.h +// GFX1250: v_cvt_f32_bf16_e64 v5, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, v150 +// GFX1250: v_cvt_pk_f16_bf8 v1, v150 ; encoding: [0x01,0x00,0xf6,0xd5,0x96,0x01,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, v2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_bf8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, v150 op_sel:[1] +// GFX1250: v_cvt_pk_f16_bf8 v1, v150 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x96,0x01,0x00,0x00] + +v_cvt_pk_f16_bf8 v1, s2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_bf8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, v150 +// GFX1250: v_cvt_pk_f16_fp8 v1, v150 ; encoding: [0x01,0x00,0xf5,0xd5,0x96,0x01,0x00,0x00] + 
+v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_fp8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] +// GFX1250: v_cvt_pk_f16_fp8 v1, v150 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x96,0x01,0x00,0x00] + +v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] +// GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s new file mode 100644 index 0000000000000..844b4259229ed --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -0,0 +1,63 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_cvt_f32_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf 
+// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v128 op_sel:[1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v128 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x08,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v2 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s new file mode 100644 index 0000000000000..32c2e54cf0e71 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc 
-triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v128.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v128.h quad_perm:[0,1,2,3] +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v128.h quad_perm:[0,1,2,3] +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s new file mode 100644 index 0000000000000..75692c7422f64 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -0,0 +1,23 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_cvt_pk_f16_bf8 v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x01,0x00,0xf6,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_cvt_pk_f16_bf8 v1, v2 op_sel:[1] dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v2 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x01,0x00,0xf5,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v2 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s new file mode 100644 index 0000000000000..2c1eb47164e59 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -0,0 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_cvt_pk_f16_bf8 v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x01,0x00,0xf6,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_bf8 v1, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_pk_f16_bf8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x01,0x00,0xf5,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_pk_f16_fp8 v1, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt new file mode 100644 index 0000000000000..47eebb9d44a95 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -0,0 +1,71 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 
-mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] + +0xc1,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, -1 ; encoding: [0xc1,0xe4,0x0a,0x7e] + +0xf0,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xe4,0x0a,0x7e] + +0x7f,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xe4,0x0a,0x7e] + +0x7e,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xe4,0x0a,0x7e] + +0x7d,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, m0 ; encoding: [0x7d,0xe4,0x0a,0x7e] + +0x7c,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, null ; encoding: [0x7c,0xe4,0x0a,0x7e] + +0x01,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, s1 ; encoding: [0x01,0xe4,0x0a,0x7e] + +0x69,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, s105 ; encoding: [0x69,0xe4,0x0a,0x7e] + +0xfd,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, src_scc ; encoding: [0xfd,0xe4,0x0a,0x7e] + +0x7b,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xe4,0x0a,0x7e] + +0x01,0xe5,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, v1.l ; encoding: [0x01,0xe5,0x0a,0x7e] + +0x7f,0xe5,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, v127.l ; encoding: [0x7f,0xe5,0x0a,0x7e] + +0x6b,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xe4,0x0a,0x7e] + +0x6a,0xe4,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xe4,0x0a,0x7e] + +0x81,0xe5,0x0a,0x7e +# GFX1250: v_cvt_f32_bf16_e32 v5, v1.h ; encoding: [0x81,0xe5,0x0a,0x7e] + +0xff,0xec,0x02,0x7e,0x64,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f16_bf8 v1, 0x64 ; encoding: [0xff,0xec,0x02,0x7e,0x64,0x00,0x00,0x00] + +0x02,0xec,0x02,0x7e +# 
GFX1250: v_cvt_pk_f16_bf8 v1, s2 ; encoding: [0x02,0xec,0x02,0x7e] + +0x02,0xed,0x02,0x7e +# GFX1250-REAL16: v_cvt_pk_f16_bf8 v1, v2.l ; encoding: [0x02,0xed,0x02,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8 v1, v2 ; encoding: [0x02,0xed,0x02,0x7e] + +0xff,0xea,0x02,0x7e,0x64,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f16_fp8 v1, 0x64 ; encoding: [0xff,0xea,0x02,0x7e,0x64,0x00,0x00,0x00] + +0x02,0xea,0x02,0x7e +# GFX1250: v_cvt_pk_f16_fp8 v1, s2 ; encoding: [0x02,0xea,0x02,0x7e] + +0x02,0xeb,0x02,0x7e +# GFX1250-REAL16: v_cvt_pk_f16_fp8 v1, v2.l ; encoding: [0x02,0xeb,0x02,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8 v1, v2 ; encoding: [0x02,0xeb,0x02,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt new file mode 100644 index 0000000000000..25e982b7fd688 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -0,0 +1,64 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff] + +0xfa,0xec,0x02,0x7e,0x02,0xe4,0x04,0xff +# GFX1250-REAL16: v_cvt_pk_f16_bf8_dpp v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xec,0x02,0x7e,0x02,0xe4,0x04,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xec,0x02,0x7e,0x02,0xe4,0x04,0xff] + +0xfa,0xec,0x02,0x7e,0x82,0xe4,0x00,0xff +# GFX1250-REAL16: v_cvt_pk_f16_bf8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xec,0x02,0x7e,0x82,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xec,0x02,0x7e,0x82,0xe4,0x00,0xff] + +0xfa,0xea,0x02,0x7e,0x02,0xe4,0x04,0xff +# GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xea,0x02,0x7e,0x02,0xe4,0x04,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xea,0x02,0x7e,0x02,0xe4,0x04,0xff] + +0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff +# GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt new file mode 100644 index 0000000000000..bd524af907ee0 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 
-mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05 +# GFX1250: v_cvt_f32_bf16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +0xea,0xec,0x02,0x7e,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_bf8_dpp v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xec,0x02,0x7e,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xec,0x02,0x7e,0x02,0x77,0x39,0x05] + +0xe9,0xec,0x02,0x7e,0x82,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_bf8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xec,0x02,0x7e,0x82,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xec,0x02,0x7e,0x82,0x77,0x39,0x05] + +0xea,0xea,0x02,0x7e,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xea,0x02,0x7e,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xea,0x02,0x7e,0x02,0x77,0x39,0x05] + +0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt new file mode 100644 index 0000000000000..70abf4289ac11 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s + +0x05,0x00,0xf2,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf2,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf2,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf2,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf2,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf2,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xf2,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_bf16_e64 v5, v1.l ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_bf16_e64 v5, v255.l ; encoding: [0x05,0x00,0xf2,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf2,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf2,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xf2,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x08,0xf2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_bf16_e64 v5, v1.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64 v5, v1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_bf16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64 v5, v255 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] + +0x01,0x08,0xf6,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f16_bf8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x00,0x00,0x00] + +0x01,0x00,0xf6,0xd5,0x96,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f16_bf8 v1, v150.l ; encoding: [0x01,0x00,0xf6,0xd5,0x96,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8 v1, v150 ; encoding: [0x01,0x00,0xf6,0xd5,0x96,0x01,0x00,0x00] + +0x01,0x08,0xf6,0xd5,0x96,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f16_bf8 v1, v150.h op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x96,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8 v1, v150 op_sel:[1,0] ; encoding: 
[0x01,0x08,0xf6,0xd5,0x96,0x01,0x00,0x00] + +0x01,0x08,0xf6,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f16_bf8 v1, v2.h op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x01,0x00,0x00] + +0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] + +0x01,0x00,0xf5,0xd5,0x96,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f16_fp8 v1, v150.l ; encoding: [0x01,0x00,0xf5,0xd5,0x96,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8 v1, v150 ; encoding: [0x01,0x00,0xf5,0xd5,0x96,0x01,0x00,0x00] + +0x01,0x08,0xf5,0xd5,0x96,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f16_fp8 v1, v150.h op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x96,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8 v1, v150 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x96,0x01,0x00,0x00] + +0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f16_fp8 v1, v2.h op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt new file mode 100644 index 0000000000000..d53d532eef804 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -0,0 +1,67 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s + 
+0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + 
+0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# 
GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x08,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v128.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v128 op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + +0x01,0x00,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff +# GFX1250-REAL16: v_cvt_pk_f16_bf8_e64_dpp v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_e64_dpp v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] + +0x01,0x08,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff +# GFX1250-REAL16: v_cvt_pk_f16_bf8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_e64_dpp v1, v128 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf6,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] + +0x01,0x00,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff +# GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: 
[0x01,0x00,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x04,0xff] + +0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff +# GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt new file mode 100644 index 0000000000000..8df21f3f5e4df --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -0,0 +1,27 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s + +0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x08,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v128 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 
+0x01,0x00,0xf6,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_bf8_e64_dpp v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x01,0x00,0xf6,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_e64_dpp v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x01,0x00,0xf6,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + +0x01,0x08,0xf6,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_bf8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_bf8_e64_dpp v1, v128 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + +0x01,0x00,0xf5,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x01,0x00,0xf5,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x01,0x00,0xf5,0xd5,0xea,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + +0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt b/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt index 17889c15cbf95..0a83a200c9317 100644 --- a/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt +++ b/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt @@ -10,265 +10,197 @@ # RUN: llvm-mc --disassemble -triple=riscv32 -mattr=+c -mattr=+Zcmop \ # RUN: -M no-aliases --show-encoding < %s | \ # RUN: FileCheck --check-prefixes=GOOD,MOP %s -# -# RUN: llvm-mc 
--disassemble -triple=riscv32 -mattr=+c -mattr=+no-rvc-hints \ -# RUN: -M no-aliases --show-encoding < %s 2>&1 | \ -# RUN: FileCheck --check-prefix=NOHINTS %s # BAD: invalid instruction encoding -# NOHINTS: invalid instruction encoding 0x01 0x60 # GOOD: c.lui zero, 1 -# NOHINTS: invalid instruction encoding 0x05 0x60 # GOOD: c.lui zero, 2 -# NOHINTS: invalid instruction encoding 0x09 0x60 # GOOD: c.lui zero, 3 -# NOHINTS: invalid instruction encoding 0x0D 0x60 # GOOD: c.lui zero, 4 -# NOHINTS: invalid instruction encoding 0x11 0x060 # GOOD: c.lui zero, 5 -# NOHINTS: invalid instruction encoding 0x15 0x60 # GOOD: c.lui zero, 6 -# NOHINTS: invalid instruction encoding 0x19 0x60 # GOOD: c.lui zero, 7 -# NOHINTS: invalid instruction encoding 0x1D 0x60 # GOOD: c.lui zero, 8 -# NOHINTS: invalid instruction encoding 0x21 0x60 # GOOD: c.lui zero, 9 -# NOHINTS: invalid instruction encoding 0x25 0x60 # GOOD: c.lui zero, 10 -# NOHINTS: invalid instruction encoding 0x29 0x60 # GOOD: c.lui zero, 11 -# NOHINTS: invalid instruction encoding 0x2D 0x60 # GOOD: c.lui zero, 12 -# NOHINTS: invalid instruction encoding 0x31 0x60 # GOOD: c.lui zero, 13 -# NOHINTS: invalid instruction encoding 0x35 0x60 # GOOD: c.lui zero, 14 -# NOHINTS: invalid instruction encoding 0x39 0x60 # GOOD: c.lui zero, 15 -# NOHINTS: invalid instruction encoding 0x3D 0x60 # GOOD: c.lui zero, 16 -# NOHINTS: invalid instruction encoding 0x41 0x60 # GOOD: c.lui zero, 17 -# NOHINTS: invalid instruction encoding 0x45 0x60 # GOOD: c.lui zero, 18 -# NOHINTS: invalid instruction encoding 0x49 0x60 # GOOD: c.lui zero, 19 -# NOHINTS: invalid instruction encoding 0x4D 0x60 # GOOD: c.lui zero, 20 -# NOHINTS: invalid instruction encoding 0x51 0x60 # GOOD: c.lui zero, 21 -# NOHINTS: invalid instruction encoding 0x55 0x60 # GOOD: c.lui zero, 22 -# NOHINTS: invalid instruction encoding 0x59 0x60 # GOOD: c.lui zero, 23 -# NOHINTS: invalid instruction encoding 0x5D 0x60 # GOOD: c.lui zero, 24 -# NOHINTS: invalid instruction 
encoding 0x61 0x60 # GOOD: c.lui zero, 25 -# NOHINTS: invalid instruction encoding 0x65 0x60 # GOOD: c.lui zero, 26 -# NOHINTS: invalid instruction encoding 0x69 0x60 # GOOD: c.lui zero, 27 -# NOHINTS: invalid instruction encoding 0x6D 0x60 # GOOD: c.lui zero, 28 -# NOHINTS: invalid instruction encoding 0x71 0x60 # GOOD: c.lui zero, 29 -# NOHINTS: invalid instruction encoding 0x75 0x60 # GOOD: c.lui zero, 30 -# NOHINTS: invalid instruction encoding 0x79 0x60 # GOOD: c.lui zero, 31 -# NOHINTS: invalid instruction encoding 0x7D 0x60 # GOOD: c.lui zero, 1048544 -# NOHINTS: invalid instruction encoding 0x01 0x70 # GOOD: c.lui zero, 1048545 -# NOHINTS: invalid instruction encoding 0x05 0x70 # GOOD: c.lui zero, 1048546 -# NOHINTS: invalid instruction encoding 0x09 0x70 # GOOD: c.lui zero, 1048547 -# NOHINTS: invalid instruction encoding 0x0D 0x70 # GOOD: c.lui zero, 1048548 -# NOHINTS: invalid instruction encoding 0x11 0x70 # GOOD: c.lui zero, 1048549 -# NOHINTS: invalid instruction encoding 0x15 0x70 # GOOD: c.lui zero, 1048550 -# NOHINTS: invalid instruction encoding 0x19 0x70 # GOOD: c.lui zero, 1048551 -# NOHINTS: invalid instruction encoding 0x1D 0x70 # GOOD: c.lui zero, 1048552 -# NOHINTS: invalid instruction encoding 0x21 0x70 # GOOD: c.lui zero, 1048553 -# NOHINTS: invalid instruction encoding 0x25 0x70 # GOOD: c.lui zero, 1048554 -# NOHINTS: invalid instruction encoding 0x29 0x70 # GOOD: c.lui zero, 1048555 -# NOHINTS: invalid instruction encoding 0x2D 0x70 # GOOD: c.lui zero, 1048556 -# NOHINTS: invalid instruction encoding 0x31 0x70 # GOOD: c.lui zero, 1048557 -# NOHINTS: invalid instruction encoding 0x35 0x70 # GOOD: c.lui zero, 1048558 -# NOHINTS: invalid instruction encoding 0x39 0x70 # GOOD: c.lui zero, 1048559 -# NOHINTS: invalid instruction encoding 0x3D 0x70 # GOOD: c.lui zero, 1048560 -# NOHINTS: invalid instruction encoding 0x41 0x70 # GOOD: c.lui zero, 1048561 -# NOHINTS: invalid instruction encoding 0x45 0x70 # GOOD: c.lui zero, 1048562 -# NOHINTS: 
invalid instruction encoding 0x49 0x70 # GOOD: c.lui zero, 1048563 -# NOHINTS: invalid instruction encoding 0x4D 0x70 # GOOD: c.lui zero, 1048564 -# NOHINTS: invalid instruction encoding 0x51 0x70 # GOOD: c.lui zero, 1048565 -# NOHINTS: invalid instruction encoding 0x55 0x70 # GOOD: c.lui zero, 1048566 -# NOHINTS: invalid instruction encoding 0x59 0x70 # GOOD: c.lui zero, 1048567 -# NOHINTS: invalid instruction encoding 0x5D 0x70 # GOOD: c.lui zero, 1048568 -# NOHINTS: invalid instruction encoding 0x61 0x70 # GOOD: c.lui zero, 1048569 -# NOHINTS: invalid instruction encoding 0x65 0x70 # GOOD: c.lui zero, 1048570 -# NOHINTS: invalid instruction encoding 0x69 0x70 # GOOD: c.lui zero, 1048571 -# NOHINTS: invalid instruction encoding 0x6D 0x70 # GOOD: c.lui zero, 1048572 -# NOHINTS: invalid instruction encoding 0x71 0x70 # GOOD: c.lui zero, 1048573 -# NOHINTS: invalid instruction encoding 0x75 0x70 # GOOD: c.lui zero, 1048574 -# NOHINTS: invalid instruction encoding 0x79 0x70 # GOOD: c.lui zero, 1048575 -# NOHINTS: invalid instruction encoding 0x7D 0x70 # BAD: invalid instruction encoding diff --git a/llvm/test/MC/RISCV/rv32c-invalid.s b/llvm/test/MC/RISCV/rv32c-invalid.s index 8dddbf887c87c..413573af1c5e6 100644 --- a/llvm/test/MC/RISCV/rv32c-invalid.s +++ b/llvm/test/MC/RISCV/rv32c-invalid.s @@ -1,6 +1,6 @@ -# RUN: not llvm-mc -triple=riscv32 -mattr=+c -mattr=+no-rvc-hints < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=+c < %s 2>&1 \ # RUN: | FileCheck %s -# RUN: not llvm-mc -triple=riscv32 -mattr=+zca -mattr=+no-rvc-hints < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=+zca < %s 2>&1 \ # RUN: | FileCheck %s ## GPRC @@ -23,16 +23,10 @@ c.lwsp x0, 4(sp) # CHECK: :[[@LINE]]:9: error: register must be a GPR excluding c.lwsp zero, 4(sp) # CHECK: :[[@LINE]]:9: error: register must be a GPR excluding zero (x0) c.jr x0 # CHECK: :[[@LINE]]:7: error: register must be a GPR excluding zero (x0) c.jalr zero # CHECK: :[[@LINE]]:9: error: register must be a GPR 
excluding zero (x0) -c.addi x0, x0, 1 # CHECK: :[[@LINE]]:13: error: immediate must be zero -c.li zero, 2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RVC Hint Instructions{{$}} -c.slli zero, zero, 4 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction -c.mv zero, s0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RVC Hint Instructions{{$}} c.mv ra, x0 # CHECK: :[[@LINE]]:11: error: register must be a GPR excluding zero (x0) c.add ra, ra, x0 # CHECK: :[[@LINE]]:16: error: invalid operand for instruction -c.add zero, zero, sp # CHECK: :[[@LINE]]:14: error: invalid operand for instruction ## GPRNoX0X2 -c.lui x0, 4 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RVC Hint Instructions{{$}} c.lui x2, 4 # CHECK: :[[@LINE]]:7: error: register must be a GPR excluding zero (x0) and sp (x2){{$}} ## SP @@ -57,7 +51,6 @@ c.andi a0, %lo(foo) # CHECK: :[[@LINE]]:12: error: immediate must be an integer c.andi a0, %hi(foo) # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31] ## simm6nonzero -c.addi t0, 0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RVC Hint Instructions{{$}} c.addi t0, -33 # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31] c.addi t0, 32 # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31] c.addi t0, foo # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31] diff --git a/llvm/test/MC/X86/gotpcrel-non-globals.ll b/llvm/test/MC/X86/gotpcrel-non-globals.ll new file mode 100644 index 0000000000000..222d2d73ff728 --- /dev/null +++ b/llvm/test/MC/X86/gotpcrel-non-globals.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +; Check that we emit the `@bar_*` symbols, and that we don't emit multiple symbols. 
+ +; CHECK-LABEL: .Lrel_0: +; CHECK: .long foo_0@GOTPCREL+0 +; CHECK-LABEL: .Lrel_1_failed: +; CHECK: .long bar_1-foo_0 +; CHECK-LABEL: .Lrel_2: +; CHECK: .long foo_2@GOTPCREL+0 + +; CHECK: bar_0: +; CHECK: bar_1: +; CHECK: bar_2_indirect: + +@rel_0 = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr @bar_0 to i64), i64 ptrtoint (ptr @rel_0 to i64)) to i32)] +@rel_1_failed = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr @bar_1 to i64), i64 ptrtoint (ptr @foo_0 to i64)) to i32)] +@rel_2 = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr @bar_2_indirect to i64), i64 ptrtoint (ptr @rel_2 to i64)) to i32)] +@bar_0 = internal unnamed_addr constant ptr @foo_0, align 8 +@bar_1 = internal unnamed_addr constant ptr @foo_1, align 8 +@bar_2_indirect = internal unnamed_addr constant ptr @foo_2, align 8 +@foo_0 = external global ptr, align 8 +@foo_1 = external global ptr, align 8 +@foo_2 = external global ptr, align 8 + +define void @foo(ptr %arg0, ptr %arg1) { + store ptr @bar_0, ptr %arg0, align 8 + store ptr @bar_1, ptr %arg1, align 8 + store ptr getelementptr (i8, ptr @bar_2_indirect, i32 1), ptr %arg1, align 8 + ret void +} diff --git a/llvm/test/TableGen/CompressWriteLatencyEntry.td b/llvm/test/TableGen/CompressWriteLatencyEntry.td index 88273e8858448..d6a9f0ac0dd76 100644 --- a/llvm/test/TableGen/CompressWriteLatencyEntry.td +++ b/llvm/test/TableGen/CompressWriteLatencyEntry.td @@ -33,10 +33,10 @@ def Read_D : SchedRead; // CHECK-NEXT: }; // MyTargetReadAdvanceTable // CHECK: static const llvm::MCSchedClassDesc SchedModel_ASchedClasses[] = { -// CHECK-NEXT: {DBGFIELD("InvalidSchedClass") 8191, false, false, false, 0, 0, 0, 0, 0, 0}, -// CHECK-NEXT: {DBGFIELD("Inst_A") 1, false, false, false, 0, 0, 1, 1, 0, 0}, // #1 -// CHECK-NEXT: {DBGFIELD("Inst_B") 1, false, false, false, 0, 0, 2, 1, 0, 0}, // #2 -// CHECK-NEXT: {DBGFIELD("Inst_C") 1, false, false, false, 0, 0, 1, 1, 
1, 1}, // #3 +// CHECK-NEXT: {DBGFIELD(1) 8191, false, false, false, 0, 0, 0, 0, 0, 0}, +// CHECK-NEXT: {DBGFIELD(/*Inst_A*/ {{[0-9]+}}) 1, false, false, false, 0, 0, 1, 1, 0, 0}, // #1 +// CHECK-NEXT: {DBGFIELD(/*Inst_B*/ {{[0-9]+}}) 1, false, false, false, 0, 0, 2, 1, 0, 0}, // #2 +// CHECK-NEXT: {DBGFIELD(/*Inst_C*/ {{[0-9]+}}) 1, false, false, false, 0, 0, 1, 1, 1, 1}, // #3 // CHECK-NEXT: }; // SchedModel_ASchedClasses let SchedModel = SchedModel_A in { diff --git a/llvm/test/TableGen/InvalidMCSchedClassDesc.td b/llvm/test/TableGen/InvalidMCSchedClassDesc.td index de5392237a84c..e43edd4174589 100644 --- a/llvm/test/TableGen/InvalidMCSchedClassDesc.td +++ b/llvm/test/TableGen/InvalidMCSchedClassDesc.td @@ -1,13 +1,13 @@ // RUN: llvm-tblgen -gen-subtarget -I %p/../../include %s 2>&1 | FileCheck %s -// Check if it is valid MCSchedClassDesc if didn't have the resources. +// Check if it is valid MCSchedClassDesc if didn't have the resources. include "llvm/Target/Target.td" def MyTarget : Target; let OutOperandList = (outs), InOperandList = (ins) in { - def Inst_A : Instruction; - def Inst_B : Instruction; + def Inst_A : Instruction; + def Inst_B : Instruction; } let CompleteModel = 0 in { @@ -18,8 +18,8 @@ let CompleteModel = 0 in { // Inst_B didn't have the resoures, and it is invalid. // CHECK: SchedModel_ASchedClasses[] = { -// CHECK: {DBGFIELD("Inst_A") 1 -// CHECK-NEXT: {DBGFIELD("Inst_B") 8191 +// CHECK: {DBGFIELD(/*Inst_A*/ 19) 1 +// CHECK-NEXT: {DBGFIELD(/*Inst_B*/ 26) 8191 let SchedModel = SchedModel_A in { def Write_A : SchedWriteRes<[]>; def : InstRW<[Write_A], (instrs Inst_A)>; @@ -27,18 +27,18 @@ let SchedModel = SchedModel_A in { // Inst_A didn't have the resoures, and it is invalid. 
// CHECK: SchedModel_BSchedClasses[] = { -// CHECK: {DBGFIELD("Inst_A") 8191 -// CHECK-NEXT: {DBGFIELD("Inst_B") 1 +// CHECK: {DBGFIELD(/*Inst_A*/ 19) 8191 +// CHECK-NEXT: {DBGFIELD(/*Inst_B*/ 26) 1 let SchedModel = SchedModel_B in { - def Write_B: SchedWriteRes<[]>; + def Write_B: SchedWriteRes<[]>; def : InstRW<[Write_B], (instrs Inst_B)>; } // CHECK: SchedModel_CSchedClasses[] = { -// CHECK: {DBGFIELD("Inst_A") 1 -// CHECK-NEXT: {DBGFIELD("Inst_B") 1 +// CHECK: {DBGFIELD(/*Inst_A*/ 19) 1 +// CHECK-NEXT: {DBGFIELD(/*Inst_B*/ 26) 1 let SchedModel = SchedModel_C in { - def Write_C: SchedWriteRes<[]>; + def Write_C: SchedWriteRes<[]>; def : InstRW<[Write_C], (instrs Inst_A, Inst_B)>; } diff --git a/llvm/test/Transforms/FunctionAttrs/initializes.ll b/llvm/test/Transforms/FunctionAttrs/initializes.ll index 937595b5e9b74..5800bc1ca7864 100644 --- a/llvm/test/Transforms/FunctionAttrs/initializes.ll +++ b/llvm/test/Transforms/FunctionAttrs/initializes.ll @@ -443,7 +443,7 @@ define void @memset_neg(ptr %p) { } define void @memset_volatile(ptr %p) { -; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: write) +; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: write, inaccessiblemem: readwrite) ; CHECK-LABEL: define void @memset_volatile( ; CHECK-SAME: ptr writeonly [[P:%.*]]) #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[P]], i8 2, i64 9, i1 true) @@ -478,7 +478,7 @@ define void @memcpy(ptr %p, ptr %p2) { } define void @memcpy_volatile(ptr %p, ptr %p2) { -; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) +; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) ; CHECK-LABEL: define void @memcpy_volatile( ; CHECK-SAME: ptr writeonly [[P:%.*]], ptr readonly [[P2:%.*]]) #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[P]], ptr 
[[P2]], i64 9, i1 true) @@ -541,7 +541,7 @@ define void @memmove(ptr %p, ptr %p2) { } define void @memmove_volatile(ptr %p, ptr %p2) { -; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) +; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) ; CHECK-LABEL: define void @memmove_volatile( ; CHECK-SAME: ptr writeonly [[P:%.*]], ptr readonly [[P2:%.*]]) #[[ATTR6]] { ; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr [[P]], ptr [[P2]], i64 9, i1 true) diff --git a/llvm/test/Transforms/FunctionAttrs/nosync.ll b/llvm/test/Transforms/FunctionAttrs/nosync.ll index de5398f17ce51..9abfbb21a71a0 100644 --- a/llvm/test/Transforms/FunctionAttrs/nosync.ll +++ b/llvm/test/Transforms/FunctionAttrs/nosync.ll @@ -236,7 +236,7 @@ declare void @llvm.memset(ptr %dest, i8 %val, i32 %len, i1 %isvolatile) ; negative, checking volatile intrinsics. define i32 @memcpy_volatile(ptr %ptr1, ptr %ptr2) { -; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) +; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) ; CHECK-LABEL: @memcpy_volatile( ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], i32 8, i1 true) ; CHECK-NEXT: ret i32 4 diff --git a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll index 8bc71148352d2..5bc733f5622c7 100644 --- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll +++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -S -passes='function(instsimplify),hotcoldsplit' -hotcoldsplit-threshold=-1 -debug < %s 2>&1 | FileCheck %s +; RUN: opt -S -passes='function(instsimplify),hotcoldsplit' -hotcoldsplit-threshold=-1 < %s 2>&1 | 
FileCheck %s ; RUN: opt -passes='function(instcombine),hotcoldsplit,function(instsimplify)' %s -o /dev/null target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll index c5c17d65524c2..d824d6d35643d 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s -; XFAIL: target={{.*}}-aix{{.*}} define double @test_atan_0() { ; CHECK-LABEL: define double @test_atan_0() { diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/calls.ll b/llvm/test/Transforms/InstSimplify/ConstProp/calls.ll index 61a30c781c0f4..26fb8c0d7a1c6 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/calls.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/calls.ll @@ -202,5 +202,17 @@ entry: ret float %0 } +define float @test_atan_negzero() nounwind uwtable ssp { +entry: +; CHECK-LABEL: @test_atan_negzero( +; CHECK: ret float -0.000000e+00 +; +; FNOBUILTIN-LABEL: @test_atan_negzero( +; FNOBUILTIN: ret float -0.000000e+00 +; + %1 = call float @atanf(float -0.0) + ret float %1 +} + declare double @llvm.pow.f64(double, double) nounwind readonly declare float @llvm.pow.f32(float, float) nounwind readonly diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index 61a68692ff5b9..c565066541d1d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -16,17 +16,13 @@ define void @deinterleave4(ptr %src) { ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } 
@llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 - %sum = add %5, %7 - %sub = sub %6, %8 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 + %sum = add %1, %2 + %sub = sub %3, %4 ret void } @@ -58,17 +54,13 @@ define void @wide_deinterleave4(ptr %src) { ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv32i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 - %sum = add %5, %7 - %sub = sub %6, %8 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv32i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 + %sum = add %1, %2 + %sub = sub %3, %4 ret void } @@ -87,52 +79,36 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) { ; %load = load , ptr %src, align 4 - 
%deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 %load2 = load , ptr %src, align 4 - %deinterleave_src2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load2) - %ld2_1 = extractvalue { , } %deinterleave_src2, 0 - %ld2_2 = extractvalue { , } %deinterleave_src2, 1 + %deinterleave2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load2) + %ld2_1 = extractvalue { , } %deinterleave2, 0 + %ld2_2 = extractvalue { , } %deinterleave2, 1 ret void } define void @negative_deinterleave4_test(ptr %src) { ; CHECK-LABEL: define void @negative_deinterleave4_test ; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[SRC]], i64 0 -; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), ptr [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr , ptr [[SRC]], i64 2 -; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat 
(i1 true), ptr [[TMP6]]) -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[LDN1]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP7]], i64 4) -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP5]], [[TMP9]], i64 4) -; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[DEINTERLEAVE_HALF1]], 0 -; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP10]]) -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[DEINTERLEAVE_HALF2]], 1 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[SRC]], align 4 +; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[LOAD]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %6 = extractvalue { , } %deinterleave_half2, 1 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 2 ret void } diff --git 
a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll index 085089978d8f5..a61db6577d56d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll @@ -8,9 +8,7 @@ define void @interleave4(ptr %dst, %a, %b, ; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[A]], [[B]], [[C]], [[D]], splat (i1 true), ptr [[DST]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv8i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv16i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv16i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst, align 4 ret void } @@ -32,9 +30,7 @@ define void @wide_interleave4(ptr %dst, %a, [[TMP7]], [[TMP8]], [[TMP9]], [[TMP10]], splat (i1 true), ptr [[TMP6]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv16i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv16i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv32i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv32i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst, align 4 ret void } @@ -46,9 +42,7 @@ define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, [[A]], [[C]], splat (i1 true), ptr [[DST2]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv8i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv16i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv16i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst1, align 4 
%interleaved = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) @@ -64,8 +58,7 @@ define void @duplicate_by_interleave( %A, % ; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[A]], [[A]], [[B]], [[B]], splat (i1 true), ptr [[AB_DUPLICATE]]) ; CHECK-NEXT: ret void ; - %interleave = tail call @llvm.vector.interleave2.nxv8i32( %A, %B) - %duplicate_by_interleave = tail call @llvm.vector.interleave2.nxv16i32( %interleave, %interleave) - store %duplicate_by_interleave, ptr %AB_duplicate, align 4 + %interleave = tail call @llvm.vector.interleave4.nxv16i32( %A, %A, %B, %B) + store %interleave, ptr %AB_duplicate, align 4 ret void } diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index 87b16d17aa5f0..72c1f22032bb7 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -175,53 +175,6 @@ define void @load_factor4_vscale(ptr %ptr) { ret void } -; TODO: Remove once recursive deinterleaving support is removed -define void @load_factor4_vscale_recursive(ptr %ptr) { -; RV32-LABEL: @load_factor4_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5) -; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) -; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 -; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) -; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 -; RV32-NEXT: [[TMP6:%.*]] = call 
@llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) -; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 -; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) -; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 -; RV32-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 -; RV32-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 -; RV32-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 -; RV32-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 -; RV32-NEXT: ret void -; -; RV64-LABEL: @load_factor4_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5) -; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) -; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 -; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) -; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 -; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) -; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 -; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) -; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 -; RV64-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 -; RV64-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 -; RV64-NEXT: [[TMP12:%.*]] = 
extractvalue { , , , } [[TMP9]], 2 -; RV64-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 -; RV64-NEXT: ret void -; - %interleaved.vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %t2 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - ret void -} - define void @load_factor5(ptr %ptr) { ; RV32-LABEL: @load_factor5( ; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.mask.v4i32.p0.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4) @@ -590,91 +543,6 @@ define void @load_factor8_vscale(ptr %ptr) { ret void } -; TODO: Remove once recursive deinterleaving support is removed -define void @load_factor8_vscale_recursive(ptr %ptr) { -; RV32-LABEL: @load_factor8_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5) -; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) -; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 -; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) -; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 -; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) -; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 -; RV32-NEXT: [[TMP8:%.*]] = call 
@llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) -; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 -; RV32-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) -; RV32-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 -; RV32-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) -; RV32-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 -; RV32-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) -; RV32-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 -; RV32-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) -; RV32-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 -; RV32-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 -; RV32-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 -; RV32-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 -; RV32-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 -; RV32-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 -; RV32-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 -; RV32-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 -; RV32-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 -; RV32-NEXT: ret void -; -; RV64-LABEL: @load_factor8_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5) -; 
RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) -; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 -; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) -; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 -; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) -; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 -; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) -; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 -; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) -; RV64-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 -; RV64-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) -; RV64-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 -; RV64-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) -; RV64-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 -; RV64-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) -; RV64-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 -; RV64-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 -; RV64-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 
-; RV64-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 -; RV64-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 -; RV64-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 -; RV64-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 -; RV64-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 -; RV64-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 -; RV64-NEXT: ret void -; - %interleaved.vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t1 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t3 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t4 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t6 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 - ret void -} - define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) { ; RV32-LABEL: @store_factor2( @@ -808,31 +676,6 @@ define void @store_factor4_vscale(ptr %ptr, %v0, %v0, %v1) { -; RV32-LABEL: @store_factor4_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) -; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) -; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) -; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3) -; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i32(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3) -; RV32-NEXT: ret void -; -; RV64-LABEL: @store_factor4_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) -; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) -; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) -; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3) -; RV64-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i64 -1, i64 3) -; RV64-NEXT: ret void -; - %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1) - store %i2, ptr %ptr, align 4 - ret void -} - define void @store_factor5_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4) { ; RV32-LABEL: 
@store_factor5_vscale( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, [[V0:%.*]], i32 0) @@ -1013,45 +856,6 @@ define void @store_factor8_vscale(ptr %ptr, %v0, %v0, %v1, %v2, %v3) { -; RV32-LABEL: @store_factor8_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) -; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1) -; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2) -; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3) -; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4) -; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5) -; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6) -; RV32-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7) -; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) 
[[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3) -; RV32-NEXT: ret void -; -; RV64-LABEL: @store_factor8_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) -; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1) -; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2) -; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3) -; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4) -; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5) -; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6) -; RV64-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7) -; RV64-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i64 -1, i64 3) -; RV64-NEXT: ret void -; - %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1) - - 
%i3 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3) - %i4 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3) - %i5 = call @llvm.vector.interleave2.nxv16i8( %i3, %i4) - - %i6 = call @llvm.vector.interleave2.nxv32i8( %i2, %i5) - store %i6, ptr %ptr, align 4 - ret void -} - define void @load_factor2_fp128(ptr %ptr) { ; RV32-LABEL: @load_factor2_fp128( ; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index b23b0ce759d49..dd8b7d6ea7e42 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -274,7 +274,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:for.body.preheader +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: for.body.preheader ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body.preheader: ; preds = %entry ; CHECK-NEXT: %0 = zext i32 %n to i64 @@ -282,7 +282,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %2 = mul nuw i64 %1, 4 ; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2 ; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:vector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: vector.scevcheck ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader ; CHECK-NEXT: %3 = add nsw i64 %0, -1 @@ -297,8 +297,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295 ; CHECK-NEXT: %10 = or i1 %8, %9 ; CHECK-NEXT: br i1 %10, label %scalar.ph, label 
%vector.memcheck -; CHECK-NEXT: LV: draw edge fromfor.body.preheader -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:vector.memcheck +; CHECK-NEXT: LV: draw edge from for.body.preheader +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: vector.memcheck ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.memcheck: ; preds = %vector.scevcheck ; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64() @@ -307,8 +307,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %14 = sub i64 %B1, %A2 ; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13 ; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph -; CHECK-NEXT: LV: draw edge fromvector.scevcheck -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:vector.ph +; CHECK-NEXT: LV: draw edge from vector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: vector.ph ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck ; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64() @@ -321,10 +321,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 ; CHECK-NEXT: %20 = sub i32 %n, %.cast ; CHECK-NEXT: br -; CHECK-NEXT: LV: draw edge fromvector.memcheck +; CHECK-NEXT: LV: draw edge from vector.memcheck ; CHECK-NEXT: LV: created vector.body -; CHECK-NEXT: LV: draw edge fromvector.ph -; CHECK-NEXT: LV: vectorizing VPBB:vector.body in BB:vector.body +; CHECK-NEXT: LV: draw edge from vector.ph +; CHECK-NEXT: LV: vectorizing VPBB: vector.body in BB: vector.body ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] @@ -351,28 +351,28 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec ; CHECK-NEXT: br i1 %34, , label %vector.body ; CHECK-NEXT: LV: created middle.block -; CHECK-NEXT: LV: draw edge fromvector.body -; CHECK-NEXT: LV: 
vectorizing VPBB:middle.block in BB:middle.block +; CHECK-NEXT: LV: draw edge from vector.body +; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: middle.block: ; preds = %vector.body ; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec ; CHECK-NEXT: br i1 %cmp.n, , -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:for.cond.cleanup.loopexit +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: for.cond.cleanup.loopexit ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body ; CHECK-NEXT: br label %for.cond.cleanup -; CHECK-NEXT: LV: draw edge frommiddle.block -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:scalar.ph +; CHECK-NEXT: LV: draw edge from middle.block +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: scalar.ph ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] ; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body -; CHECK-NEXT: LV: draw edge frommiddle.block -; CHECK-NEXT: LV: draw edge fromfor.body.preheader -; CHECK-NEXT: LV: draw edge fromvector.scevcheck -; CHECK-NEXT: LV: draw edge fromvector.memcheck -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:for.body +; CHECK-NEXT: LV: draw edge from middle.block +; CHECK-NEXT: LV: draw edge from for.body.preheader +; CHECK-NEXT: LV: draw edge from vector.scevcheck +; CHECK-NEXT: LV: draw edge from vector.memcheck +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: for.body ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] @@ -387,7 +387,7 @@ define void 
@vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: draw edge fromscalar.ph +; CHECK-NEXT: LV: draw edge from scalar.ph ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Vectorizing: innermost loop. ; CHECK-EMPTY: @@ -683,7 +683,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:for.body.preheader +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: for.body.preheader ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body.preheader: ; preds = %entry ; CHECK-NEXT: %0 = zext i32 %n to i64 @@ -691,7 +691,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %2 = mul nuw i64 %1, 4 ; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2 ; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:vector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: vector.scevcheck ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.scevcheck: ; preds = %for.body.preheader ; CHECK-NEXT: %3 = add nsw i64 %0, -1 @@ -706,8 +706,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295 ; CHECK-NEXT: %10 = or i1 %8, %9 ; CHECK-NEXT: br i1 %10, label %scalar.ph, label %vector.memcheck -; CHECK-NEXT: LV: draw edge fromfor.body.preheader -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:vector.memcheck +; CHECK-NEXT: LV: draw edge from for.body.preheader +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: vector.memcheck ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: 
vector.memcheck: ; preds = %vector.scevcheck ; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64() @@ -716,8 +716,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %14 = sub i64 %B1, %A2 ; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13 ; CHECK-NEXT: br i1 %diff.check, label %scalar.ph, label %vector.ph -; CHECK-NEXT: LV: draw edge fromvector.scevcheck -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:vector.ph +; CHECK-NEXT: LV: draw edge from vector.scevcheck +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: vector.ph ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.ph: ; preds = %vector.memcheck ; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64() @@ -730,10 +730,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 ; CHECK-NEXT: %20 = sub i32 %n, %.cast ; CHECK-NEXT: br -; CHECK-NEXT: LV: draw edge fromvector.memcheck +; CHECK-NEXT: LV: draw edge from vector.memcheck ; CHECK-NEXT: LV: created vector.body -; CHECK-NEXT: LV: draw edge fromvector.ph -; CHECK-NEXT: LV: vectorizing VPBB:vector.body in BB:vector.body +; CHECK-NEXT: LV: draw edge from vector.ph +; CHECK-NEXT: LV: vectorizing VPBB: vector.body in BB: vector.body ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] @@ -760,28 +760,28 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %34 = icmp eq i64 %index.next, %n.vec ; CHECK-NEXT: br i1 %34, , label %vector.body ; CHECK-NEXT: LV: created middle.block -; CHECK-NEXT: LV: draw edge fromvector.body -; CHECK-NEXT: LV: vectorizing VPBB:middle.block in BB:middle.block +; CHECK-NEXT: LV: draw edge from vector.body +; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: middle.block: ; preds = %vector.body ; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec ; 
CHECK-NEXT: br i1 %cmp.n, , -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:for.cond.cleanup.loopexit +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: for.cond.cleanup.loopexit ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body ; CHECK-NEXT: br label %for.cond.cleanup -; CHECK-NEXT: LV: draw edge frommiddle.block -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:scalar.ph +; CHECK-NEXT: LV: draw edge from middle.block +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: scalar.ph ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader ; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] ; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] ; CHECK-NEXT: br label %for.body -; CHECK-NEXT: LV: draw edge frommiddle.block -; CHECK-NEXT: LV: draw edge fromfor.body.preheader -; CHECK-NEXT: LV: draw edge fromvector.scevcheck -; CHECK-NEXT: LV: draw edge fromvector.memcheck -; CHECK-NEXT: LV: vectorizing VPBB:ir-bb in BB:for.body +; CHECK-NEXT: LV: draw edge from middle.block +; CHECK-NEXT: LV: draw edge from for.body.preheader +; CHECK-NEXT: LV: draw edge from vector.scevcheck +; CHECK-NEXT: LV: draw edge from vector.memcheck +; CHECK-NEXT: LV: vectorizing VPBB: ir-bb in BB: for.body ; CHECK-NEXT: LV: filled BB: ; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph ; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] @@ -796,7 +796,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: draw edge 
fromscalar.ph +; CHECK-NEXT: LV: draw edge from scalar.ph ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Vectorizing: innermost loop. ; diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll deleted file mode 100644 index 3e88672f29242..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll +++ /dev/null @@ -1,407 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck %s - -declare void @llvm.assume(i1) - -; %a is known dereferenceable via assume for the whole loop. -define void @deref_assumption_in_preheader_non_constant_trip_count_access_i8(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync { -; CHECK-LABEL: define void @deref_assumption_in_preheader_non_constant_trip_count_access_i8( -; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[N]]) ] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; 
CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] -; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP0]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> poison, i8 [[TMP17]], i32 0 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] -; CHECK: [[PRED_LOAD_CONTINUE]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i8> [ poison, %[[VECTOR_BODY]] ], [ [[TMP18]], %[[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i8> [[TMP9]], i8 [[TMP13]], i32 1 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_CONTINUE2]]: -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = phi <2 x i8> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i8> [[WIDE_LOAD]], <2 x i8> [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store <2 x i8> [[PREDPHI]], ptr [[TMP7]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i8 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i8 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i8 [[MERGE]], ptr [[GEP_C]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; -entry: - call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 %n) ] - br label %loop.header - -loop.header: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv - %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv - %l.b = load i8, ptr %gep.b, align 1 - %c.1 = icmp sge i8 %l.b, 0 - br i1 %c.1, label %loop.latch, 
label %loop.then - -loop.then: - %l.a = load i8, ptr %gep.a, align 1 - br label %loop.latch - -loop.latch: - %merge = phi i8 [ %l.a, %loop.then ], [ %l.b, %loop.header ] - %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv - store i8 %merge, ptr %gep.c, align 1 - %iv.next = add nuw nsw i64 %iv, 1 - %ec = icmp eq i64 %iv.next, %n - br i1 %ec, label %exit, label %loop.header - -exit: - ret void -} - -; %a is known dereferenceable via assume for the whole loop. -define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync { -; CHECK-LABEL: define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32( -; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[N]], 4 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[MUL]]) ] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP5]], 
label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] -; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP0]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP17]], i32 0 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] -; CHECK: [[PRED_LOAD_CONTINUE]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP18]], %[[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_CONTINUE2]]: -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: 
[[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 1 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; -entry: - %mul = mul nsw nuw i64 %n, 4 - call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 %mul) ] - br label %loop.header - -loop.header: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv - %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv - %l.b = load i32, ptr %gep.b, align 1 - %c.1 = icmp sge i32 %l.b, 0 - br i1 %c.1, label %loop.latch, label %loop.then - -loop.then: - %l.a = load i32, ptr %gep.a, align 1 - br label %loop.latch - -loop.latch: - %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] - %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv - store i32 %merge, 
ptr %gep.c, align 1 - %iv.next = add nuw nsw i64 %iv, 1 - %ec = icmp eq i64 %iv.next, %n - br i1 %ec, label %exit, label %loop.header - -exit: - ret void -} - - -; %a is NOT known dereferenceable via assume for the whole loop. -define void @deref_assumption_in_preheader_too_small_non_constant_trip_count_access_i32(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync { -; CHECK-LABEL: define void @deref_assumption_in_preheader_too_small_non_constant_trip_count_access_i32( -; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[N]]) ] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] -; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP0]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr 
[[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP17]], i32 0 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] -; CHECK: [[PRED_LOAD_CONTINUE]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP18]], %[[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_CONTINUE2]]: -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], 
%[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 1 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; -entry: - call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 %n) ] - br label %loop.header - -loop.header: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv - %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv - %l.b = load i32, ptr %gep.b, align 1 - %c.1 = icmp sge i32 %l.b, 0 - br i1 %c.1, label %loop.latch, label %loop.then - -loop.then: - %l.a = load i32, ptr %gep.a, align 1 - br label %loop.latch - -loop.latch: - %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] - %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv - store i32 %merge, ptr %gep.c, align 1 - %iv.next = add nuw nsw i64 %iv, 1 - %ec = icmp eq i64 %iv.next, %n - br i1 %ec, label %exit, label %loop.header - -exit: - ret void -} - -; %a is NOT known dereferenceable via assume for the whole loop. 
-define void @deref_assumption_in_preheader_too_small2_non_constant_trip_count_access_i32(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync { -; CHECK-LABEL: define void @deref_assumption_in_preheader_too_small2_non_constant_trip_count_access_i32( -; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 100) ] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] -; CHECK: [[PRED_LOAD_IF]]: -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP0]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP17]], i32 0 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] -; CHECK: [[PRED_LOAD_CONTINUE]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ 
poison, %[[VECTOR_BODY]] ], [ [[TMP18]], %[[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1 -; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] -; CHECK: [[PRED_LOAD_CONTINUE2]]: -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr 
[[GEP_B]], align 1 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; -entry: - call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 100) ] - br label %loop.header - -loop.header: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv - %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv - %l.b = load i32, ptr %gep.b, align 1 - %c.1 = icmp sge i32 %l.b, 0 - br i1 %c.1, label %loop.latch, label %loop.then - -loop.then: - %l.a = load i32, ptr %gep.a, align 1 - br label %loop.latch - -loop.latch: - %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] - %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv - store i32 %merge, ptr %gep.c, align 1 - %iv.next = add nuw nsw i64 %iv, 1 - %ec = icmp eq i64 %iv.next, %n - br i1 %ec, label %exit, label %loop.header - -exit: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll index 3a08681a2fb92..7c80dad006952 100644 --- a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll +++ b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll @@ -165,35 +165,6 @@ exit: ret void } -define void 
@loop_contains_store_assumed_bounds(ptr noalias %array, ptr readonly %pred, i32 %n) { -; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_assumed_bounds' -; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops. -entry: - %n_bytes = mul nuw nsw i32 %n, 2 - call void @llvm.assume(i1 true) [ "align"(ptr %pred, i64 2), "dereferenceable"(ptr %pred, i32 %n_bytes) ] - %tc = sext i32 %n to i64 - br label %for.body - -for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] - %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv - %data = load i16, ptr %st.addr, align 2 - %inc = add nsw i16 %data, 1 - store i16 %inc, ptr %st.addr, align 2 - %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv - %ee.val = load i16, ptr %ee.addr, align 2 - %ee.cond = icmp sgt i16 %ee.val, 500 - br i1 %ee.cond, label %exit, label %for.inc - -for.inc: - %iv.next = add nuw nsw i64 %iv, 1 - %counted.cond = icmp eq i64 %iv.next, %tc - br i1 %counted.cond, label %exit, label %for.body - -exit: - ret void -} - define void @loop_contains_store_to_pointer_with_no_deref_info(ptr align 2 dereferenceable(40) readonly %load.array, ptr align 2 noalias %array, ptr align 2 dereferenceable(40) readonly %pred) { ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_to_pointer_with_no_deref_info' ; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index a0e52c13ec621..d2617a1986764 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -274,14 +274,11 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 -; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> -; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( @@ -412,14 +409,11 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T26:%.*]] = getelementptr 
inbounds nuw i8, ptr [[T1:%.*]], i64 24 -; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 6c5638819dcea..8f31200a3683d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -274,14 +274,11 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 -; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 
x i32> -; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> -; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( @@ -412,14 +409,11 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 -; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> 
@llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll index 09a5ace101e64..3fd9e126f4685 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll @@ -15,15 +15,9 @@ define void @test(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <35 x i32> @llvm.masked.load.v35i32.p0(ptr [[P:%.*]], i32 4, <35 x i1> , <35 x i32> poison) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <35 x i32> [[TMP0]], <35 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2) ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]] ; 
CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll index fb607c72a0e35..0f78e236b4248 100644 --- a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll +++ b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll @@ -38,45 +38,12 @@ Z: ret void } -; Make sure the metadata name string is "branch_weights" before propagating it. - -define void @fake_weights(i1 %a, i1 %b) { -; CHECK-LABEL: @fake_weights( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_NOT:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A_NOT]], i1 [[C]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: common.ret: -; CHECK-NEXT: ret void -; CHECK: Y: -; CHECK-NEXT: call void @helper(i32 0) -; CHECK-NEXT: br label [[COMMON_RET:%.*]] -; CHECK: Z: -; CHECK-NEXT: call void @helper(i32 1) -; CHECK-NEXT: br label [[COMMON_RET]] -; -entry: - br i1 %a, label %Y, label %X, !prof !12 -X: - %c = or i1 %b, false - br i1 %c, label %Z, label %Y, !prof !1 - -Y: - call void @helper(i32 0) - ret void - -Z: - call void @helper(i32 1) - ret void -} - define void @test2(i1 %a, i1 %b) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -107,7 +74,7 @@ define void @test3(i1 %a, i1 %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label 
[[Z:%.*]], label [[Y:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -138,7 +105,7 @@ define void @test4(i1 %a, i1 %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = or i1 [[B:%.*]], false ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[A:%.*]], i1 [[C]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[Z:%.*]], label [[Y:%.*]], !prof [[PROF2]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: Y: @@ -1120,7 +1087,6 @@ exit: !9 = !{!"branch_weights", i32 7, i32 6} !10 = !{!"branch_weights", i32 672646, i32 21604207} !11 = !{!"branch_weights", i32 6960, i32 21597248} -!12 = !{!"these_are_not_the_branch_weights_you_are_looking_for", i32 3, i32 5} !13 = !{!"branch_weights", i32 2, i32 3} !14 = !{!"branch_weights", i32 4, i32 7} !15 = !{!"branch_weights", i32 99, i32 1} @@ -1136,8 +1102,8 @@ exit: ; CHECK: attributes #[[ATTR2:[0-9]+]] = { noredzone nounwind ssp memory(none) } ;. 
; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 11} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 3} -; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 5} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 5} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 3} ; CHECK: [[PROF3]] = !{!"branch_weights", i32 7, i32 1, i32 2} ; CHECK: [[PROF4]] = !{!"branch_weights", i32 49, i32 12, i32 24, i32 35} ; CHECK: [[PROF5]] = !{!"branch_weights", i32 11, i32 5} diff --git a/llvm/test/Verifier/assume-bundles.ll b/llvm/test/Verifier/assume-bundles.ll index 4b6971d6be832..047e8d9bba3ed 100644 --- a/llvm/test/Verifier/assume-bundles.ll +++ b/llvm/test/Verifier/assume-bundles.ll @@ -30,8 +30,7 @@ define void @func(ptr %P, i32 %P1, ptr %P2, ptr %P3) { call void @llvm.assume(i1 true) ["separate_storage"(ptr %P)] ; CHECK: arguments to separate_storage assumptions should be pointers call void @llvm.assume(i1 true) ["separate_storage"(ptr %P, i32 123)] -; FIXME: The dereferenceable bundle is invalid. 
-; CHECK-NOT: call {{.+}}dereferenceable +; CHECK: this attribute should have 2 arguments call void @llvm.assume(i1 true) ["align"(ptr %P, i32 4), "dereferenceable"(ptr %P)] ret void } diff --git a/llvm/test/Verifier/branch-weight.ll b/llvm/test/Verifier/branch-weight.ll new file mode 100644 index 0000000000000..e3b0f340e31bc --- /dev/null +++ b/llvm/test/Verifier/branch-weight.ll @@ -0,0 +1,39 @@ +; Test MD_prof validation + +; RUN: split-file %s %t +; RUN: opt -passes=verify %t/valid.ll --disable-output +; RUN: not opt -passes=verify %t/invalid1.ll --disable-output 2>&1 | FileCheck %s +; RUN: not opt -passes=verify %t/invalid2.ll --disable-output 2>&1 | FileCheck %s + +;--- valid.ll +define void @test(i1 %0) { + br i1 %0, label %2, label %3, !prof !0 +2: + ret void +3: + ret void +} +!0 = !{!"branch_weights", i32 1, i32 2} + +;--- invalid1.ll +define void @test(i1 %0) { + br i1 %0, label %2, label %3, !prof !0 +2: + ret void +3: + ret void +} +!0 = !{!"invalid", i32 1, i32 2} + +;--- invalid2.ll +define void @test(i1 %0) { + br i1 %0, label %2, label %3, !prof !0 +2: + ret void +3: + ret void +} + +!0 = !{!"function_entry_count", i32 1} + +; CHECK: expected either branch_weights or VP profile name diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index b8779b9d54ea7..f466b1de9fb5a 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -19,11 +19,7 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: st.param.v2.b64 [param0], {%rd2, %rd1}; ; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd4, %rd3}; ; CHECK-NEXT: .param .align 16 .b8 retval0[32]; -; CHECK-NEXT: call.uni (retval0), -; CHECK-NEXT: callee_St8x4, -; CHECK-NEXT: ( -; CHECK-NEXT: param0 
-; CHECK-NEXT: ); +; CHECK-NEXT: call.uni (retval0), callee_St8x4, (param0); ; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0]; ; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16]; ; CHECK-NEXT: } // callseq 0 diff --git a/llvm/test/tools/llvm-objdump/ELF/AArch64/symbolize-operands-executable.yaml b/llvm/test/tools/llvm-objdump/ELF/AArch64/symbolize-operands-executable.yaml new file mode 100644 index 0000000000000..d318ea01b4c30 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AArch64/symbolize-operands-executable.yaml @@ -0,0 +1,67 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objdump %t -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \ +# RUN: FileCheck %s --match-full-lines -DABS_ADRP_VAL=0x6000 +# RUN: llvm-objdump %t -d --symbolize-operands --no-show-raw-insn --no-leading-addr --adjust-vma=0x2000 | \ +# RUN: FileCheck %s --match-full-lines -DABS_ADRP_VAL=0x8000 + +## Expect to find the branch labels and global variable name. +# CHECK: <_start>: +# CHECK-NEXT: ldr x0, +# CHECK-NEXT: : +# CHECK-NEXT: adrp x1, [[ABS_ADRP_VAL]] +# CHECK-NEXT: adr x2, +# CHECK-NEXT: cmp x1, x2 +# CHECK-NEXT: b.eq +# CHECK-NEXT: b +# CHECK-NEXT: : +# CHECK-NEXT: cbz x2, +# CHECK-NEXT: ret + +## Machine code generated with: +# llvm-mc --arch=aarch64 --filetype=obj -o tmp.o <: +# CHECK-NEXT: b +# CHECK-NEXT: tbz x0, #0x2c, +# CHECK-NEXT: : +# CHECK-NEXT: b.eq +# CHECK-NEXT: : +# CHECK-NEXT: cbz x1, +# CHECK-NEXT: : +# CHECK-NEXT: nop +# CHECK-NEXT: : +# CHECK-NEXT: bl +# CHECK-NEXT: R_AARCH64_CALL26 fn2 +# CHECK-NEXT: bl +# CHECK-NEXT: adr x0, +# CHECK-NEXT: : +# CHECK-NEXT: adr x1, +# CHECK-NEXT: R_AARCH64_ADR_PREL_LO21 fn2 +# CHECK-NEXT: adr x2, +# CHECK-NEXT: ldr w0, +# CHECK-NEXT: : +# CHECK-NEXT: ldr w0, +# CHECK-NEXT: R_AARCH64_LD_PREL_LO19 fn2 +# CHECK-NEXT: ret +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-EMPTY: +# CHECK-NEXT: : +# CHECK-NEXT: bl +# CHECK-NEXT: adrp x3, 0x0 +# CHECK-NEXT: R_AARCH64_ADR_PREL_PG_HI21 fn2 +# 
CHECK-NEXT: add x3, x3, #0x0 +# CHECK-NEXT: R_AARCH64_ADD_ABS_LO12_NC fn2 +# CHECK-NEXT: adrp x3, 0x0 +# CHECK-NEXT: R_AARCH64_ADR_PREL_PG_HI21 fn2 +# CHECK-NEXT: ldr x0, [x3] +# CHECK-NEXT: R_AARCH64_LDST64_ABS_LO12_NC fn2 +# CHECK-NEXT: ret +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: : +# CHECK-NEXT: ret + + .p2align 4 + .global fn1 +fn1: + b 0f + tbz x0, 44, 2f +0: b.eq 1f +1: cbz x1, 0b +2: nop + bl fn2 + bl .Lfn2 + adr x0, 2b + adr x1, fn2 + adr x2, .Lfn2 + ldr w0, 2b + ldr w0, fn2 + ret + + .p2align 4 + .global fn2 +fn2: +.Lfn2: ## Local label for non-interposable call. + bl .Lfn3 + ## In future, we might identify the pairs and symbolize the operands properly. + adrp x3, fn2 + add x3, x3, :lo12:fn2 + adrp x3, fn2 + ldr x0, [x3, :lo12:fn2] + ret + + .p2align 4 +.Lfn3: ## Private function + ret diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp index be10c32cf08d5..fb843285ada2a 100644 --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -137,9 +137,9 @@ void Analysis::printInstructionRowCsv(const size_t PointId, std::tie(SchedClassId, std::ignore) = ResolvedSchedClass::resolveSchedClassId( State_.getSubtargetInfo(), State_.getInstrInfo(), MCI); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - const MCSchedClassDesc *const SCDesc = - State_.getSubtargetInfo().getSchedModel().getSchedClassDesc(SchedClassId); - writeEscaped(OS, SCDesc->Name); + StringRef SCDescName = + State_.getSubtargetInfo().getSchedModel().getSchedClassName(SchedClassId); + writeEscaped(OS, SCDescName); #else OS << SchedClassId; #endif @@ -563,7 +563,8 @@ Error Analysis::run( OS << "

Sched Class "; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - writeEscaped(OS, RSCAndPoints.RSC.SCDesc->Name); + writeEscaped(OS, SI.getSchedModel().getSchedClassName( + RSCAndPoints.RSC.SchedClassId)); #else OS << RSCAndPoints.RSC.SchedClassId; #endif diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 5ecb33375943f..c5967cd090eec 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1495,8 +1495,9 @@ collectLocalBranchTargets(ArrayRef Bytes, MCInstrAnalysis *MIA, // Supported by certain targets. const bool isPPC = STI->getTargetTriple().isPPC(); const bool isX86 = STI->getTargetTriple().isX86(); + const bool isAArch64 = STI->getTargetTriple().isAArch64(); const bool isBPF = STI->getTargetTriple().isBPF(); - if (!isPPC && !isX86 && !isBPF) + if (!isPPC && !isX86 && !isAArch64 && !isBPF) return; if (MIA) diff --git a/llvm/unittests/ADT/ArrayRefTest.cpp b/llvm/unittests/ADT/ArrayRefTest.cpp index 39a4a9b6a178c..3858d9064f9ca 100644 --- a/llvm/unittests/ADT/ArrayRefTest.cpp +++ b/llvm/unittests/ADT/ArrayRefTest.cpp @@ -8,10 +8,16 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/raw_ostream.h" #include "gtest/gtest.h" #include #include +#if __has_include() +#include +#endif +#ifdef __cpp_lib_span +#include +#endif + using namespace llvm; // Check that the ArrayRef-of-pointer converting constructor only allows adding @@ -406,4 +412,15 @@ TEST(ArrayRefTest, MutableArrayRefDeductionGuides) { } } +#ifdef __cpp_lib_span +static_assert(std::is_constructible_v, std::span>, + "should be able to construct ArrayRef from const std::span"); +static_assert(std::is_constructible_v, ArrayRef>, + "should be able to construct const std::span from ArrayRef"); +static_assert(std::is_constructible_v, std::span>, + "should be able to construct ArrayRef from mutable std::span"); +static_assert(!std::is_constructible_v, ArrayRef>, + "cannot 
construct mutable std::span from ArrayRef"); +#endif + } // end anonymous namespace diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp index 6eef0b5f91719..aa4d712cde09e 100644 --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -14,6 +14,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -23,7 +24,7 @@ using namespace llvm; using namespace llvm::memprof; -extern cl::opt MemProfKeepAllNotColdContexts; +LLVM_ABI extern cl::opt MemProfKeepAllNotColdContexts; namespace { diff --git a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp index 0c2c06ec75d6e..8a6549b1b594e 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp @@ -19,6 +19,8 @@ #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" +#include "OrcTestCommon.h" + using namespace llvm; using namespace llvm::jitlink; using namespace llvm::orc; diff --git a/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h b/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h index 0981f4b8132bd..6675921c29eb4 100644 --- a/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h +++ b/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h @@ -82,6 +82,47 @@ class CoreAPIsBasedStandardTest : public testing::Test { unique_function)> DispatchOverride; }; +/// A ExecutorProcessControl instance that asserts if any of its methods are +/// used. Suitable for use is unit tests, and by ORC clients who haven't moved +/// to ExecutorProcessControl-based APIs yet. 
+class UnsupportedExecutorProcessControl : public ExecutorProcessControl, + private InProcessMemoryAccess { +public: + UnsupportedExecutorProcessControl( + std::shared_ptr SSP = nullptr, + std::unique_ptr D = nullptr, const std::string &TT = "", + unsigned PageSize = 0) + : ExecutorProcessControl( + SSP ? std::move(SSP) : std::make_shared(), + D ? std::move(D) : std::make_unique()), + InProcessMemoryAccess(Triple(TT).isArch64Bit()) { + this->TargetTriple = Triple(TT); + this->PageSize = PageSize; + this->MemAccess = this; + } + + Expected runAsMain(ExecutorAddr MainFnAddr, + ArrayRef Args) override { + llvm_unreachable("Unsupported"); + } + + Expected runAsVoidFunction(ExecutorAddr VoidFnAddr) override { + llvm_unreachable("Unsupported"); + } + + Expected runAsIntFunction(ExecutorAddr IntFnAddr, int Arg) override { + llvm_unreachable("Unsupported"); + } + + void callWrapperAsync(ExecutorAddr WrapperFnAddr, + IncomingWFRHandler OnComplete, + ArrayRef ArgBuffer) override { + llvm_unreachable("Unsupported"); + } + + Error disconnect() override { return Error::success(); } +}; + } // end namespace orc class OrcNativeTarget { diff --git a/llvm/unittests/Frontend/CMakeLists.txt b/llvm/unittests/Frontend/CMakeLists.txt index 4048143b36819..2412cc9d26c7a 100644 --- a/llvm/unittests/Frontend/CMakeLists.txt +++ b/llvm/unittests/Frontend/CMakeLists.txt @@ -19,6 +19,7 @@ add_llvm_unittest(LLVMFrontendTests OpenMPParsingTest.cpp OpenMPCompositionTest.cpp OpenMPDecompositionTest.cpp + OpenMPDirectiveNameTest.cpp DEPENDS acc_gen diff --git a/llvm/unittests/Frontend/OpenMPDirectiveNameTest.cpp b/llvm/unittests/Frontend/OpenMPDirectiveNameTest.cpp new file mode 100644 index 0000000000000..da648157ee9b6 --- /dev/null +++ b/llvm/unittests/Frontend/OpenMPDirectiveNameTest.cpp @@ -0,0 +1,96 @@ +//===- llvm/unittests/Frontend/OpenMPDirectiveNameTest.cpp ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Frontend/OpenMP/OMP.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::omp; + +const DenseMap &Expected52() { + static const DenseMap Names{ + {OMPD_begin_declare_target, "begin declare target"}, + {OMPD_begin_declare_variant, "begin declare variant"}, + {OMPD_cancellation_point, "cancellation point"}, + {OMPD_declare_mapper, "declare mapper"}, + {OMPD_declare_reduction, "declare reduction"}, + {OMPD_declare_simd, "declare simd"}, + {OMPD_declare_target, "declare target"}, + {OMPD_declare_variant, "declare variant"}, + {OMPD_end_declare_target, "end declare target"}, + {OMPD_end_declare_variant, "end declare variant"}, + {OMPD_target_data, "target data"}, + {OMPD_target_enter_data, "target enter data"}, + {OMPD_target_exit_data, "target exit data"}, + {OMPD_target_update, "target update"}, + }; + return Names; +} + +const DenseMap &Expected60() { + static const DenseMap Names{ + {OMPD_begin_declare_target, "begin declare_target"}, + {OMPD_begin_declare_variant, "begin declare_variant"}, + {OMPD_cancellation_point, "cancellation_point"}, + {OMPD_declare_mapper, "declare_mapper"}, + {OMPD_declare_reduction, "declare_reduction"}, + {OMPD_declare_simd, "declare_simd"}, + {OMPD_declare_target, "declare_target"}, + {OMPD_declare_variant, "declare_variant"}, + {OMPD_end_declare_target, "end declare_target"}, + {OMPD_end_declare_variant, "end declare_variant"}, + {OMPD_target_data, "target_data"}, + {OMPD_target_enter_data, "target_enter_data"}, + {OMPD_target_exit_data, "target_exit_data"}, + {OMPD_target_update, "target_update"}, + }; + return Names; +} + +class VersionTest : public testing::TestWithParam { +public: + void SetUp() override { + Version = 
GetParam(); + + if (Version < 60) + KindToName = &Expected52(); + else + KindToName = &Expected60(); + } + + const DenseMap *KindToName; + unsigned Version; +}; + +INSTANTIATE_TEST_SUITE_P(OpenMPDirectiveNames, VersionTest, + testing::ValuesIn(getOpenMPVersions())); + +TEST_P(VersionTest, DirectiveName) { + for (auto [Kind, Name] : *KindToName) + ASSERT_EQ(Name, getOpenMPDirectiveName(Kind, Version)); +} + +TEST(OpenMPDirectiveNames, DirectiveKind52) { + for (auto [Kind, Name] : Expected52()) { + auto [K, R] = getOpenMPDirectiveKindAndVersions(Name); + ASSERT_EQ(K, Kind); + // Expect the name to be valid in 5.2, but not in 6.0. + EXPECT_TRUE(52 <= R.Max && R.Max < 60); + } +} + +TEST(OpenMPDirectiveNames, DirectiveKind60) { + for (auto [Kind, Name] : Expected60()) { + auto [K, R] = getOpenMPDirectiveKindAndVersions(Name); + ASSERT_EQ(K, Kind); + // Expect the name to be valid in 6.0 and later. + EXPECT_TRUE(60 <= R.Min); + } +} diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index d7aa584bb8cb4..35bdbf8cc8321 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -1301,4 +1301,33 @@ TEST(DIBuilder, CompositeTypes) { EXPECT_EQ(Enum->getTag(), dwarf::DW_TAG_enumeration_type); } +TEST(DIBuilder, DynamicOffsetAndSize) { + LLVMContext Ctx; + auto M = std::make_unique("MyModule", Ctx); + DIBuilder DIB(*M); + DIScope *Scope = DISubprogram::getDistinct( + Ctx, nullptr, "", "", nullptr, 0, nullptr, 0, nullptr, 0, 0, + DINode::FlagZero, DISubprogram::SPFlagZero, nullptr); + DIFile *F = DIB.createFile("main.adb", "/"); + + DIVariable *Len = DIB.createAutoVariable(Scope, "length", F, 0, nullptr, + false, DINode::FlagZero, 0); + + DICompositeType *Struct = DIB.createStructType( + Scope, "some_record", F, 18, Len, 8, DINode::FlagZero, nullptr, {}); + EXPECT_EQ(Struct->getTag(), dwarf::DW_TAG_structure_type); + + SmallVector ops; + ops.push_back(llvm::dwarf::DW_OP_push_object_address); + 
DIExpression::appendOffset(ops, 3); + ops.push_back(llvm::dwarf::DW_OP_deref); + DIExpression *Expr = DIB.createExpression(ops); + + DIDerivedType *Field = DIB.createMemberType(Scope, "field", F, 23, Len, 0, + Expr, DINode::FlagZero, Struct); + + EXPECT_EQ(Field->getRawOffsetInBits(), Expr); + EXPECT_EQ(Field->getRawSizeInBits(), Len); +} + } // end namespace diff --git a/llvm/unittests/IR/DebugTypeODRUniquingTest.cpp b/llvm/unittests/IR/DebugTypeODRUniquingTest.cpp index e1ce671852c8b..6716796e71c5a 100644 --- a/llvm/unittests/IR/DebugTypeODRUniquingTest.cpp +++ b/llvm/unittests/IR/DebugTypeODRUniquingTest.cpp @@ -141,12 +141,12 @@ TEST(DebugTypeODRUniquingTest, buildODRTypeFields) { DO_FOR_FIELD(BaseType) \ DO_FOR_FIELD(Elements) \ DO_FOR_FIELD(VTableHolder) \ - DO_FOR_FIELD(TemplateParams) + DO_FOR_FIELD(TemplateParams) \ + DO_FOR_FIELD(SizeInBits) \ + DO_FOR_FIELD(OffsetInBits) #define FOR_EACH_INLINEFIELD() \ DO_FOR_FIELD(Line) \ - DO_FOR_FIELD(SizeInBits) \ DO_FOR_FIELD(AlignInBits) \ - DO_FOR_FIELD(OffsetInBits) \ DO_FOR_FIELD(NumExtraInhabitants) \ DO_FOR_FIELD(RuntimeLang) \ DO_FOR_FIELD(EnumKind) diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 0f6d07657c931..4d547011c1568 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -2281,6 +2281,44 @@ TEST(TripleTest, XROS) { EXPECT_EQ(VersionTuple(17), Version); } +TEST(TripleTest, BridgeOS) { + Triple T; + VersionTuple Version; + + T = Triple("arm64-apple-bridgeos"); + EXPECT_TRUE(T.isBridgeOS()); + EXPECT_FALSE(T.isXROS()); + EXPECT_TRUE(T.isOSDarwin()); + EXPECT_FALSE(T.isiOS()); + EXPECT_FALSE(T.isMacOSX()); + EXPECT_FALSE(T.isSimulatorEnvironment()); + EXPECT_EQ(T.getOSName(), "bridgeos"); + Version = T.getOSVersion(); + EXPECT_EQ(VersionTuple(0), Version); + + T = Triple("arm64-apple-bridgeos1.0"); + EXPECT_TRUE(T.isBridgeOS()); + EXPECT_FALSE(T.isXROS()); + EXPECT_TRUE(T.isOSDarwin()); + 
EXPECT_FALSE(T.isiOS()); + EXPECT_FALSE(T.isMacOSX()); + EXPECT_FALSE(T.isSimulatorEnvironment()); + EXPECT_EQ(T.getOSName(), "bridgeos1.0"); + Version = T.getOSVersion(); + EXPECT_EQ(VersionTuple(1), Version); + + T = Triple("arm64-apple-bridgeos9.0"); + EXPECT_TRUE(T.isBridgeOS()); + EXPECT_FALSE(T.isXROS()); + EXPECT_TRUE(T.isOSDarwin()); + EXPECT_FALSE(T.isiOS()); + EXPECT_FALSE(T.isMacOSX()); + EXPECT_FALSE(T.isSimulatorEnvironment()); + EXPECT_EQ(T.getOSName(), "bridgeos9.0"); + Version = T.getOSVersion(); + EXPECT_EQ(VersionTuple(9), Version); +} + TEST(TripleTest, getOSVersion) { Triple T; VersionTuple Version; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 77618b24cf115..408fe544d260f 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -27,6 +27,8 @@ #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/StringToOffsetTable.h" #include "llvm/TableGen/TableGenBackend.h" #include #include @@ -1380,6 +1382,10 @@ void SubtargetEmitter::emitSchedClassTables(SchedClassTables &SchedTables, } OS << "}; // " << Target << "ReadAdvanceTable\n"; + // Pool all SchedClass names in a string table. + StringToOffsetTable StrTab; + unsigned InvalidNameOff = StrTab.GetOrAddStringOffset("InvalidSchedClass"); + // Emit a SchedClass table for each processor. for (const auto &[Idx, Proc] : enumerate(SchedModels.procModels())) { if (!Proc.hasInstrSchedModel()) @@ -1397,14 +1403,15 @@ void SubtargetEmitter::emitSchedClassTables(SchedClassTables &SchedTables, // name and position. 
assert(SchedModels.getSchedClass(0).Name == "NoInstrModel" && "invalid class not first"); - OS << " {DBGFIELD(\"InvalidSchedClass\") " + OS << " {DBGFIELD(" << InvalidNameOff << ") " << MCSchedClassDesc::InvalidNumMicroOps << ", false, false, false, 0, 0, 0, 0, 0, 0},\n"; for (unsigned SCIdx = 1, SCEnd = SCTab.size(); SCIdx != SCEnd; ++SCIdx) { MCSchedClassDesc &MCDesc = SCTab[SCIdx]; const CodeGenSchedClass &SchedClass = SchedModels.getSchedClass(SCIdx); - OS << " {DBGFIELD(\"" << SchedClass.Name << "\") "; + unsigned NameOff = StrTab.GetOrAddStringOffset(SchedClass.Name); + OS << " {DBGFIELD(/*" << SchedClass.Name << "*/ " << NameOff << ") "; if (SchedClass.Name.size() < 18) OS.indent(18 - SchedClass.Name.size()); OS << MCDesc.NumMicroOps << ", " << (MCDesc.BeginGroup ? "true" : "false") @@ -1419,6 +1426,8 @@ void SubtargetEmitter::emitSchedClassTables(SchedClassTables &SchedTables, } OS << "}; // " << Proc.ModelName << "SchedClasses\n"; } + + StrTab.EmitStringTableDef(OS, Target + "SchedClassNames"); } void SubtargetEmitter::emitProcessorModels(raw_ostream &OS) { @@ -1472,6 +1481,8 @@ void SubtargetEmitter::emitProcessorModels(raw_ostream &OS) { else OS << " nullptr, nullptr, 0, 0," << " // No instruction-level machine model.\n"; + OS << " DBGVAL_OR_NULLPTR(&" << Target + << "SchedClassNames), // SchedClassNames\n"; if (PM.hasItineraries()) OS << " " << PM.ItinsDef->getName() << ",\n"; else @@ -1493,8 +1504,10 @@ void SubtargetEmitter::emitSchedModel(raw_ostream &OS) { << "#endif\n" << "#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)\n" << "#define DBGFIELD(x) x,\n" + << "#define DBGVAL_OR_NULLPTR(x) x\n" << "#else\n" << "#define DBGFIELD(x)\n" + << "#define DBGVAL_OR_NULLPTR(x) nullptr\n" << "#endif\n"; if (SchedModels.hasItineraries()) { @@ -1512,10 +1525,11 @@ void SubtargetEmitter::emitSchedModel(raw_ostream &OS) { } emitSchedClassTables(SchedTables, OS); - OS << "\n#undef DBGFIELD\n"; - // Emit the processor machine model emitProcessorModels(OS); + + OS 
<< "\n#undef DBGFIELD\n"; + OS << "\n#undef DBGVAL_OR_NULLPTR\n"; } static void emitPredicateProlog(const RecordKeeper &Records, raw_ostream &OS) { diff --git a/mlir/include/mlir/Dialect/ArmNeon/Transforms.h b/mlir/include/mlir/Dialect/ArmNeon/Transforms.h index 52ebea2d0ffd9..2f0f634a96770 100644 --- a/mlir/include/mlir/Dialect/ArmNeon/Transforms.h +++ b/mlir/include/mlir/Dialect/ArmNeon/Transforms.h @@ -13,7 +13,7 @@ namespace mlir { class RewritePatternSet; namespace arm_neon { -void populateLowerContractionToSMMLAPatternPatterns( +void populateLowerContractionToNeonI8MMPatternPatterns( RewritePatternSet &patterns); } // namespace arm_neon diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 418931b931265..6895e946b8a45 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -965,19 +965,21 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync"> let summary = "Elect one leader thread"; let description = [{ The `elect.sync` instruction elects one predicated active leader - thread from among a set of threads specified in membermask. - The membermask is set to `0xFFFFFFFF` for the current version - of this Op. The predicate result is set to `True` for the - leader thread, and `False` for all other threads. + thread from among a set of threads specified in the `membermask`. + When the `membermask` is not provided explicitly, a default value + of `0xFFFFFFFF` is used. The predicate result is set to `True` for + the leader thread, and `False` for all other threads. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync) }]; + let arguments = (ins Optional:$membermask); let results = (outs I1:$pred); - let assemblyFormat = "attr-dict `->` type(results)"; + let assemblyFormat = "($membermask^)? 
attr-dict `->` type(results)"; string llvmBuilder = [{ auto *resultTuple = createIntrinsicCall(builder, - llvm::Intrinsic::nvvm_elect_sync, {builder.getInt32(0xFFFFFFFF)}); + llvm::Intrinsic::nvvm_elect_sync, + {$membermask ? $membermask : builder.getInt32(0xFFFFFFFF)}); // Extract the second value into $pred $pred = builder.CreateExtractValue(resultTuple, 1); }]; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h b/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h index 57bf6305a469d..4f5fea107f07b 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h @@ -100,6 +100,20 @@ OpFoldResult createFoldedDimOp(OpBuilder &b, Location loc, Value val, #include "mlir/Dialect/Linalg/IR/LinalgOpsEnums.h.inc" +namespace mlir { +namespace linalg { + +/// Converts the given `m` and `r` parameters to a WinogradConv2DFmr enumeration +/// value. +std::optional getWinogradConv2DFmr(int64_t m, int64_t r); + +/// Converts the given WinogradConv2DFmr enumeration value to a pair of +/// m and r parameters. +std::pair getFmrFromWinogradConv2DFmr(WinogradConv2DFmr fmr); + +} // namespace linalg +} // namespace mlir + //===----------------------------------------------------------------------===// // Linalg Attributes //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td index ce68afe471fe8..1109db973f522 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td @@ -122,4 +122,19 @@ def TypeFn : I32EnumAttr<"TypeFn", "", [ let cppNamespace = "::mlir::linalg"; } +/// We use F(m, r) to define the size of minimal filtering algorithms. +/// m is the output dimension and r is the filter dimension. We can get +/// the input dimension, alpha, from the formula, alpha = m + r - 1. 
+/// +/// For example, when m = 2 and r = 3, we know its input size is 4. +/// The Conv2D will operate on 4x4 input data with 3x3 filter and get +/// 2x2 output result. +def WinogradConv2DFmr : I32EnumAttr<"WinogradConv2DFmr", "", [ + I32EnumAttrCase<"F_2_3", 0>, + I32EnumAttrCase<"F_4_3", 1>, + I32EnumAttrCase<"F_2_5", 2>, +]>{ + let cppNamespace = "mlir::linalg"; +} + #endif // LINALG_ENUMS diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index 1b48bf5fcb237..7ff44c2e1d2ed 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -183,15 +183,13 @@ def Linalg_WinogradFilterTransformOp : Linalg_Op<"winograd_filter_transform", let arguments = (ins TensorRankOf<[AnyType], [4]>:$filter, TensorRankOf<[AnyType], [4]>:$output, - I64Attr:$m, - I64Attr:$r + WinogradConv2DFmr:$fmr ); let results = (outs TensorRankOf<[AnyType], [4]>:$result); let assemblyFormat = [{ attr-dict - `m` `(` $m `)` - `r` `(` $r `)` + `fmr` `(` $fmr `)` `ins` `(` $filter `:` type($filter) `)` `outs` `(` $output `:` type($output) `)` `->` type($result) @@ -254,15 +252,13 @@ def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform", let arguments = (ins TensorRankOf<[AnyType], [4]>:$input, TensorRankOf<[AnyType], [6]>:$output, - I64Attr:$m, - I64Attr:$r + WinogradConv2DFmr:$fmr ); let results = (outs TensorRankOf<[AnyType], [6]>:$result); let assemblyFormat = [{ attr-dict - `m` `(` $m `)` - `r` `(` $r `)` + `fmr` `(` $fmr `)` `ins` `(` $input `:` type($input) `)` `outs` `(` $output `:` type($output) `)` `->` type($result) @@ -343,15 +339,13 @@ def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform", let arguments = (ins TensorRankOf<[AnyType], [6]>:$value, TensorRankOf<[AnyType], [4]>:$output, - I64Attr:$m, - I64Attr:$r + WinogradConv2DFmr:$fmr ); let results = (outs TensorRankOf<[AnyType], [4]>:$result); let assemblyFormat = [{ attr-dict 
- `m` `(` $m `)` - `r` `(` $r `)` + `fmr` `(` $fmr `)` `ins` `(` $value `:` type($value) `)` `outs` `(` $output `:` type($output) `)` `->` type($result) diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index 9d6ce653e285c..d64f94a49f781 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -9,6 +9,7 @@ #ifndef LINALG_TRANSFORM_OPS #define LINALG_TRANSFORM_OPS +include "mlir/Dialect/Linalg/IR/LinalgEnums.td" include "mlir/Dialect/Linalg/TransformOps/LinalgTransformEnums.td" include "mlir/Dialect/Transform/IR/TransformAttrs.td" include "mlir/Dialect/Transform/IR/TransformDialect.td" @@ -2902,8 +2903,7 @@ def WinogradConv2DOp : Op transposeBatchMatmul(RewriterBase &rewriter, /// F(m x m, r x r). m is the dimension size of output and r is the dimension /// size of filter. FailureOr winogradConv2D(RewriterBase &rewriter, - linalg::Conv2DNhwcFhwcOp op, int64_t m, - int64_t r); + linalg::Conv2DNhwcFhwcOp op, + WinogradConv2DFmr fmr); /// Rewrite linalg.winograd_filter_transform. The data layout of the filter is /// FHWC. The transformation matrix is 2-dimension. We need to extract H x W @@ -1968,8 +1969,8 @@ void populateBlockPackMatmulPatterns(RewritePatternSet &patterns, const ControlBlockPackMatmulFn &controlFn); /// Patterns to apply Winograd Conv2D algorithm F(m x m, r x r). -void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m, - int64_t r); +void populateWinogradConv2DPatterns(RewritePatternSet &patterns, + WinogradConv2DFmr fmr); /// Patterns to decompose Winograd operators. 
void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns); diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h index 7b43aa43c7517..3205da6e448fc 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h @@ -313,19 +313,23 @@ tileConsumerAndFuseProducersUsingSCF(RewriterBase &rewriter, TilingInterface consumer, const SCFTileAndFuseOptions &options); -/// Fuse the consumer of the source of `candidateSliceOp` by computing the -/// required slice of the consumer in-place. Note that the method -/// replaces the uses of `candidateSliceOp` with the tiled and fused consumer -/// value but does not delete the slice operation. +/// Fuse the consumer `candidateSlices` by computing the required slice of the +/// consumer in-place. All the entries of `candidateSlices` are expected to map +/// to the same consumer. The method returns an error if the consumer cannot be +/// tiled in a manner that is consistent for all the passed slices. Note that +/// the method replaces the uses of `candidateSlices` with the tiled and fused +/// consumer value but does not delete the slice operations. struct SCFFuseConsumerOfSliceResult { - OpOperand *origConsumerOperand; // Original untiled consumer's operand. - OpOperand - *tiledAndFusedConsumerOperand; // Tiled and fused consumer's operand. + // Original untiled consumer operands. + SmallVector origConsumerOperands; + // Tiled and fused consumer operands. + SmallVector tiledAndFusedConsumerOperands; SmallVector tiledOps; }; FailureOr -tileAndFuseConsumerOfSlice(RewriterBase &rewriter, Operation *candidateSliceOp, - MutableArrayRef loops); +tileAndFuseConsumerOfSlices(RewriterBase &rewriter, + ArrayRef candidateSlices, + MutableArrayRef loops); /// Method to lower an `op` that implements the `TilingInterface` to /// loops/scalars. 
diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h index 18981337742eb..87deef9ca7466 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h @@ -31,12 +31,16 @@ namespace tensor { FailureOr replaceExtractSliceWithTiledProducer( OpBuilder &builder, tensor::ExtractSliceOp sliceOp, OpResult producerOp); -/// Method to swap an `tensor.insert_slice` with its consumer when the -/// consumer implements the `TilingInterface`. +/// Method to swap `tensor.insert_slice`s with their consumers when the +/// consumer implements the `TilingInterface`. The size of `sliceOps` and +/// `consumerOperands` is expected to be the same. Every entry in +/// `consumerOperands` represents a use of the corresponding +/// entry in `sliceOps` in the consumer. All entries of `consumerOperands` are +/// expected to be uses in the same consumer. FailureOr -replaceInsertSliceWithTiledConsumer(OpBuilder &builder, - OffsetSizeAndStrideOpInterface sliceOp, - OpOperand &consumerOp); +replaceInsertSlicesWithTiledConsumer(OpBuilder &builder, + ArrayRef sliceOps, + ArrayRef consumerOperands); //===----------------------------------------------------------------------===// // Populate functions. 
diff --git a/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td b/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td index 0275f241fda35..4a6898e36d343 100644 --- a/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td +++ b/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td @@ -20,7 +20,7 @@ include "mlir/Dialect/Transform/Interfaces/MatchInterfaces.td" include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td" include "mlir/Dialect/Transform/IR/TransformDialect.td" -def DebugEmitRemarkAtOp : TransformDialectOp<"debug.emit_remark_at", +def EmitRemarkAtOp : TransformDialectOp<"debug.emit_remark_at", [MatchOpInterface, DeclareOpInterfaceMethods, MemoryEffectsOpInterface, NavigationTransformOpTrait]> { @@ -39,7 +39,7 @@ def DebugEmitRemarkAtOp : TransformDialectOp<"debug.emit_remark_at", let assemblyFormat = "$at `,` $message attr-dict `:` type($at)"; } -def DebugEmitParamAsRemarkOp +def EmitParamAsRemarkOp : TransformDialectOp<"debug.emit_param_as_remark", [MatchOpInterface, DeclareOpInterfaceMethods, diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 02e62930a742d..d58ee84bee63d 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -2920,8 +2920,8 @@ def Vector_SplatOp : Vector_Op<"splat", [ ]> { let summary = "vector splat or broadcast operation"; let description = [{ - Broadcast the operand to all elements of the result vector. The operand is - required to be of integer/index/float type. + Broadcast the operand to all elements of the result vector. The type of the + operand must match the element type of the vector type. 
Example: @@ -2931,8 +2931,7 @@ def Vector_SplatOp : Vector_Op<"splat", [ ``` }]; - let arguments = (ins AnyTypeOf<[AnySignlessInteger, Index, AnyFloat], - "integer/index/float type">:$input); + let arguments = (ins AnyType:$input); let results = (outs AnyVectorOfAnyRank:$aggregate); let builders = [ diff --git a/mlir/include/mlir/IR/EnumAttr.td b/mlir/include/mlir/IR/EnumAttr.td index 3f7f747ac20d3..ff6cec6d41161 100644 --- a/mlir/include/mlir/IR/EnumAttr.td +++ b/mlir/include/mlir/IR/EnumAttr.td @@ -39,8 +39,11 @@ class EnumCase { class IntEnumAttrCaseBase : EnumCase, SignlessIntegerAttrBase { - let predicate = - CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getInt() == " # intVal>; + let predicate = CPred<[{ + ::llvm::cast<::mlir::IntegerAttr>($_self).getValue().eq(::llvm::APInt(}] + # intType.bitwidth # ", " + # intVal # + "))">; } // Cases of integer enums with a specific type. By default, the string diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h index 31f54413a5ff0..663c256c848df 100644 --- a/mlir/include/mlir/IR/OpDefinition.h +++ b/mlir/include/mlir/IR/OpDefinition.h @@ -272,7 +272,7 @@ class OpFoldResult : public PointerUnion { using PointerUnion::PointerUnion; public: - void dump() const { llvm::errs() << *this << "\n"; } + LLVM_DUMP_METHOD void dump() const { llvm::errs() << *this << "\n"; } MLIRContext *getContext() const { PointerUnion pu = *this; diff --git a/mlir/include/mlir/IR/Visitors.h b/mlir/include/mlir/IR/Visitors.h index 15abf2559e5c4..893f66ae33deb 100644 --- a/mlir/include/mlir/IR/Visitors.h +++ b/mlir/include/mlir/IR/Visitors.h @@ -14,6 +14,7 @@ #define MLIR_IR_VISITORS_H #include "mlir/Support/LLVM.h" +#include "mlir/Support/WalkResult.h" #include "llvm/ADT/STLExtras.h" namespace mlir { @@ -23,41 +24,6 @@ class Operation; class Block; class Region; -/// A utility result that is used to signal how to proceed with an ongoing walk: -/// * Interrupt: the walk will be interrupted and no more 
operations, regions -/// or blocks will be visited. -/// * Advance: the walk will continue. -/// * Skip: the walk of the current operation, region or block and their -/// nested elements that haven't been visited already will be skipped and will -/// continue with the next operation, region or block. -class WalkResult { - enum ResultEnum { Interrupt, Advance, Skip } result; - -public: - WalkResult(ResultEnum result = Advance) : result(result) {} - - /// Allow LogicalResult to interrupt the walk on failure. - WalkResult(LogicalResult result) - : result(failed(result) ? Interrupt : Advance) {} - - /// Allow diagnostics to interrupt the walk. - WalkResult(Diagnostic &&) : result(Interrupt) {} - WalkResult(InFlightDiagnostic &&) : result(Interrupt) {} - - bool operator==(const WalkResult &rhs) const { return result == rhs.result; } - bool operator!=(const WalkResult &rhs) const { return result != rhs.result; } - - static WalkResult interrupt() { return {Interrupt}; } - static WalkResult advance() { return {Advance}; } - static WalkResult skip() { return {Skip}; } - - /// Returns true if the walk was interrupted. - bool wasInterrupted() const { return result == Interrupt; } - - /// Returns true if the walk was skipped. - bool wasSkipped() const { return result == Skip; } -}; - /// Traversal order for region, block and operation walk utilities. enum class WalkOrder { PreOrder, PostOrder }; diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td index 0de37338c95e4..0c0fc88aec95a 100644 --- a/mlir/include/mlir/Interfaces/TilingInterface.td +++ b/mlir/include/mlir/Interfaces/TilingInterface.td @@ -202,28 +202,28 @@ def TilingInterface : OpInterface<"TilingInterface"> { InterfaceMethod< /*desc=*/[{ Method to generate the tiled implementation of an operation that uses - exactly a tile of the given operand. + the exact tiles of the given operands. 
This method is required to allow operations to be "tiled and fused" - with an (already tiled) producer. Given a tile of the producer, this - method generates the tile of the consumer that uses exactly this - produced tile. In some sense it is the "reverse" of + with an (already tiled) producer. Given tiles of the producer, this + method generates the tile of the consumer that uses exactly these + produced tiles. In some sense it is the "reverse" of `generateResultTileValue`. - - `operandNumber` is the result of the producer used by the consumer. - - `offsets` is the offset of the slice of the producer result used by - the tiled implementation of the consumer. - - `sizes` is the size of the slice of the producer result used by the + - `operandNumbers` is the list of operands whose tiles are "producers". + - `allOffsets` is the offset of the slice of the producer used by the + tiled implementation of the consumer. + - `allSizes` is the size of the slice of the producer used by the consumer. - If it is illegal to fuse with a producer along the given operand for + If it is illegal to fuse with a producer along the given operand tiles for an operation, the implementation should return a failure. }], /*retType=*/"::mlir::FailureOr<::mlir::TilingResult>", - /*methodName=*/"getTiledImplementationFromOperandTile", + /*methodName=*/"getTiledImplementationFromOperandTiles", /*args=*/(ins "::mlir::OpBuilder &":$b, - "unsigned":$operandNumber, - "::mlir::ArrayRef<::mlir::OpFoldResult>":$offsets, - "::mlir::ArrayRef<::mlir::OpFoldResult>":$sizes), + "::mlir::ArrayRef":$operandNumbers, + "::mlir::ArrayRef<::mlir::SmallVector<::mlir::OpFoldResult>>":$allOffsets, + "::mlir::ArrayRef<::mlir::SmallVector<::mlir::OpFoldResult>>":$allSizes), /*methodBody=*/"", /*defaultImplementation=*/[{ return failure(); @@ -235,16 +235,17 @@ def TilingInterface : OpInterface<"TilingInterface"> { tile of the operand. 
This method is required to allow operations to be "tiled and fused" - with an (already tiled) producer. Given a tile of an operand, - returns the tile of the iteration space that uses this tile. - - `operandNumber` is the result of the producer used by the consumer. - - `offsets` is the offset of the slice of the producer result used by + with an (already tiled) producer. Given tiles of operands, + returns the tile of the iteration space that uses these tiles. + - `operandNumbers` is the list of operands whose tiles are "produced" + by the producer(s). + - `allOffsets` is the offset of the slice of the producers used by the tiled implementation of the consumer. - - `sizes` is the size of the slice of the producer result used by the + - `allSizes` is the size of the slice of the producers used by the consumer. - If it is illegal to fuse with a producer along the given operand for - an operation, or if this mapping cannot be computed, the - implementation should return a failure. + If it is illegal to fuse with the producer slices for an operation, + or if this mapping cannot be computed, the implementation should + return a failure. Note that unlike the "tile consumer and fuse producer" case, the "tile producer and fuse consumer" requires an additional method to get @@ -285,17 +286,17 @@ def TilingInterface : OpInterface<"TilingInterface"> { transformation. It does not provide guarantees on whether such a transformation is profitable. - For most cases `getTiledImplementationFromOperandTile` could be a - implemented using `getIterationDomainTileFromOperandTile` + + For most cases `getTiledImplementationFromOperandTiles` could be a + implemented using `getIterationDomainTileFromOperandTiles` + `getTiledImplementation` methods. 
}], /*retType=*/"::llvm::LogicalResult", - /*methodName=*/"getIterationDomainTileFromOperandTile", + /*methodName=*/"getIterationDomainTileFromOperandTiles", /*args=*/(ins "::mlir::OpBuilder &":$b, - "unsigned":$operandNumber, - "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets, - "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes, + "::mlir::ArrayRef":$operandNumbers, + "::mlir::ArrayRef<::mlir::SmallVector<::mlir::OpFoldResult>> ":$allOffsets, + "::mlir::ArrayRef<::mlir::SmallVector<::mlir::OpFoldResult>> ":$allSizes, "::mlir::SmallVectorImpl<::mlir::OpFoldResult> &":$iterDomainOffsets, "::mlir::SmallVectorImpl<::mlir::OpFoldResult> &":$iterDomainSizes), /*methodBody=*/"", diff --git a/mlir/include/mlir/IR/StateStack.h b/mlir/include/mlir/Support/StateStack.h similarity index 96% rename from mlir/include/mlir/IR/StateStack.h rename to mlir/include/mlir/Support/StateStack.h index 6a22e3b0d00a4..ef0f5d198b456 100644 --- a/mlir/include/mlir/IR/StateStack.h +++ b/mlir/include/mlir/Support/StateStack.h @@ -12,11 +12,11 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_IR_STACKFRAME_H -#define MLIR_IR_STACKFRAME_H +#ifndef MLIR_SUPPORT_STACKFRAME_H +#define MLIR_SUPPORT_STACKFRAME_H -#include "mlir/IR/Visitors.h" #include "mlir/Support/TypeID.h" +#include "mlir/Support/WalkResult.h" #include namespace mlir { @@ -125,4 +125,4 @@ struct isa_impl { }; } // namespace llvm -#endif // MLIR_IR_STACKFRAME_H +#endif // MLIR_SUPPORT_STACKFRAME_H diff --git a/mlir/include/mlir/Support/WalkResult.h b/mlir/include/mlir/Support/WalkResult.h new file mode 100644 index 0000000000000..cd3b1e1562796 --- /dev/null +++ b/mlir/include/mlir/Support/WalkResult.h @@ -0,0 +1,59 @@ +//===- WalkResult.h - Status of completed walk ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Result kind for completed walk. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_SUPPORT_WALKRESULT_H +#define MLIR_SUPPORT_WALKRESULT_H + +#include "mlir/Support/LLVM.h" + +namespace mlir { +class Diagnostic; +class InFlightDiagnostic; + +/// A utility result that is used to signal how to proceed with an ongoing walk: +/// * Interrupt: the walk will be interrupted and no more operations, regions +/// or blocks will be visited. +/// * Advance: the walk will continue. +/// * Skip: the walk of the current operation, region or block and their +/// nested elements that haven't been visited already will be skipped and will +/// continue with the next operation, region or block. +class WalkResult { + enum ResultEnum { Interrupt, Advance, Skip } result; + +public: + WalkResult(ResultEnum result = Advance) : result(result) {} + + /// Allow LogicalResult to interrupt the walk on failure. + WalkResult(LogicalResult result) + : result(failed(result) ? Interrupt : Advance) {} + + /// Allow diagnostics to interrupt the walk. + WalkResult(Diagnostic &&) : result(Interrupt) {} + WalkResult(InFlightDiagnostic &&) : result(Interrupt) {} + + bool operator==(const WalkResult &rhs) const { return result == rhs.result; } + bool operator!=(const WalkResult &rhs) const { return result != rhs.result; } + + static WalkResult interrupt() { return {Interrupt}; } + static WalkResult advance() { return {Advance}; } + static WalkResult skip() { return {Skip}; } + + /// Returns true if the walk was interrupted. + bool wasInterrupted() const { return result == Interrupt; } + + /// Returns true if the walk was skipped. 
+ bool wasSkipped() const { return result == Skip; } +}; + +} // namespace mlir + +#endif diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 197be5f30b5b0..79e8bb6add0da 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -16,9 +16,9 @@ #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h" #include "mlir/IR/Operation.h" -#include "mlir/IR/StateStack.h" #include "mlir/IR/SymbolTable.h" #include "mlir/IR/Value.h" +#include "mlir/Support/StateStack.h" #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" #include "mlir/Target/LLVMIR/TypeToLLVM.h" diff --git a/mlir/lib/Conversion/PDLToPDLInterp/PDLToPDLInterp.cpp b/mlir/lib/Conversion/PDLToPDLInterp/PDLToPDLInterp.cpp index bd2846ac388fd..945d38e929e08 100644 --- a/mlir/lib/Conversion/PDLToPDLInterp/PDLToPDLInterp.cpp +++ b/mlir/lib/Conversion/PDLToPDLInterp/PDLToPDLInterp.cpp @@ -991,7 +991,7 @@ void PDLToPDLInterpPass::runOnOperation() { module.getLoc(), pdl_interp::PDLInterpDialect::getMatcherFunctionName(), builder.getFunctionType(builder.getType(), /*results=*/{}), - /*attrs=*/std::nullopt); + /*attrs=*/ArrayRef()); // Create a nested module to hold the functions invoked for rewriting the IR // after a successful match. 
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 923f5f67b865a..c2be08ef40f21 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -310,8 +310,8 @@ static Value createLinalgBodyCalculationForElementwiseOp( auto shifted = rewriter.create(loc, resultTypes, args[0], subtract) ->getResults(); - auto truncated = - rewriter.create(loc, i1Ty, shifted, std::nullopt); + auto truncated = rewriter.create( + loc, i1Ty, shifted, ArrayRef()); auto isInputOdd = rewriter.create(loc, i1Ty, truncated, i1one); @@ -552,20 +552,20 @@ static Value createLinalgBodyCalculationForElementwiseOp( if (isa(srcTy) && isa(dstTy) && bitExtend) return rewriter.create(loc, resultTypes, args, - std::nullopt); + ArrayRef()); if (isa(srcTy) && isa(dstTy) && !bitExtend) return rewriter.create(loc, resultTypes, args, - std::nullopt); + ArrayRef()); // 1-bit integers need to be treated as signless. if (srcTy.isInteger(1) && arith::UIToFPOp::areCastCompatible(srcTy, dstTy)) return rewriter.create(loc, resultTypes, args, - std::nullopt); + ArrayRef()); if (srcTy.isInteger(1) && isa(dstTy) && bitExtend) return rewriter.create(loc, resultTypes, args, - std::nullopt); + ArrayRef()); // Unsigned integers need an unrealized cast so that they can be passed // to UIToFP. @@ -583,7 +583,7 @@ static Value createLinalgBodyCalculationForElementwiseOp( // All other si-to-fp conversions should be handled by SIToFP. if (arith::SIToFPOp::areCastCompatible(srcTy, dstTy)) return rewriter.create(loc, resultTypes, args, - std::nullopt); + ArrayRef()); // Casting to boolean, floats need to only be checked as not-equal to zero. 
if (isa(srcTy) && dstTy.isInteger(1)) { @@ -690,7 +690,7 @@ static Value createLinalgBodyCalculationForElementwiseOp( if (isa(srcTy) && isa(dstTy) && bitExtend) return rewriter.create(loc, resultTypes, args, - std::nullopt); + ArrayRef()); if (isa(srcTy) && isa(dstTy) && !bitExtend) { return rewriter.create(loc, dstTy, args[0]); diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index 293e01a5bf4d4..67c0eca15638a 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -85,7 +85,7 @@ void ConvertVectorToLLVMPass::runOnOperation() { populateVectorGatherLoweringPatterns(patterns); if (armI8MM) { if (armNeon) - arm_neon::populateLowerContractionToSMMLAPatternPatterns(patterns); + arm_neon::populateLowerContractionToNeonI8MMPatternPatterns(patterns); if (armSVE) populateLowerContractionToSVEI8MMPatternPatterns(patterns); } diff --git a/mlir/lib/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.cpp b/mlir/lib/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.cpp index e81fc6a8b5980..d07e6a52d8b5f 100644 --- a/mlir/lib/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.cpp +++ b/mlir/lib/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.cpp @@ -20,7 +20,7 @@ using namespace mlir; void transform::ApplyArmNeonContractionToI8MMPatternsOp::populatePatterns( RewritePatternSet &patterns) { - arm_neon::populateLowerContractionToSMMLAPatternPatterns(patterns); + arm_neon::populateLowerContractionToNeonI8MMPatternPatterns(patterns); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt index 84fb1b0116d2a..06bafde451cbb 100644 --- a/mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt @@ -1,5 
+1,5 @@ add_mlir_dialect_library(MLIRArmNeonTransforms - LowerContractionToSMMLAPattern.cpp + LowerContractionToNeonI8MMPattern.cpp DEPENDS MLIRArmNeonIncGen diff --git a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToNeonI8MMPattern.cpp similarity index 59% rename from mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp rename to mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToNeonI8MMPattern.cpp index 5ce3d2b28aeb3..7180884c77e98 100644 --- a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp +++ b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToNeonI8MMPattern.cpp @@ -1,4 +1,4 @@ -//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +//===- LowerContractionToNeonI8MMPattern.cpp - Contract to I8MM -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,10 +6,15 @@ // //===----------------------------------------------------------------------===// // -// This file implements lowering patterns from vector.contract to -// arm_neon.intr.smmla +// This file implements lowering patterns from vector.contract to operations +// that map to instructions from the Neon FEAT_I8MM extension. // -//===--- +// TODO: There may be opportunities to unify this with a similar pattern +// for SVE. See: +// https://github.com/llvm/llvm-project/issues/145559 +// LowerContractionToSVEI8MMPattern.cpp +// +//===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/ArmNeon/ArmNeonDialect.h" @@ -37,12 +42,87 @@ static Type matchContainerType(Type element, Type container) { return element; } +// Get the operand of a `vector.contract`. 
This function is intended to abstract +// away from the particular way a value is extended before feeding it into the +// `vector.contract` - via zero-extend or an explicit or implicit sign-extend +// (for implicit sign-extension see `vector.contract` documentation). +// +// The template parameter `Op` indicates the extension operation (explicit or +// implicit) for which we are checking. +// +// Return success only for extensions from `iN` (N <= 8) to `i32`. +template +std::optional getExtOperand(Value v) { + + static_assert(llvm::is_one_of::value, + "Must be instantiated with either sign- or zero- extension op"); + + // If the operand is not defined by an explicit extend operation of the + // accepted operation type allow for an implicit sign-extension. + auto extOp = dyn_cast_or_null(v.getDefiningOp()); + if (!extOp) { + if constexpr (std::is_same::value) { + auto eltTy = cast(v.getType()).getElementType(); + if (!eltTy.isSignlessInteger() || eltTy.getIntOrFloatBitWidth() > 8) + return {}; + return v; + } + return {}; + } + + // If the operand is defined by an explicit extend operation of the accepted + // operation type, check it's extended from `iN` (N <= 8) to `i32`. + auto inOp = extOp.getIn(); + auto inTy = dyn_cast(inOp.getType()); + if (!inTy) + return {}; + auto inEltTy = inTy.getElementType(); + if (!inEltTy.isSignlessInteger() || inEltTy.getIntOrFloatBitWidth() > 8) + return {}; + + auto outTy = dyn_cast(extOp.getType()); + if (!(outTy && outTy.getElementType().isSignlessInteger(32))) + return {}; + + return inOp; +} + +// Designate the operation (resp. instruction) used to do sub-tile matrix +// multiplications. +enum class MMLA { + Signed, // smmla + Unsigned, // ummla + Mixed, // usmmla + MixedSwapped // usmmla with LHS and RHS swapped +}; + +// Create the matrix multiply and accumulate operation according to `op`. 
+Value createMMLA(PatternRewriter &rewriter, MMLA op, Location loc, + mlir::Type accType, Value acc, Value lhs, Value rhs) { + switch (op) { + case MMLA::Signed: + return rewriter.createOrFold(loc, accType, acc, lhs, + rhs); + case MMLA::Unsigned: + return rewriter.createOrFold(loc, accType, acc, lhs, + rhs); + case MMLA::Mixed: + return rewriter.createOrFold(loc, accType, acc, lhs, + rhs); + case MMLA::MixedSwapped: + // The accumulator comes transposed and the result will be transposed + // later, so all we have to do here is swap the operands. + return rewriter.createOrFold(loc, accType, acc, rhs, + lhs); + } +} + /// Lowering from a vector::contractOp arm neon smmla intrinsic. This will tile /// any vector.contract into multiple smmla instructions with unrolling so long /// as [2,2,8] is a divisor of its shape. It can also process vecmats with dimM /// = 1 (either explicitly or inferred if LHS has only dimK) If no unrolling is /// necessary, a single smmla instruction is emitted. -class LowerContractionToSMMLAPattern +class LowerContractionToNeonI8MMPattern : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -88,39 +168,64 @@ class LowerContractionToSMMLAPattern return failure(); } - // Check two extsi inputs Rhs Lhs for contract. - arith::ExtSIOp origLhsExtOp = - dyn_cast_or_null(op.getLhs().getDefiningOp()); - arith::ExtSIOp origRhsExtOp = - dyn_cast_or_null(op.getRhs().getDefiningOp()); - if (!origLhsExtOp || !origRhsExtOp) { + // Check inputs are sign-/zero- extensions from iN (N <= 8) to i32. Get the + // values before the extension. All four signed/unsigned combinations for + // input operands are supported, but they are lowered to different + // operations. Determine which is the appropriate operation to lower to. 
+ MMLA mmlaOp = MMLA::Signed; + auto maybeLhs = getExtOperand(op.getLhs()); + if (!maybeLhs) { + mmlaOp = MMLA::Unsigned; + maybeLhs = getExtOperand(op.getLhs()); + } + if (!maybeLhs) return failure(); + + auto maybeRhs = getExtOperand(op.getRhs()); + if (maybeRhs) { + if (mmlaOp == MMLA::Unsigned) + mmlaOp = MMLA::Mixed; + } else { + if (mmlaOp == MMLA::Signed) + mmlaOp = MMLA::MixedSwapped; + maybeRhs = getExtOperand(op.getRhs()); } + if (!maybeRhs) + return failure(); + + Value origLhs = *maybeLhs; + Value origRhs = *maybeRhs; // Match any iX to i32 for X<8 then turn into an i8 output. Feed into // following neon instruction. Check inputs for extsi are <=i8 - Value extsiLhs; - Value extsiRhs; - if (auto lhsExtInType = - dyn_cast(origLhsExtOp.getIn().getType())) { + Value extLhs; + Value extRhs; + if (auto lhsExtInType = dyn_cast(origLhs.getType())) { if (lhsExtInType.getElementTypeBitWidth() <= 8) { Type targetLhsExtTy = matchContainerType(rewriter.getI8Type(), lhsExtInType); - extsiLhs = rewriter.createOrFold(loc, targetLhsExtTy, - origLhsExtOp.getIn()); + if (mmlaOp == MMLA::Signed || mmlaOp == MMLA::Mixed) + extLhs = rewriter.createOrFold(loc, targetLhsExtTy, + origLhs); + else + extLhs = rewriter.createOrFold(loc, targetLhsExtTy, + origLhs); } } - if (auto rhsExtInType = - dyn_cast(origRhsExtOp.getIn().getType())) { + if (auto rhsExtInType = dyn_cast(origRhs.getType())) { if (rhsExtInType.getElementTypeBitWidth() <= 8) { Type targetRhsExtTy = matchContainerType(rewriter.getI8Type(), rhsExtInType); - extsiRhs = rewriter.createOrFold(loc, targetRhsExtTy, - origRhsExtOp.getIn()); + if (mmlaOp == MMLA::Unsigned || mmlaOp == MMLA::Mixed) + extRhs = rewriter.createOrFold(loc, targetRhsExtTy, + origRhs); + else + extRhs = rewriter.createOrFold(loc, targetRhsExtTy, + origRhs); } } - if (!extsiLhs || !extsiRhs) { + if (!extLhs || !extRhs) { return failure(); } @@ -155,11 +260,11 @@ class LowerContractionToSMMLAPattern AffineMap lhsPermutationMap = 
op.getIndexingMapsArray()[0]; SmallVector lhsOffsets = applyPermutationMap(lhsPermutationMap, ArrayRef(offsets)); - Value tiledLhs = extractOperand(extsiLhs, lhsPermutationMap, lhsOffsets); + Value tiledLhs = extractOperand(extLhs, lhsPermutationMap, lhsOffsets); AffineMap rhsPermutationMap = op.getIndexingMapsArray()[1]; SmallVector rhsOffsets = applyPermutationMap(rhsPermutationMap, ArrayRef(offsets)); - Value tiledRhs = extractOperand(extsiRhs, rhsPermutationMap, rhsOffsets); + Value tiledRhs = extractOperand(extRhs, rhsPermutationMap, rhsOffsets); AffineMap accPermutationMap = op.getIndexingMapsArray()[2]; SmallVector accOffsets = applyPermutationMap(accPermutationMap, ArrayRef(offsets)); @@ -191,6 +296,13 @@ class LowerContractionToSMMLAPattern tiledAcc = expandForSMMLA(tiledAcc, outputExpandedType); } + // Transpose ACC if doing signed by unsigned multiplication, because we're + // using the instruction for unsigned by signed multiplication with + // reversed operands. + if (mmlaOp == MMLA::MixedSwapped) + tiledAcc = rewriter.create( + loc, tiledAcc, ArrayRef({1, 0})); + // Collapse tiled operands to 1D vectors required by smmla intrinsic auto collapsedInputType = VectorType::get(inputExpandedType.getNumElements(), inputElementType); @@ -211,15 +323,21 @@ class LowerContractionToSMMLAPattern } // Insert contract op - kAcc = rewriter.createOrFold( - op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs, - collapsedRhs); + kAcc = createMMLA(rewriter, mmlaOp, op.getLoc(), collapsedRes.getType(), + collapsedRes, collapsedLhs, collapsedRhs); // Reshape output back to 2D Value tiledRes = rewriter.createOrFold( kAcc.getLoc(), tiledAcc.getType(), kAcc); - // With vecmat, only one row of tiled ACC can be inserted into file result + // Because of the reversed operands the result is obtained transposed. 
+ // Transpose it back, + if (mmlaOp == MMLA::MixedSwapped) + tiledRes = rewriter.create( + loc, tiledRes, ArrayRef({1, 0})); + + // With vecmat, only one row of tiled ACC can be inserted into the final + // result if (isVecmat) { tiledRes = rewriter.createOrFold(loc, tiledRes, 0); } @@ -239,8 +357,8 @@ class LowerContractionToSMMLAPattern } // namespace -void mlir::arm_neon::populateLowerContractionToSMMLAPatternPatterns( +void mlir::arm_neon::populateLowerContractionToNeonI8MMPatternPatterns( RewritePatternSet &patterns) { MLIRContext *context = patterns.getContext(); - patterns.add(context, /*benefit=*/2); + patterns.add(context, /*benefit=*/2); } diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp index 95965872f4098..1e8e1265affa0 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp @@ -724,59 +724,6 @@ struct LiftIllegalVectorTransposeToMemory } }; -/// A rewrite to turn unit dim transpose-like vector.shape_casts into -/// vector.transposes. The shape_cast has to be from an illegal vector type to a -/// legal one (as defined by isLegalVectorType). -/// -/// The reasoning for this is if we've got to this pass and we still have -/// shape_casts of illegal types, then they likely will not cancel out. Turning -/// them into transposes gives LiftIllegalVectorTransposeToMemory a chance to -/// eliminate them. 
-/// -/// Example: -/// -/// BEFORE: -/// ```mlir -/// %0 = vector.shape_cast %a : vector<[4]x1xf32> to vector<1x[4]xf32> -/// ``` -/// -/// AFTER: -/// ```mlir -/// %0 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32> -/// ``` -struct ConvertIllegalShapeCastOpsToTransposes - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::ShapeCastOp shapeCastOp, - PatternRewriter &rewriter) const override { - auto sourceType = shapeCastOp.getSourceVectorType(); - auto resultType = shapeCastOp.getResultVectorType(); - if (isLegalVectorType(sourceType) || !isLegalVectorType(resultType)) - return rewriter.notifyMatchFailure(shapeCastOp, - kMatchFailureNotIllegalToLegal); - - // Note: If we know that `sourceType` is an illegal vector type (and 2D) - // then dim 0 is scalable and dim 1 is fixed. - if (sourceType.getRank() != 2 || sourceType.getDimSize(1) != 1) - return rewriter.notifyMatchFailure( - shapeCastOp, "expected source to be a 2D scalable vector with a " - "trailing unit dim"); - - auto loc = shapeCastOp.getLoc(); - auto transpose = rewriter.create( - loc, shapeCastOp.getSource(), ArrayRef{1, 0}); - - if (resultType.getRank() == 1) - rewriter.replaceOpWithNewOp(shapeCastOp, resultType, - transpose); - else - rewriter.replaceOp(shapeCastOp, transpose); - - return success(); - } -}; - /// Rewrites an illegal/unsupported SVE transfer_write(transpose) to instead use /// the ZA state. This workaround rewrite to support these transposes when ZA is /// available. @@ -920,6 +867,116 @@ struct LowerIllegalTransposeStoreViaZA } }; +/// Lower `vector.transfer_read` of a scalable column to `scf::for` +/// +/// Lowers a "read" of a scalable column from a MemRef for which there is no +/// hardware pperation that we could use to a loop over the rows to read and +/// loads one element at a time. +/// +/// BEFORE: +/// ``` +/// %res = vector.transfer_read %mem[%a, %b] (...) 
+/// : memref, vector<[4]x1xf32> +/// ``` +/// +/// AFTER: +/// ``` +/// %cst = arith.constant (...) : vector<[4]xf32> +/// %vscale = vector.vscale +/// %c4_vscale = arith.muli %vscale, %c4 : index +/// %scf = scf.for %lb = %c0 to %c4_vscale step %c1 iter_args(%arg4 = %cst) +/// -> (vector<[4]xf32>) { +/// +/// %load = memref.load %mem[%arg3 + %a, %b] : memref +/// %vec = vector.insert %load, %cst [%arg3] : f32 into vector<[4]xf32> +/// scf.yield %vec : vector<[4]xf32> +/// } +/// %res = vector.shape_cast %scf : vector<[4]xf32> to vector<[4]x1xf32> +/// ``` +/// +/// TODO: This transformation isn't specific to SME - move it to the SVE +/// dialect. +/// TODO: Check the in_bounds attribute and generate vector.maskedload if +/// required. +struct LowerColumnTransferReadToLoops + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::TransferReadOp readOp, + PatternRewriter &rewriter) const override { + // NOTE: This is a fairly low-level transformation, so we shouldn't be + // adding support for Tensors without good rationale. + if (readOp.hasPureTensorSemantics()) + return rewriter.notifyMatchFailure( + readOp, "Tensor semantics are unsupported (either bufferize or " + "extend this pattern)"); + + auto resType = readOp.getVectorType(); + + if (resType.getRank() != 2) + return rewriter.notifyMatchFailure(readOp, + "Only 2D vectors are supported!"); + + if (resType.getShape()[1] != 1) + return rewriter.notifyMatchFailure( + readOp, "The trailing output dim is != 1 (not supported ATM)"); + + if (!resType.getScalableDims()[0] || resType.getScalableDims()[1]) + return rewriter.notifyMatchFailure( + readOp, "Expected the leading dim to be scalable and the trailing " + "dim to be fixed."); + + // Create new result type - similar to the original vector with the + // trailing unit dim collapsed. 
+ int64_t numRows = resType.getShape()[0]; + VectorType newResType = VectorType::get(numRows, resType.getElementType(), + /*scalableDims=*/{true}); + + // Create a loop over all rows and load one element at a time. + auto loc = readOp.getLoc(); + auto lowerBound = rewriter.create(loc, 0); + auto createVscaleMultiple = + vector::makeVscaleConstantBuilder(rewriter, loc); + auto upperBound = createVscaleMultiple(numRows); + auto step = rewriter.create(loc, 1); + Value init = rewriter.create( + loc, newResType, DenseElementsAttr::get(newResType, 0.0f)); + + scf::ForOp loadLoop; + { + OpBuilder::InsertionGuard g(rewriter); + loadLoop = rewriter.create(loc, lowerBound, upperBound, step, + ValueRange{init}); + rewriter.setInsertionPointToStart(loadLoop.getBody()); + + auto tileSliceIndex = loadLoop.getInductionVar(); + + auto idx0 = rewriter.create(loc, tileSliceIndex, + readOp.getIndices()[0]); + auto idx1 = readOp.getIndices()[1]; + + Value scalar = rewriter.create( + loc, readOp.getBase(), SmallVector({idx0, idx1})); + + Operation *updateInit = rewriter.create( + loc, scalar, loadLoop.getRegionIterArg(0), tileSliceIndex); + + rewriter.create(loc, updateInit->getResult(0)); + } + + // The read operation has been "legalized", but since the original result + // type was a 2D vector, we need to cast before returning the result. This + // ShapeCast should cancel-out with some other ShapeCast (i.e. it's a + // no-op). + auto sc = rewriter.create( + loc, readOp.getResult().getType(), loadLoop.getResult(0)); + + rewriter.replaceOp(readOp, sc); + + return success(); + } +}; + struct VectorLegalizationPass : public arm_sme::impl::VectorLegalizationBase { void runOnOperation() override { @@ -941,10 +998,10 @@ struct VectorLegalizationPass // Apply preprocessing patterns. 
RewritePatternSet rewritePatterns(context); - rewritePatterns.add(context); + rewritePatterns + .add(context); if (failed( applyPatternsGreedily(getOperation(), std::move(rewritePatterns)))) return signalPassFailure(); diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp index d2ac850a5f70b..d52ff4d4257c7 100644 --- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp +++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp @@ -298,16 +298,156 @@ struct LegalizeSVEMaskLoadConversion : public OpRewritePattern { } }; +/// Transforms a `transfer_read` operation so it reads vector of a type that +/// can be mapped to an LLVM type ("LLVM-legal" type). This is done by +/// collapsing trailing dimensions so we obtain a vector type with a single +/// scalable dimension in the rightmost position. +/// +/// Example: +/// ``` +/// %v = vector.transfer_read %M[%i, %j, %c0, %c0], %c0_i8 +/// {in_bounds = [false, true, true, true]} +/// : memref, vector<2x[4]x2x8xi8> +/// ``` +/// is rewritten to +/// ``` +/// %collapse_shape = memref.collapse_shape %M [[0], [1, 2, 3]] +/// : memref into memref +/// %0 = vector.transfer_read %collapse_shape[%i, %j], %c0_i8 +/// {in_bounds = [false, true]} +/// : memref, vector<2x[64]xi8> +/// %1 = vector.shape_cast %0 : vector<2x[64]xi8> to vector<2x[4]x2x8xi8> +/// ``` +struct LegalizeTransferRead : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::TransferReadOp readOp, + PatternRewriter &rewriter) const override { + + // Do not try to transform masked reads. 
For example, if we have a transfer + // to a `vector<[4]x4xi8>` we could have a mask like + // 1 1 1 0 + // 1 1 1 0 + // 1 1 1 0 + // 0 0 0 0 + // Flattening this mask would look like + // 1 1 1 0 1 1 1 0 1 1 1 0 0 0 0 0 + // and we have not yet figured out an efficient way to build such a mask, + // neither from the mask operand, nor from the original `vector.create_mask` + // operation (if visible at all). + if (readOp.isMasked() || readOp.getMask()) + return rewriter.notifyMatchFailure(readOp, + "masked transfers not-supported"); + + // General permutation maps are not supported. The issue is with transpose, + // broadcast, and other forms of non-identify mapping in the minor + // dimensions which is impossible to represent after collapsing (at least + // because the resulting "collapsed" maps would have smaller number of + // dimension indices). + // TODO: We have not had yet the need for it, but some forms of permutation + // maps with identity in the minor dimensions voukld be supported, for + // example `(i, j, k, p) -> (j, i, k, p)` where we need to collapse only `k` + // and `p`. + if (!readOp.getPermutationMap().isMinorIdentity()) + return rewriter.notifyMatchFailure(readOp, "non-identity permutation"); + + // We handle transfers of vectors with rank >= 2 and a single scalable + // dimension. This transformation aims to transform an LLVM-illegal type + // into an LLVM-legal type and one dimensional vectors are already + // LLVM-legal, even if scalable. A value of a vector type with more than one + // scalable dimension is impossible to represent using a vector type with no + // scalable dimensions or a single one. For example a `vector<[4]x[4]xi8>` + // would have `4 * 4 * vscale * vscale` elements and this quantity is + // impossible to represent as `N` or `N * vscale` (where `N` is a constant). 
+ VectorType origVT = readOp.getVectorType(); + ArrayRef origScalableDims = origVT.getScalableDims(); + const int64_t origVRank = origVT.getRank(); + if (origVRank < 2 || origVT.getNumScalableDims() != 1) + return rewriter.notifyMatchFailure(readOp, "wrong dimensions"); + + // Number of trailing dimensions to collapse, including the scalable + // dimension. Nothing to do if the single scalable dimension is already the + // last one. + const int64_t numCollapseDims = std::distance( + llvm::find(origScalableDims, true), origScalableDims.end()); + if (numCollapseDims < 2) + return rewriter.notifyMatchFailure(readOp, + "scalable dimension is trailing"); + + // We want a simple memref (not a tensor) with contiguous elements for at + // least all the trailing dimensions up to and including the scalable one. + auto memTy = dyn_cast(readOp.getBase().getType()); + if (!(memTy && memTy.areTrailingDimsContiguous(numCollapseDims))) + return rewriter.notifyMatchFailure( + readOp, "non-contiguous memref dimensions to collapse"); + + // The dimensions to collapse (excluding the scalable one) of the vector and + // the memref must match. A dynamic memref dimension is considered + // non-matching. The transfers from the dimensions to collapse must be + // in-bounds (it follows the corresponding indices would be zero). This + // guarantees that the operation transfers a contiguous block + // and no padding is necessary. + if (!llvm::equal(memTy.getShape().take_back(numCollapseDims - 1), + origVT.getShape().take_back(numCollapseDims - 1))) + return rewriter.notifyMatchFailure( + readOp, "memref and vector dimensions do not match"); + + SmallVector origInBounds = readOp.getInBoundsValues(); + if (!llvm::all_of( + ArrayRef(origInBounds).take_back(numCollapseDims - 1), + [](bool v) { return v; })) + return rewriter.notifyMatchFailure( + readOp, "out-of-bounds transfer from a dimension to collapse"); + + // Collapse the trailing dimensions of the memref. 
+ SmallVector reassoc; + for (int64_t i = 0; i < memTy.getRank() - numCollapseDims + 1; ++i) + reassoc.push_back({i}); + for (int64_t i = memTy.getRank() - numCollapseDims + 1; i < memTy.getRank(); + ++i) + reassoc.back().push_back(i); + if (!memref::CollapseShapeOp::isGuaranteedCollapsible(memTy, reassoc)) + return failure(); + Value collapsedMem = rewriter.create( + readOp.getLoc(), readOp.getBase(), reassoc); + + // Get a vector type with collapsed trailing dimensions. + SmallVector shape(origVT.getShape()); + for (int64_t i = origVRank - numCollapseDims + 1; i < origVRank; ++i) + shape[origVRank - numCollapseDims] *= shape[i]; + shape.pop_back_n(numCollapseDims - 1); + auto collapsedVT = + VectorType::get(shape, origVT.getElementType(), + origScalableDims.drop_back(numCollapseDims - 1)); + + // Drop the extra (zero) indices. + auto indices = readOp.getIndices().drop_back(numCollapseDims - 1); + + // Create the new `transfer_read`. + auto newReadOp = rewriter.create( + readOp.getLoc(), collapsedVT, collapsedMem, indices, + ArrayRef(origInBounds).drop_back(numCollapseDims - 1)); + + // Cast back to the original vector type. 
+ auto toOrigShape = rewriter.create(readOp.getLoc(), + origVT, newReadOp); + + rewriter.replaceOp(readOp, toOrigShape); + return success(); + } +}; + } // namespace void mlir::arm_sve::populateLegalizeVectorStoragePatterns( RewritePatternSet &patterns) { - patterns.add, - LegalizeSVEMaskAllocation, - LegalizeSVEMaskTypeCastConversion, - LegalizeSVEMaskStoreConversion, LegalizeSVEMaskLoadConversion>( - patterns.getContext()); + patterns + .add, + LegalizeSVEMaskAllocation, + LegalizeSVEMaskTypeCastConversion, LegalizeSVEMaskStoreConversion, + LegalizeSVEMaskLoadConversion, LegalizeTransferRead>( + patterns.getContext()); } namespace { diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp index a1209fe8230e2..b7703ff0393eb 100644 --- a/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp +++ b/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp @@ -1,4 +1,4 @@ -//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +//===- LowerContractionToSVEI8MMPattern.cpp - Contract to I8MM --*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,6 +9,11 @@ // This file implements lowering patterns from vector.contract to operations // that map to instructions from the SVE FEAT_I8MM extension. // +// TODO: There may be opportunities to unify this with a similar pattern +// for Neon. 
See: +// https://github.com/llvm/llvm-project/issues/145559 +// LowerContractionToNeonI8MMPattern.cpp +// //===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/IR/Arith.h" diff --git a/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp b/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp index 695d43b04cff0..f63af8da28087 100644 --- a/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp @@ -111,7 +111,8 @@ static Value getFlatMemref(OpBuilder &rewriter, Location loc, Value source, getFlatOffsetAndStrides(rewriter, loc, source, offsetsTemp); MemRefType retType = inferCastResultType(base, offset); return rewriter.create(loc, retType, base, offset, - std::nullopt, std::nullopt); + ArrayRef(), + ArrayRef()); } static bool needFlatten(Value val) { diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 5dbb2403eddbd..b2639edb0d0f5 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -2989,8 +2989,9 @@ LogicalResult WinogradFilterTransformOp::verify() { ArrayRef filterShape = filterType.getShape(); int64_t filterH = filterShape[getFilterHDim()]; int64_t filterW = filterShape[getFilterWDim()]; - int64_t r = getR(); - int64_t m = getM(); + WinogradConv2DFmr fmr = getFmr(); + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); if (filterH != r && filterH != 1) return emitOpError("expect filter height either equals to r or 1"); @@ -3046,8 +3047,9 @@ LogicalResult WinogradFilterTransformOp::getResultTilePosition( ArrayRef filterShape = filterType.getShape(); int64_t filterH = filterShape[getFilterHDim()]; int64_t filterW = filterShape[getFilterWDim()]; - int64_t m = getM(); - int64_t r = getR(); + WinogradConv2DFmr fmr = getFmr(); + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); int64_t alpha = m + r - 1; int64_t alphaH = 
filterH != 1 ? alpha : 1; int64_t alphaW = filterW != 1 ? alpha : 1; @@ -3124,8 +3126,9 @@ LogicalResult WinogradInputTransformOp::verify() { ArrayRef inputShape = inputType.getShape(); int64_t inputH = inputShape[getInputHDim()]; int64_t inputW = inputShape[getInputWDim()]; - int m = getM(); - int r = getR(); + WinogradConv2DFmr fmr = getFmr(); + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); int64_t tileSize = m + r - 1; auto outputType = cast(getOutput().getType()); @@ -3194,8 +3197,9 @@ LogicalResult WinogradInputTransformOp::getResultTilePosition( int64_t outputAlphaH = outputShape[getOutputAlphaHDim()]; int64_t outputAlphaW = outputShape[getOutputAlphaWDim()]; - int64_t m = getM(); - int64_t r = getR(); + WinogradConv2DFmr fmr = getFmr(); + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); int64_t alpha = m + r - 1; int64_t alphaH = outputAlphaH != 1 ? alpha : 1; int64_t alphaW = outputAlphaW != 1 ? alpha : 1; @@ -3224,8 +3228,9 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder, ArrayRef offsets, ArrayRef sizes) { IntegerAttr oneAttr = builder.getI64IntegerAttr(1); - int64_t m = getM(); - int64_t r = getR(); + WinogradConv2DFmr fmr = getFmr(); + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); ShapedType outputType = getOutputOperandType(); ArrayRef outputShape = outputType.getShape(); @@ -3303,8 +3308,9 @@ LogicalResult WinogradOutputTransformOp::verify() { int64_t valueW = valueShape[getValueAlphaWDim()]; int64_t valueTileH = valueShape[getValueTileHDim()]; int64_t valueTileW = valueShape[getValueTileWDim()]; - int m = getM(); - int r = getR(); + WinogradConv2DFmr fmr = getFmr(); + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); bool leftTransform = valueH != 1; bool rightTransform = valueW != 1; @@ -3365,7 +3371,9 @@ LogicalResult WinogradOutputTransformOp::getResultTilePosition( OpBuilder &builder, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes, 
SmallVector &resultOffsets, SmallVector &resultSizes) { - int64_t m = getM(); + WinogradConv2DFmr fmr = getFmr(); + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); Location loc = getLoc(); MLIRContext *context = builder.getContext(); @@ -3623,6 +3631,27 @@ verifyExtendedBatchVariantMatmulSemantic(OpTy batchVariantMatmulOp, namespace mlir { namespace linalg { +std::optional getWinogradConv2DFmr(int64_t m, int64_t r) { + if (m == 2 && r == 3) + return WinogradConv2DFmr::F_2_3; + if (m == 4 && r == 3) + return WinogradConv2DFmr::F_4_3; + if (m == 2 && r == 5) + return WinogradConv2DFmr::F_2_5; + return std::nullopt; +} + +std::pair getFmrFromWinogradConv2DFmr(WinogradConv2DFmr fmr) { + switch (fmr) { + case WinogradConv2DFmr::F_2_3: + return {2, 3}; + case WinogradConv2DFmr::F_4_3: + return {4, 3}; + case WinogradConv2DFmr::F_2_5: + return {2, 5}; + } +} + //===----------------------------------------------------------------------===// // MatMulOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 2b78e31558ea2..8571d641e26d1 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -4250,7 +4250,7 @@ DiagnosedSilenceableFailure transform::WinogradConv2DOp::applyToOne( bool supported = TypeSwitch(target) .Case([&](linalg::Conv2DNhwcFhwcOp op) { maybeTransformed = - winogradConv2D(rewriter, op, getM(), getR()); + winogradConv2D(rewriter, op, getFmr()); return true; }) .Default([&](Operation *op) { return false; }); diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index 19d484a3bb701..513cecef29b61 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ 
b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -22,8 +22,11 @@ #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Interfaces/TilingInterface.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" +#include "llvm/Support/Debug.h" #include +#define DEBUG_TYPE "linalg-tiling-interface-impl" + using namespace mlir; using namespace mlir::linalg; @@ -148,55 +151,82 @@ struct LinalgOpTilingInterface /// Utility to fetch the offsets and sizes when applied as per the indexing /// map of the linalg op. This helps in fusing the linalg op as a consumer of /// a given slice op. - void - getMappedOffsetAndSize(LinalgOp linalgOp, OpBuilder &b, AffineMap indexingMap, - ArrayRef offsets, - ArrayRef sizes, - SmallVectorImpl &mappedOffsets, - SmallVectorImpl &mappedSizes) const { - unsigned numLoops = linalgOp.getNumLoops(); - auto tilingInterfaceOp = cast(linalgOp.getOperation()); - mappedOffsets.resize(numLoops); - mappedSizes.resize(numLoops); - if (!indexingMap.isPermutation()) { - SmallVector iterationDomain = - tilingInterfaceOp.getIterationDomain(b); - for (const auto &&[index, value] : llvm::enumerate(iterationDomain)) { - mappedOffsets[index] = value.offset; - mappedSizes[index] = value.size; + static LogicalResult + getMappedOffsetAndSize(LinalgOp linalgOp, OpBuilder &b, + ArrayRef indexingMaps, + ArrayRef> allOffsets, + ArrayRef> allSizes, + SmallVectorImpl &mappedOffsetsVec, + SmallVectorImpl &mappedSizesVec) { + DenseMap mappedOffsets, mappedSizes; + + for (auto [indexingMap, offsets, sizes] : + llvm::zip_equal(indexingMaps, allOffsets, allSizes)) { + for (auto [resultExpr, offset, size] : + llvm::zip_equal(indexingMap.getResults(), offsets, sizes)) { + auto dimExpr = dyn_cast(resultExpr); + if (!dimExpr) + continue; + unsigned position = dimExpr.getPosition(); + auto it = mappedOffsets.find(position); + if (it != mappedOffsets.end()) { + OpFoldResult seenOffset = it->second; + OpFoldResult seenSize = mappedSizes.lookup(position); + if 
(seenOffset != offset || seenSize != size) { + LLVM_DEBUG({ + llvm::dbgs() << "inconsistent iteration space mapping from " + "offsets/sizes of operands/results"; + }); + return failure(); + } + } else { + mappedOffsets[position] = offset; + mappedSizes[position] = size; + } } } - for (const auto &&[index, value] : - llvm::enumerate(indexingMap.getResults())) { - unsigned dimPosition = cast(value).getPosition(); - mappedOffsets[dimPosition] = offsets[index]; - mappedSizes[dimPosition] = sizes[index]; + + // Aggregate from the given operand offsets and sizes, or default to + // iteration space values. + SmallVector iterationDomain = + cast(linalgOp.getOperation()).getIterationDomain(b); + mappedOffsetsVec.resize(iterationDomain.size()); + mappedSizesVec.resize(iterationDomain.size()); + for (auto [index, domain] : llvm::enumerate(iterationDomain)) { + auto it = mappedOffsets.find(index); + if (it != mappedOffsets.end()) { + mappedOffsetsVec[index] = it->second; + mappedSizesVec[index] = mappedSizes.lookup(index); + continue; + } + mappedOffsetsVec[index] = domain.offset; + mappedSizesVec[index] = domain.size; } + return success(); } /// Method to return the position of the result tile computed by the tiled /// operation. - LogicalResult getIterationDomainTileFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes, + LogicalResult getIterationDomainTileFromOperandTiles( + Operation *op, OpBuilder &b, ArrayRef operandNumbers, + ArrayRef> allOffsets, + ArrayRef> allSizes, SmallVectorImpl &iterDomainOffsets, SmallVectorImpl &iterDomainSizes) const { auto linalgOp = cast(op); - // Check that the indexing map used for the operand is a projected - // permutation. This could be relaxed with a more general approach that can - // map the offsets and sizes from the operand to iteration space tiles - // (filling in full extent for dimensions not used to access the result). 
- AffineMap indexingMap = - linalgOp.getMatchingIndexingMap(&op->getOpOperand(operandNumber)); - if (!indexingMap.isProjectedPermutation()) { - return op->emitError() - << "unhandled get iter domain position when operand is not " - "accessed using a permuted projection"; + std::optional> iterationSpaceOffsets, + iterationSpaceSizes; + SmallVector indexingMaps = + llvm::map_to_vector(operandNumbers, [&](unsigned operandNumber) { + OpOperand &opOperand = linalgOp->getOpOperand(operandNumber); + return linalgOp.getMatchingIndexingMap(&opOperand); + }); + if (failed(getMappedOffsetAndSize(linalgOp, b, indexingMaps, allOffsets, + allSizes, iterDomainOffsets, + iterDomainSizes))) { + return failure(); } - - getMappedOffsetAndSize(linalgOp, b, indexingMap, offsets, sizes, - iterDomainOffsets, iterDomainSizes); return success(); } @@ -247,8 +277,13 @@ struct LinalgOpTilingInterface "accessed using a permuted projection"); } - getMappedOffsetAndSize(linalgOp, b, indexingMap, offsets, sizes, - iterDomainOffsets, iterDomainSizes); + SmallVector allOffsets = llvm::to_vector(offsets); + SmallVector allSizes = llvm::to_vector(sizes); + auto status = + getMappedOffsetAndSize(linalgOp, b, indexingMap, {allOffsets}, + {allSizes}, iterDomainOffsets, iterDomainSizes); + (void)status; + assert(succeeded(status) && "unexpected error in offset calculation"); return success(); } @@ -279,12 +314,13 @@ struct LinalgOpTilingInterface /// Method to generate the tiled implementation of an operation from the tile /// of the operand. 
- FailureOr getTiledImplementationFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes) const { + FailureOr getTiledImplementationFromOperandTiles( + Operation *op, OpBuilder &b, ArrayRef operandNumbers, + ArrayRef> allOffsets, + ArrayRef> allSizes) const { SmallVector mappedOffsets, mappedSizes; - if (failed(getIterationDomainTileFromOperandTile( - op, b, operandNumber, offsets, sizes, mappedOffsets, + if (failed(getIterationDomainTileFromOperandTiles( + op, b, operandNumbers, allOffsets, allSizes, mappedOffsets, mappedSizes))) { return failure(); } @@ -837,13 +873,20 @@ struct PackOpTiling /// Method to return the position of iteration domain tile computed by the /// tiled operation. In current `tensor.pack` context, the `resultOffsets` and /// `resultSizes` only cover outer dimensions. - LogicalResult getIterationDomainTileFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes, + LogicalResult getIterationDomainTileFromOperandTiles( + Operation *op, OpBuilder &b, ArrayRef operandNumbers, + ArrayRef> allOffsets, + ArrayRef> allSizes, SmallVectorImpl &resultOffsets, SmallVectorImpl &resultSizes) const { - if (operandNumber != 0) + if (operandNumbers.size() != 1 || operandNumbers[0] != 0) { + LLVM_DEBUG( + { llvm::dbgs() << "unsupported operands for consumer fusion"; }); return failure(); + } + + ArrayRef offsets(allOffsets[0]); + ArrayRef sizes(allSizes[0]); auto packOp = cast(op); // It is not trivial to infer dest tile from source tile if `packOp` has @@ -904,11 +947,18 @@ struct PackOpTiling } /// Method to return the tiled implementation of tensor.pack as a consumer. 
- FailureOr getTiledImplementationFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes) const { - if (operandNumber != 0) + FailureOr getTiledImplementationFromOperandTiles( + Operation *op, OpBuilder &b, ArrayRef operandNumbers, + ArrayRef> allOffsets, + ArrayRef> allSizes) const { + if (operandNumbers.size() != 1 || operandNumbers[0] != 0) { + LLVM_DEBUG( + { llvm ::dbgs() << "unhandled operands for consumer fusion"; }); return failure(); + } + + ArrayRef offsets(allOffsets[0]); + ArrayRef sizes(allSizes[0]); auto packOp = cast(op); Location loc = packOp.getLoc(); @@ -923,8 +973,8 @@ struct PackOpTiling tiledOperands.push_back(sourceSlice); SmallVector outerDimOffsets, outerDimSizes; - if (failed(getIterationDomainTileFromOperandTile( - op, b, /*operandNumber=*/0, offsets, sizes, outerDimOffsets, + if (failed(getIterationDomainTileFromOperandTiles( + op, b, operandNumbers, allOffsets, allSizes, outerDimOffsets, outerDimSizes))) return failure(); @@ -1182,12 +1232,21 @@ struct UnPackOpTiling /// Method to return the position of iteration domain tile computed by the /// tiled operation. - LogicalResult getIterationDomainTileFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes, + LogicalResult getIterationDomainTileFromOperandTiles( + Operation *op, OpBuilder &b, ArrayRef operandNumbers, + ArrayRef> allOffsets, + ArrayRef> allSizes, SmallVectorImpl &resultOffsets, SmallVectorImpl &resultSizes) const { + if (operandNumbers.size() != 1) { + LLVM_DEBUG({ llvm::dbgs() << "unable to handle multiple operands"; }); + return failure(); + } auto unPackOp = cast(op); + unsigned operandNumber = operandNumbers[0]; + ArrayRef offsets(allOffsets[0]); + ArrayRef sizes(allSizes[0]); + // If the operand tile is the dest, then no adjustment is needed. 
if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) { resultOffsets = llvm::to_vector(offsets); @@ -1241,10 +1300,18 @@ struct UnPackOpTiling } /// Method to return the tiled implementation of tensor.unpack as a consumer. - FailureOr getTiledImplementationFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes) const { + FailureOr getTiledImplementationFromOperandTiles( + Operation *op, OpBuilder &b, ArrayRef operandNumbers, + ArrayRef> allOffsets, + ArrayRef> allSizes) const { + if (operandNumbers.size() != 1 || operandNumbers[0] != 0) { + LLVM_DEBUG({ llvm::dbgs() << "unhandled operands for consumer fusion"; }); + return failure(); + } auto unPackOp = cast(op); + ArrayRef offsets(allOffsets[0]); + ArrayRef sizes(allSizes[0]); + // tensor.unpack op is fusible (as a consumer) only if inner dims are not // tiled. int64_t numTiles = unPackOp.getInnerDimsPos().size(); @@ -1259,8 +1326,8 @@ struct UnPackOpTiling // Fetch offset/size for creating the slice of the dest operand of // unpack op. SmallVector outputOffsets, outputSizes; - if (failed(getIterationDomainTileFromOperandTile( - op, b, /*operandNumber=*/0, offsets, sizes, outputOffsets, + if (failed(getIterationDomainTileFromOperandTiles( + op, b, operandNumbers, allOffsets, allSizes, outputOffsets, outputSizes))) return failure(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp index e4221d4748415..4e90defebcf5c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp @@ -38,6 +38,15 @@ namespace { /// /// The following tables define these constant transformation matrices for /// F(2 x 2, 3 x 3), F(4 x 4, 3 x 3), and F(2 x 2, 5 x 5) +/// +/// To add more transformation matrices, we need to add the following +/// items: +/// 1. 
Add the constant transformation matrix to the corresponding +/// G, GT, BT, B, AT, or A array. +/// 2. Add the corresponding TransformMatrix to the GMatrices, GTMatrices, +/// BTMatrices, BMatrices, ATMatrices, or AMatrices map. +/// 3. Add a enum value F_m_r to WinogradConv2DFmr enum. +/// constexpr float G_2x2_3x3[] = { -1, 0, 0, 1./2, -1./2, 1./2, @@ -176,19 +185,6 @@ constexpr float A_2x2_5x5[] = { }; // clang-format on -using TransformMapKeyTy = std::pair; - -/// We use F(m, r) to define the size of minimal filtering algorithms. -/// m is the output dimension and r is the filter dimension. We can get -/// the input dimension, alpha, from the formula, alpha = m + r - 1. -/// -/// For example, when m = 2 and r = 3, we know its input size is 4. -/// The Conv2D will operate on 4x4 input data with 3x3 filter and get -/// 2x2 output result. -constexpr TransformMapKeyTy F_2_3{2, 3}; -constexpr TransformMapKeyTy F_4_3{4, 3}; -constexpr TransformMapKeyTy F_2_5{2, 5}; - /// Structure to keep information of constant transform matrices. struct TransformMatrix { TransformMatrix(const float *table, int64_t rows, int64_t cols, @@ -344,22 +340,22 @@ Value insert2DDataTo6D(OpBuilder &builder, Location loc, Value source, /// %ret = linalg.matmul %ret, GT /// %inserted = insert %ret into filter Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, - Value retValue, int64_t m, int64_t r, + Value retValue, WinogradConv2DFmr fmr, bool leftTransform = true, bool rightTransform = true) { // Map from (m, r) to G transform matrix. 
- static const llvm::SmallDenseMap + static const llvm::SmallDenseMap GMatrices = { - {F_2_3, TransformMatrix(G_2x2_3x3, 4, 3)}, - {F_4_3, TransformMatrix(G_4x4_3x3, 6, 3)}, - {F_2_5, TransformMatrix(G_2x2_5x5, 6, 5)}, + {WinogradConv2DFmr::F_2_3, TransformMatrix(G_2x2_3x3, 4, 3)}, + {WinogradConv2DFmr::F_4_3, TransformMatrix(G_4x4_3x3, 6, 3)}, + {WinogradConv2DFmr::F_2_5, TransformMatrix(G_2x2_5x5, 6, 5)}, }; // Map from (m, r) to GT transform matrix. - static const llvm::SmallDenseMap + static const llvm::SmallDenseMap GTMatrices = { - {F_2_3, TransformMatrix(GT_2x2_3x3, 3, 4)}, - {F_4_3, TransformMatrix(GT_4x4_3x3, 3, 6)}, - {F_2_5, TransformMatrix(GT_2x2_5x5, 5, 6)}, + {WinogradConv2DFmr::F_2_3, TransformMatrix(GT_2x2_3x3, 3, 4)}, + {WinogradConv2DFmr::F_4_3, TransformMatrix(GT_4x4_3x3, 3, 6)}, + {WinogradConv2DFmr::F_2_5, TransformMatrix(GT_2x2_5x5, 5, 6)}, }; auto filterType = cast(filter.getType()); @@ -370,6 +366,8 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, int64_t filterW = filterShape[2]; int64_t filterC = filterShape[3]; + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); if (filterH != r && filterH != 1) return Value(); if (filterW != r && filterW != 1) @@ -387,14 +385,13 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, zeroIdx, filterH, filterW, /*loopNorFIdx=*/0, /*loopCorFIdx=*/3, /*heightIdx=*/1, /*widthIdx=*/2); - TransformMapKeyTy key = {m, r}; int64_t retRows = 1; Value matmulRetValue = extractFilter; Value zero = builder.create( loc, rewriter.getZeroAttr(elementType)); if (leftTransform) { // Get constant transform matrix G. - auto it = GMatrices.find(key); + auto it = GMatrices.find(fmr); if (it == GMatrices.end()) return {}; const TransformMatrix &GMatrix = it->second; @@ -416,7 +413,7 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, if (rightTransform) { // Get constant transform matrix GT. 
- auto it = GTMatrices.find(key); + auto it = GTMatrices.find(fmr); if (it == GTMatrices.end()) return {}; const TransformMatrix >Matrix = it->second; @@ -476,24 +473,26 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter, /// %output /// at [0, 0, %h, %w, %n, %c] Value inputTransform(RewriterBase &rewriter, Location loc, Value input, - Value retValue, int64_t m, int64_t r, + Value retValue, WinogradConv2DFmr fmr, bool leftTransform = true, bool rightTransform = true) { // Map from (m, r) to BT transform matrix. - static const llvm::SmallDenseMap + static const llvm::SmallDenseMap BTMatrices = { - {F_2_3, TransformMatrix(BT_2x2_3x3, 4, 4)}, - {F_4_3, TransformMatrix(BT_4x4_3x3, 6, 6)}, - {F_2_5, TransformMatrix(BT_2x2_5x5, 6, 6)}, + {WinogradConv2DFmr::F_2_3, TransformMatrix(BT_2x2_3x3, 4, 4)}, + {WinogradConv2DFmr::F_4_3, TransformMatrix(BT_4x4_3x3, 6, 6)}, + {WinogradConv2DFmr::F_2_5, TransformMatrix(BT_2x2_5x5, 6, 6)}, }; // Map from (m, r) to B transform matrix. 
- static const llvm::SmallDenseMap + static const llvm::SmallDenseMap BMatrices = { - {F_2_3, TransformMatrix(B_2x2_3x3, 4, 4)}, - {F_4_3, TransformMatrix(B_4x4_3x3, 6, 6)}, - {F_2_5, TransformMatrix(B_2x2_5x5, 6, 6)}, + {WinogradConv2DFmr::F_2_3, TransformMatrix(B_2x2_3x3, 4, 4)}, + {WinogradConv2DFmr::F_4_3, TransformMatrix(B_4x4_3x3, 6, 6)}, + {WinogradConv2DFmr::F_2_5, TransformMatrix(B_2x2_5x5, 6, 6)}, }; + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); auto inputType = cast(input.getType()); Type elementType = inputType.getElementType(); auto inputShape = inputType.getShape(); // N, H, W, C @@ -529,7 +528,6 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input, widthOffset, alphaH, alphaW, /*loopNorFIdx=*/0, /*loopCorFIdx=*/3, /*heightIdx=*/1, /*widthIdx=*/2); - TransformMapKeyTy key = {m, r}; int64_t retRows = 1; int64_t retCols = 1; Value matmulRetValue = extractInput; @@ -537,7 +535,7 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input, loc, rewriter.getZeroAttr(elementType)); if (leftTransform) { // Get constant transform matrix BT. - auto it = BTMatrices.find(key); + auto it = BTMatrices.find(fmr); if (it == BTMatrices.end()) return {}; const TransformMatrix &BTMatrix = it->second; @@ -560,7 +558,7 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input, if (rightTransform) { // Get constant transform matrix B. - auto it = BMatrices.find(key); + auto it = BMatrices.find(fmr); if (it == BMatrices.end()) return {}; const TransformMatrix &BMatrix = it->second; @@ -696,24 +694,26 @@ static Value matrixMultiply(RewriterBase &rewriter, Location loc, /// output /// at [%n, (%h x m), (%w x m), %f] Value outputTransform(RewriterBase &rewriter, Location loc, Value value, - Value output, int64_t m, int64_t r, + Value output, WinogradConv2DFmr fmr, bool leftTransform = true, bool rightTransform = true) { // Map from (m, r) to AT transform matrix. 
- static const llvm::SmallDenseMap + static const llvm::SmallDenseMap ATMatrices = { - {F_2_3, TransformMatrix(AT_2x2_3x3, 2, 4)}, - {F_4_3, TransformMatrix(AT_4x4_3x3, 4, 6, 32)}, - {F_2_5, TransformMatrix(AT_2x2_5x5, 2, 6, 16)}, + {WinogradConv2DFmr::F_2_3, TransformMatrix(AT_2x2_3x3, 2, 4)}, + {WinogradConv2DFmr::F_4_3, TransformMatrix(AT_4x4_3x3, 4, 6, 32)}, + {WinogradConv2DFmr::F_2_5, TransformMatrix(AT_2x2_5x5, 2, 6, 16)}, }; // Map from (m, r) to A transform matrix. - static const llvm::SmallDenseMap + static const llvm::SmallDenseMap AMatrices = { - {F_2_3, TransformMatrix(A_2x2_3x3, 4, 2)}, - {F_4_3, TransformMatrix(A_4x4_3x3, 6, 4, 32)}, - {F_2_5, TransformMatrix(A_2x2_5x5, 6, 2, 16)}, + {WinogradConv2DFmr::F_2_3, TransformMatrix(A_2x2_3x3, 4, 2)}, + {WinogradConv2DFmr::F_4_3, TransformMatrix(A_4x4_3x3, 6, 4, 32)}, + {WinogradConv2DFmr::F_2_5, TransformMatrix(A_2x2_5x5, 6, 2, 16)}, }; + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); auto valueType = cast(value.getType()); Type elementType = valueType.getElementType(); auto valueShape = valueType.getShape(); // H, W, TileH, TileW, N, F @@ -743,9 +743,8 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, FIter, 2, 3, /*loopNorFIdx=*/4, /*loopCorFIdx=*/5, /*heightIdx=*/0, /*widthIdx=*/1); - const TransformMapKeyTy key = {m, r}; - const TransformMatrix &AMatrix = AMatrices.at(key); - const TransformMatrix &ATMatrix = ATMatrices.at(key); + const TransformMatrix &AMatrix = AMatrices.at(fmr); + const TransformMatrix &ATMatrix = ATMatrices.at(fmr); int64_t scalarFactor = (rightTransform ? AMatrix.scalarFactor : 1) * (leftTransform ? ATMatrix.scalarFactor : 1); int64_t retCols = rightTransform ? AMatrix.cols : 1; @@ -903,7 +902,7 @@ static bool hasAllOneValues(DenseIntElementsAttr attr) { /// linalg.winograd_*_transform ops. 
static FailureOr winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, - int64_t m, int64_t r) { + WinogradConv2DFmr fmr) { if (!convOp.hasPureTensorSemantics()) return rewriter.notifyMatchFailure( convOp, "expected pure tensor semantics for linalg.conv_2d_nhwc_fhwc"); @@ -946,6 +945,8 @@ winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, int64_t outputW = outputShape[2]; int64_t outputF = outputShape[3]; + int64_t m, r; + std::tie(m, r) = getFmrFromWinogradConv2DFmr(fmr); // Only support F(m x m, r x r), F(m x 1, r x 1) or F(1 x m, 1 x r). bool isSupportedFilter = false; if (filterH == filterW && filterH == r) @@ -959,17 +960,6 @@ winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, return rewriter.notifyMatchFailure( convOp, "only support filter (r x r), (r x 1) or (1 x r)"); - // Currently, we support (m, r) = (2, 3) or (4, 3) or (2, 5). - static const llvm::SmallVector validConfigs = { - F_2_3, F_4_3, F_2_5}; - - TransformMapKeyTy key = {m, r}; - auto it = llvm::find(validConfigs, key); - // If we cannot find the constant transformation matrix, it means we do - // not support this configuration yet. - if (it == validConfigs.end()) - return failure(); - // All the criterias are satisfied. We can do Winograd Conv2D. 
Location loc = convOp.getLoc(); @@ -993,7 +983,7 @@ winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, Value retValue = rewriter.create(loc, retType.getShape(), filterElementType); auto transformedFilter = rewriter.create( - loc, retType, filter, retValue, m, r); + loc, retType, filter, retValue, fmr); // --- Create operation for input transform --- @@ -1012,7 +1002,7 @@ winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, retValue = rewriter.create(loc, retType.getShape(), inputElementType); auto transformedInput = rewriter.create( - loc, retType, input, retValue, m, r); + loc, retType, input, retValue, fmr); Type outputElementType = outputType.getElementType(); Value matmulRet = matrixMultiply(rewriter, loc, transformedFilter, @@ -1035,7 +1025,7 @@ winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, } Value transformedOutput = rewriter.create( - loc, outputType, matmulRet, output, m, r); + loc, outputType, matmulRet, output, fmr); // When output size is not aligned with output tile size, extract the // value from the padded buffer. @@ -1067,8 +1057,8 @@ decomposeWinogradFilterTransformHelper(RewriterBase &rewriter, // For F(1 x m, 1 x r), we only need to do right side transform. bool rightTransform = filterW != 1; Value transformedFilter = - filterTransform(rewriter, loc, filter, op.getOutput(), op.getM(), - op.getR(), leftTransform, rightTransform); + filterTransform(rewriter, loc, filter, op.getOutput(), op.getFmr(), + leftTransform, rightTransform); if (!transformedFilter) return failure(); @@ -1094,8 +1084,8 @@ decomposeWinogradInputTransformHelper(RewriterBase &rewriter, // For F(1 x m, 1 x r), we only need to do right side transform. 
bool rightTransform = outputW != 1; Value transformedInput = - inputTransform(rewriter, loc, op.getInput(), op.getOutput(), op.getM(), - op.getR(), leftTransform, rightTransform); + inputTransform(rewriter, loc, op.getInput(), op.getOutput(), op.getFmr(), + leftTransform, rightTransform); if (!transformedInput) return failure(); @@ -1120,8 +1110,8 @@ decomposeWinogradOutputTransformHelper(RewriterBase &rewriter, // For F(1 x m, 1 x r), we only need to do right side transform. bool rightTransform = valueW != 1; Value transformedOutput = - outputTransform(rewriter, loc, value, op.getOutput(), op.getM(), - op.getR(), leftTransform, rightTransform); + outputTransform(rewriter, loc, value, op.getOutput(), op.getFmr(), + leftTransform, rightTransform); if (!transformedOutput) return failure(); @@ -1171,28 +1161,28 @@ class WinogradConv2DNhwcFhwc final : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; - WinogradConv2DNhwcFhwc(mlir::MLIRContext *context, int64_t m, int64_t r) - : OpRewritePattern(context), m(m), r(r) {} + WinogradConv2DNhwcFhwc(mlir::MLIRContext *context, WinogradConv2DFmr fmr) + : OpRewritePattern(context), fmr(fmr) {} LogicalResult matchAndRewrite(linalg::Conv2DNhwcFhwcOp convOp, PatternRewriter &rewriter) const override { - if (failed(winogradConv2DHelper(rewriter, convOp, m, r))) + if (failed(winogradConv2DHelper(rewriter, convOp, fmr))) return failure(); return success(); } private: - int64_t m; - int64_t r; + WinogradConv2DFmr fmr; }; + } // end anonymous namespace //===----------------------------------------------------------------------===// FailureOr winogradConv2D(RewriterBase &rewriter, - linalg::Conv2DNhwcFhwcOp op, int64_t m, - int64_t r) { - return winogradConv2DHelper(rewriter, op, m, r); + linalg::Conv2DNhwcFhwcOp op, + linalg::WinogradConv2DFmr fmr) { + return winogradConv2DHelper(rewriter, op, fmr); } FailureOr @@ -1213,11 +1203,11 @@ decomposeWinogradOutputTransformOp(RewriterBase &rewriter, return 
decomposeWinogradOutputTransformHelper(rewriter, op); } -void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m, - int64_t r) { +void populateWinogradConv2DPatterns(RewritePatternSet &patterns, + WinogradConv2DFmr fmr) { MLIRContext *context = patterns.getContext(); // TODO: Support more Conv2D data layout, e.g., conv_2d_nchw_fchw - patterns.insert(context, m, r); + patterns.insert(context, fmr); } void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns) { diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index d56b32193765e..372e83a98ee52 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -1567,11 +1567,23 @@ LogicalResult GlobalOp::verify() { // Check that the type of the initial value is compatible with the type of // the global variable. if (auto elementsAttr = llvm::dyn_cast(initValue)) { - Type initType = elementsAttr.getType(); - Type tensorType = getTensorTypeFromMemRefType(memrefType); - if (initType != tensorType) - return emitOpError("initial value expected to be of type ") - << tensorType << ", but was of type " << initType; + // Check the element types match. + auto initElementType = + cast(elementsAttr.getType()).getElementType(); + auto memrefElementType = memrefType.getElementType(); + + if (initElementType != memrefElementType) + return emitOpError("initial value element expected to be of type ") + << memrefElementType << ", but was of type " << initElementType; + + // Check the shapes match, given that memref globals can only produce + // statically shaped memrefs and elements literal type must have a static + // shape we can assume both types are shaped. 
+ auto initShape = elementsAttr.getShapedType().getShape(); + auto memrefShape = memrefType.getShape(); + if (initShape != memrefShape) + return emitOpError("initial value shape expected to be ") + << memrefShape << " but was " << initShape; } } diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index ddcae8481a5b4..995120ad8680e 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -2047,53 +2047,119 @@ getUntiledConsumerFromSlice(RewriterBase &rewriter, /// A utility to fetch an untiled consumer of /// tensor.insert_slice/tensor.parallel_insert_slice. -static FailureOr -getUntiledConsumerFromSlice(RewriterBase &rewriter, Operation *sliceOp, - MutableArrayRef loops) { +static FailureOr> getUntiledConsumerOperandsFromSlices( + RewriterBase &rewriter, ArrayRef sliceOps, + MutableArrayRef loops) { assert(!loops.empty() && "unexpected empty loops"); - if (auto insertSlice = dyn_cast(sliceOp)) { - return getUntiledConsumerFromSlice(rewriter, insertSlice, loops); - } else if (auto parallelInsertSlice = - dyn_cast(sliceOp)) { - return getUntiledConsumerFromSlice(rewriter, parallelInsertSlice, loops); - } else { - return failure(); + assert(!sliceOps.empty() && "unexpected empty list of candidate slices"); + SmallVector fusedOperands; + for (auto sliceOp : sliceOps) { + FailureOr fusedOperand = + TypeSwitch>(sliceOp) + .Case( + [&](auto op) { + return getUntiledConsumerFromSlice(rewriter, op, loops); + }) + .Default([&](Operation *op) { + return rewriter.notifyMatchFailure(op, "unhandled slice type"); + }); + if (failed(fusedOperand)) { + return failure(); + } + if (!fusedOperands.empty() && + fusedOperand.value()->getOwner() != fusedOperands.front()->getOwner()) { + return rewriter.notifyMatchFailure( + fusedOperand.value()->getOwner(), + "all candidate slices must be to the same consumer"); + } + 
fusedOperands.push_back(fusedOperand.value()); } + return fusedOperands; +} + +template +static tensor::InsertSliceOp cloneAsInsertSlice(RewriterBase &rewriter, + InsertSliceOpTy sliceOp); + +template <> +tensor::InsertSliceOp +cloneAsInsertSlice(RewriterBase &rewriter, + tensor::InsertSliceOp insertSliceOp) { + return cast( + rewriter.clone(*insertSliceOp.getOperation())); +} + +template <> +tensor::InsertSliceOp cloneAsInsertSlice( + RewriterBase &rewriter, tensor::ParallelInsertSliceOp insertSliceOp) { + return rewriter.create( + insertSliceOp->getLoc(), insertSliceOp.getSource(), + insertSliceOp.getDest(), insertSliceOp.getMixedOffsets(), + insertSliceOp.getMixedSizes(), insertSliceOp.getMixedStrides()); +} + +static SmallVector +cloneAsInsertSlices(RewriterBase &rewriter, + ArrayRef candidateSlices) { + assert(!candidateSlices.empty() && + "unexpected empty list of slices to clone"); + SmallVector clonedSlices; + for (auto sliceOp : candidateSlices) { + TypeSwitch(sliceOp) + .Case( + [&](auto op) { + auto clonedOp = cloneAsInsertSlice(rewriter, op); + clonedSlices.push_back(clonedOp); + }) + .Default([&](Operation *op) { + // Assert here assuming this has already been checked. + assert(0 && "unexpected slice type while cloning as insert slice"); + }); + } + return clonedSlices; } /// Implementation of fusing consumer of a single slice by computing the /// slice of the consumer in-place for scf loop. FailureOr -mlir::scf::tileAndFuseConsumerOfSlice( - RewriterBase &rewriter, Operation *candidateSliceOp, +mlir::scf::tileAndFuseConsumerOfSlices( + RewriterBase &rewriter, ArrayRef candidateSlices, MutableArrayRef loops) { + if (candidateSlices.empty()) { + return rewriter.notifyMatchFailure( + rewriter.getUnknownLoc(), + "no candidate slices provided for consumer fusion"); + } // Return if `loops` is empty, return an error for now. Caller is expected // to handle this case. 
if (loops.empty()) { - return candidateSliceOp->emitOpError( + return rewriter.notifyMatchFailure( + candidateSlices.front(), "cannot call tile and fuse consumer with an empty loop nest"); } - if (!isa( - candidateSliceOp)) - return failure(); + + if (!(llvm::all_of(candidateSlices, llvm::IsaPred) || + llvm::all_of(candidateSlices, + llvm::IsaPred))) { + return rewriter.notifyMatchFailure( + candidateSlices.front(), + "candidates slices need to be all `tensor.extract_slice`s or " + "`tensor.parallel_insert_slice`s"); + } // 1. Get the consumer of scf.for for the result yielded by // tensor.insert_slice/parallel_insert_slice. - FailureOr maybeConsumerOpOperand = - getUntiledConsumerFromSlice(rewriter, candidateSliceOp, loops); - if (failed(maybeConsumerOpOperand)) { - return rewriter.notifyMatchFailure(candidateSliceOp, - "could not fetch consumer to fuse"); - } - OpOperand *consumerOpOperand = *maybeConsumerOpOperand; - Operation *consumerOp = consumerOpOperand->getOwner(); - unsigned operandNumber = consumerOpOperand->getOperandNumber(); - unsigned resultNumber = 0; - if (auto producerResult = dyn_cast(consumerOpOperand->get())) { - resultNumber = producerResult.getResultNumber(); - } else { - return rewriter.notifyMatchFailure( - consumerOp, "consumer op's operand doesn't seem to be an OpResult"); + SmallVector consumerOpOperands; + Operation *consumerOp; + { + FailureOr> maybeConsumerOpOperand = + getUntiledConsumerOperandsFromSlices(rewriter, candidateSlices, loops); + if (failed(maybeConsumerOpOperand)) { + return rewriter.notifyMatchFailure(candidateSlices.front(), + "could not fetch consumer to fuse"); + } + std::swap(consumerOpOperands, maybeConsumerOpOperand.value()); + consumerOp = consumerOpOperands.front()->getOwner(); } LoopLikeOpInterface outerMostLoop = loops.front(); @@ -2113,16 +2179,14 @@ mlir::scf::tileAndFuseConsumerOfSlice( if (!dstOp) return rewriter.notifyMatchFailure(consumerOp, "consumer op is not DPS operation"); - SmallVector dpsInits = - 
llvm::map_to_vector(dstOp.getDpsInits(), [](Value v) { return v; }); - if (llvm::is_contained(dpsInits, outerMostLoop->getResult(resultNumber))) { + if (llvm::any_of(consumerOpOperands, [&](OpOperand *opOperand) { + return dstOp.isDpsInit(opOperand); + })) { return rewriter.notifyMatchFailure( consumerOp, "consumer op taking the result of scf.for as init is not supported"); } - SmallVector newInits = dpsInits; - - Location loc = outerMostLoop->getLoc(); + SmallVector newInits = llvm::to_vector(dstOp.getDpsInits()); // 3. Move the whole loop structure right before firstUserOfLoop, the // dominance should be already ensured by `checkAssumptionForLoop`. @@ -2137,43 +2201,52 @@ mlir::scf::tileAndFuseConsumerOfSlice( // tensor.insert_slice. In the scf.for case this is a clone of the // candidateSliceOp whereas in the scf.forall case this is created from the // operands of tensor.parallel_insert_slice. - tensor::InsertSliceOp clonedInsertSliceOp; if (auto sliceOp = - dyn_cast(candidateSliceOp)) { + dyn_cast(candidateSlices.front())) { auto newForallOp = cast(innerMostLoop.getOperation()); rewriter.setInsertionPoint(newForallOp.getTerminator()); - clonedInsertSliceOp = rewriter.create( - loc, sliceOp.getSource(), sliceOp.getDest(), sliceOp.getMixedOffsets(), - sliceOp.getMixedSizes(), sliceOp.getMixedStrides()); } else { - rewriter.setInsertionPoint(candidateSliceOp); - clonedInsertSliceOp = - cast(rewriter.clone(*candidateSliceOp)); + rewriter.setInsertionPoint(candidateSlices.front()); } + // 5.a. Clone all the candidate slices as equivalent insert slice ops. + SmallVector clonedInsertSlices = + cloneAsInsertSlices(rewriter, candidateSlices); - // 5.a. Clone consumer op. + // 5.b. Clone consumer op. 
auto clonedConsumerOp = cast(rewriter.clone(*consumerOp)); + SmallVector operandNumbers = + llvm::map_to_vector(consumerOpOperands, [](OpOperand *opOperand) { + return opOperand->getOperandNumber(); + }); + SmallVector clonedOpFusedOperandsList = + llvm::map_to_vector(operandNumbers, [&](unsigned operandNum) { + return &clonedConsumerOp->getOpOperand(operandNum); + }); - // 5.b. Replace all uses of the loop result with the result of the cloned + // 5.c. Replace all uses of the loop result with the result of the cloned // tensor.insert_slice. - OpOperand &operandToReplace = clonedConsumerOp->getOpOperand(operandNumber); rewriter.modifyOpInPlace(clonedConsumerOp, [&]() { - operandToReplace.set(clonedInsertSliceOp.getResult()); + for (auto [operandToReplace, clonedSliceOp] : + llvm::zip_equal(clonedOpFusedOperandsList, clonedInsertSlices)) { + operandToReplace->set(clonedSliceOp.getResult()); + } }); // 6. Perform tiling of the cloned consumer and replace the operand at // `operandNumber` with the source of the cloned tensor.insert_slice op. - auto ossSliceOp = - cast(clonedInsertSliceOp.getOperation()); FailureOr tileAndFuseResult = - tensor::replaceInsertSliceWithTiledConsumer( - rewriter, ossSliceOp, clonedConsumerOp->getOpOperand(operandNumber)); + tensor::replaceInsertSlicesWithTiledConsumer(rewriter, clonedInsertSlices, + clonedOpFusedOperandsList); if (failed(tileAndFuseResult)) { return failure(); } + auto tiledConsumerOp = cast(tileAndFuseResult->tiledOps[0]); - rewriter.replaceAllUsesWith(tiledConsumerOp->getOperand(operandNumber), - clonedInsertSliceOp.getSource()); + for (auto [operandNum, clonedSliceOp] : + llvm::zip_equal(operandNumbers, clonedInsertSlices)) { + rewriter.replaceAllUsesWith(tiledConsumerOp->getOperand(operandNum), + clonedSliceOp.getSource()); + } // 7. Reconstruct [nested] loop with new inits. YieldTiledValuesFn newYieldValuesFn = @@ -2185,14 +2258,20 @@ mlir::scf::tileAndFuseConsumerOfSlice( // 8. 
Set inner insertPoint right before tiled consumer op. innerRewriter.setInsertionPoint(tiledConsumerOp); - SmallVector offsets = ossSliceOp.getMixedOffsets(); - SmallVector sizes = ossSliceOp.getMixedSizes(); - SmallVector strides = ossSliceOp.getMixedStrides(); + SmallVector> allOffsets, allSizes; + for (auto candidateSliceOp : clonedInsertSlices) { + SmallVector offsets = candidateSliceOp.getMixedOffsets(); + SmallVector sizes = candidateSliceOp.getMixedSizes(); + SmallVector strides = candidateSliceOp.getMixedStrides(); - // 9. Check all insert stride is 1. - if (!llvm::all_of(strides, isOneInteger)) { - return rewriter.notifyMatchFailure( - candidateSliceOp, "containingOp's result yield with stride"); + // 9. Check all insert stride is 1. + if (!llvm::all_of(strides, isOneInteger)) { + return rewriter.notifyMatchFailure( + candidateSliceOp, "containingOp's result yield with stride"); + } + + allOffsets.emplace_back(std::move(offsets)); + allSizes.emplace_back(std::move(sizes)); } // 10. Try to get iter domain position from input position. Use @@ -2202,8 +2281,8 @@ mlir::scf::tileAndFuseConsumerOfSlice( // tiledConsumerOp could lead to some chained unnecessary extra index // computation. SmallVector iterDomainOffsets, iterDomainSizes; - if (failed(clonedConsumerOp.getIterationDomainTileFromOperandTile( - rewriter, operandNumber, offsets, sizes, iterDomainOffsets, + if (failed(clonedConsumerOp.getIterationDomainTileFromOperandTiles( + rewriter, operandNumbers, allOffsets, allSizes, iterDomainOffsets, iterDomainSizes))) { return rewriter.notifyMatchFailure( clonedConsumerOp, @@ -2279,10 +2358,13 @@ mlir::scf::tileAndFuseConsumerOfSlice( // 16. Need to erase the old scf loop and the cloned consumer op. 
rewriter.eraseOp(clonedConsumerOp); + SmallVector tiledAndFusedOpOperands = + llvm::map_to_vector(operandNumbers, [&](unsigned operandNum) { + return &tileAndFuseResult->tiledOps[0]->getOpOperand(operandNum); + }); return scf::SCFFuseConsumerOfSliceResult{ - consumerOpOperand, - &(tileAndFuseResult->tiledOps[0]->getOpOperand(operandNumber)), - tileAndFuseResult->tiledOps}; + std::move(consumerOpOperands), std::move(tiledAndFusedOpOperands), + std::move(tileAndFuseResult->tiledOps)}; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp index 6f33f9b55ceb6..4392a2c0eb839 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp @@ -17,6 +17,9 @@ #include "mlir/Dialect/Tensor/Transforms/Transforms.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Interfaces/TilingInterface.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "tensor-swap-slices" using namespace mlir; @@ -39,21 +42,55 @@ FailureOr tensor::replaceExtractSliceWithTiledProducer( return *tiledResult; } -FailureOr tensor::replaceInsertSliceWithTiledConsumer( - OpBuilder &builder, OffsetSizeAndStrideOpInterface sliceOp, - OpOperand &consumer) { - auto consumerOp = dyn_cast(consumer.getOwner()); +FailureOr tensor::replaceInsertSlicesWithTiledConsumer( + OpBuilder &builder, ArrayRef sliceOps, + ArrayRef consumerOperands) { + if (sliceOps.empty()) { + LLVM_DEBUG( + { llvm::dbgs() << "expected candidate slices list to be non-empty"; }); + return failure(); + } + if (sliceOps.size() != consumerOperands.size()) { + LLVM_DEBUG({ + llvm::dbgs() + << "expected as many operands as the number of slices passed"; + }); + return failure(); + } + auto consumerOp = + 
dyn_cast(consumerOperands.front()->getOwner()); if (!consumerOp) return failure(); + for (auto opOperand : consumerOperands.drop_front()) { + if (opOperand->getOwner() != consumerOp) { + LLVM_DEBUG({ + llvm::dbgs() + << "expected all consumer operands to be from the same operation"; + }); + return failure(); + } + } - // `TilingInterface` currently only supports strides being 1. - if (!llvm::all_of(sliceOp.getMixedStrides(), isOneInteger)) - return failure(); + auto consumerOperandNums = llvm::map_to_vector( + consumerOperands, [](OpOperand *opOperand) -> unsigned { + return opOperand->getOperandNumber(); + }); + SmallVector> allOffsets; + SmallVector> allSizes; + for (auto sliceOp : sliceOps) { + + // `TilingInterface` currently only supports strides being 1. + if (!llvm::all_of(sliceOp.getMixedStrides(), isOneInteger)) + return failure(); + SmallVector offsets = sliceOp.getMixedOffsets(); + SmallVector sizes = sliceOp.getMixedSizes(); + allOffsets.emplace_back(std::move(offsets)); + allSizes.emplace_back(std::move(sizes)); + } FailureOr tiledResult = - consumerOp.getTiledImplementationFromOperandTile( - builder, consumer.getOperandNumber(), sliceOp.getMixedOffsets(), - sliceOp.getMixedSizes()); + consumerOp.getTiledImplementationFromOperandTiles( + builder, consumerOperandNums, allOffsets, allSizes); if (failed(tiledResult)) return failure(); diff --git a/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp b/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp index 7a9f8f4b1b528..12257da878a40 100644 --- a/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp +++ b/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp @@ -19,9 +19,9 @@ using namespace mlir; #include "mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp.inc" DiagnosedSilenceableFailure -transform::DebugEmitRemarkAtOp::apply(transform::TransformRewriter &rewriter, - transform::TransformResults &results, - transform::TransformState &state) { 
+transform::EmitRemarkAtOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { if (isa(getAt().getType())) { auto payload = state.getPayloadOps(getAt()); for (Operation *op : payload) @@ -52,9 +52,10 @@ transform::DebugEmitRemarkAtOp::apply(transform::TransformRewriter &rewriter, return DiagnosedSilenceableFailure::success(); } -DiagnosedSilenceableFailure transform::DebugEmitParamAsRemarkOp::apply( - transform::TransformRewriter &rewriter, - transform::TransformResults &results, transform::TransformState &state) { +DiagnosedSilenceableFailure +transform::EmitParamAsRemarkOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { std::string str; llvm::raw_string_ostream os(str); if (getMessage()) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 5e0f36064be3b..862ed7bae1fbb 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -5856,18 +5856,7 @@ OpFoldResult ShapeCastOp::fold(FoldAdaptor adaptor) { // shape_cast(transpose(x)) -> shape_cast(x) if (auto transpose = getSource().getDefiningOp()) { - // This folder does - // shape_cast(transpose) -> shape_cast - // But another pattern, ConvertIllegalShapeCastOpsToTransposes, does - // shape_cast -> shape_cast(transpose) - // i.e. the complete opposite. When paired, these 2 patterns can cause - // infinite cycles in pattern rewriting. - // ConvertIllegalShapeCastOpsToTransposes only matches on scalable - // vectors, so by disabling this folder for scalable vectors the - // cycle is avoided. - // TODO: Check if ConvertIllegalShapeCastOpsToTransposes is - // still needed. If it's not, then we can fold here. 
- if (!transpose.getType().isScalable() && isOrderPreserving(transpose)) { + if (isOrderPreserving(transpose)) { setOperand(transpose.getVector()); return getResult(); } diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index 997782df8c5f3..4cabac185171c 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -32,7 +32,6 @@ add_mlir_library(MLIRIR PatternMatch.cpp Region.cpp RegionKindInterface.cpp - StateStack.cpp SymbolTable.cpp TensorEncoding.cpp Types.cpp diff --git a/mlir/lib/Support/CMakeLists.txt b/mlir/lib/Support/CMakeLists.txt index 488decd52ae64..02b6c694a28fd 100644 --- a/mlir/lib/Support/CMakeLists.txt +++ b/mlir/lib/Support/CMakeLists.txt @@ -11,6 +11,7 @@ add_mlir_library(MLIRSupport FileUtilities.cpp InterfaceSupport.cpp RawOstreamExtras.cpp + StateStack.cpp StorageUniquer.cpp Timing.cpp ToolUtilities.cpp diff --git a/mlir/lib/IR/StateStack.cpp b/mlir/lib/Support/StateStack.cpp similarity index 92% rename from mlir/lib/IR/StateStack.cpp rename to mlir/lib/Support/StateStack.cpp index 22fdcd73c625b..a9bb3ffb2e1b0 100644 --- a/mlir/lib/IR/StateStack.cpp +++ b/mlir/lib/Support/StateStack.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/IR/StateStack.h" +#include "mlir/Support/StateStack.h" namespace mlir { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 9272f6572fda3..23140f22555a5 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -498,7 +498,13 @@ findAllocaInsertPoint(llvm::IRBuilderBase &builder, allocaInsertPoint = frame.allocaInsertPoint; return WalkResult::interrupt(); }); - if (walkResult.wasInterrupted()) + // In cases with multiple levels of outlining, the tree walk might find an + // alloca insertion point that is inside 
the original function while the + // builder insertion point is inside the outlined function. We need to make + // sure that we do not use it in those cases. + if (walkResult.wasInterrupted() && + allocaInsertPoint.getBlock()->getParent() == + builder.GetInsertBlock()->getParent()) return allocaInsertPoint; // Otherwise, insert to the entry block of the surrounding function. @@ -4378,6 +4384,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::TargetDataInfo info(/*RequiresDevicePointerInfo=*/true, /*SeparateBeginEndCalls=*/true); + bool isTargetDevice = ompBuilder->Config.isTargetDevice(); + bool isOffloadEntry = + isTargetDevice || !ompBuilder->Config.TargetTriples.empty(); LogicalResult result = llvm::TypeSwitch(op) @@ -4467,6 +4476,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (failed(result)) return failure(); + // Pretend we have IF(false) if we're not doing offload. + if (!isOffloadEntry) + ifCond = builder.getFalse(); using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; MapInfoData mapData; diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index e8ce528bd185e..baf7a82b1c24a 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -777,7 +777,7 @@ ModuleTranslation::ModuleTranslation(Operation *module, } ModuleTranslation::~ModuleTranslation() { - if (ompBuilder) + if (ompBuilder && !ompBuilder->isFinalized()) ompBuilder->finalize(); } @@ -2331,6 +2331,10 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, // beforehand. 
translator.debugTranslation->addModuleFlagsIfNotPresent(); + // Call the OpenMP IR Builder callbacks prior to verifying the module + if (auto *ompBuilder = translator.getOpenMPBuilder()) + ompBuilder->finalize(); + if (!disableVerification && llvm::verifyModule(*translator.llvmModule, &llvm::errs())) return nullptr; diff --git a/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp b/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp index 29bc49b78f15d..824201d17b5ab 100644 --- a/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp +++ b/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp @@ -350,8 +350,9 @@ Value CodeGen::genNonInitializerVar(const ast::VariableDecl *varDecl, Value results = builder.create( loc, pdl::RangeType::get(builder.getType()), /*types=*/ArrayAttr()); - return builder.create( - loc, opType.getName(), operands, std::nullopt, ValueRange(), results); + return builder.create(loc, opType.getName(), operands, + ArrayRef(), ValueRange(), + results); } if (ast::RangeType rangeTy = dyn_cast(type)) { diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index ee07081246fc7..b2daabb2a5957 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -171,6 +171,15 @@ ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" DIALECT_NAME transform EXTENSION_NAME transform_pdl_extension) +declare_mlir_dialect_extension_python_bindings( +ADD_TO_PARENT MLIRPythonSources.Dialects +ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" + TD_FILE dialects/TransformDebugExtensionOps.td + SOURCES + dialects/transform/debug.py + DIALECT_NAME transform + EXTENSION_NAME transform_debug_extension) + declare_mlir_dialect_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" diff --git a/mlir/python/mlir/dialects/TransformDebugExtensionOps.td b/mlir/python/mlir/dialects/TransformDebugExtensionOps.td new file mode 100644 index 0000000000000..22a85d2366994 --- /dev/null +++ b/mlir/python/mlir/dialects/TransformDebugExtensionOps.td @@ -0,0 +1,19 @@ +//===-- 
TransformDebugExtensionOps.td - Binding entry point *- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Entry point of the generated Python bindings for the Debug extension of the +// Transform dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef PYTHON_BINDINGS_TRANSFORM_DEBUG_EXTENSION_OPS +#define PYTHON_BINDINGS_TRANSFORM_DEBUG_EXTENSION_OPS + +include "mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td" + +#endif // PYTHON_BINDINGS_TRANSFORM_DEBUG_EXTENSION_OPS diff --git a/mlir/python/mlir/dialects/transform/debug.py b/mlir/python/mlir/dialects/transform/debug.py new file mode 100644 index 0000000000000..f7c04268dc03d --- /dev/null +++ b/mlir/python/mlir/dialects/transform/debug.py @@ -0,0 +1,81 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from typing import Optional + +from ...ir import Attribute, Operation, Value, StringAttr +from .._transform_debug_extension_ops_gen import * +from .._transform_pdl_extension_ops_gen import _Dialect + +try: + from .._ods_common import _cext as _ods_cext +except ImportError as e: + raise RuntimeError("Error loading imports from extension module") from e + +from typing import Union + + +@_ods_cext.register_operation(_Dialect, replace=True) +class EmitParamAsRemarkOp(EmitParamAsRemarkOp): + def __init__( + self, + param: Attribute, + *, + anchor: Optional[Operation] = None, + message: Optional[Union[StringAttr, str]] = None, + loc=None, + ip=None, + ): + if isinstance(message, str): + message = StringAttr.get(message) + + super().__init__( + param, + anchor=anchor, + message=message, + loc=loc, + ip=ip, + ) + + +def emit_param_as_remark( + param: Attribute, + *, + anchor: Optional[Operation] = None, + message: Optional[Union[StringAttr, str]] = None, + loc=None, + ip=None, +): + return EmitParamAsRemarkOp(param, anchor=anchor, message=message, loc=loc, ip=ip) + + +@_ods_cext.register_operation(_Dialect, replace=True) +class EmitRemarkAtOp(EmitRemarkAtOp): + def __init__( + self, + at: Union[Operation, Value], + message: Optional[Union[StringAttr, str]] = None, + *, + loc=None, + ip=None, + ): + if isinstance(message, str): + message = StringAttr.get(message) + + super().__init__( + at, + message, + loc=loc, + ip=ip, + ) + + +def emit_remark_at( + at: Union[Operation, Value], + message: Optional[Union[StringAttr, str]] = None, + *, + loc=None, + ip=None, +): + return EmitRemarkAtOp(at, message, loc=loc, ip=ip) diff --git a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir index e4f7ea150c850..5fc29c6442602 100644 --- a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir +++ b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir @@ -17,14 +17,28 @@ func.func 
@vector_arm_neon_mixed_types(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi4 // ----- -// CHECK-LABEL: vector_arm_neon_same_types -// CHECK-SAME: %[[A0:.*]]: vector<2x8xi8>, %[[A1:.*]]: vector<2x8xi8>, %[[A2:.*]]: vector<2x2xi32> -// CHECK-DAG: %[[D0:.*]] = vector.shape_cast %[[A0]] : vector<2x8xi8> to vector<16xi8> -// CHECK-DAG: %[[D1:.*]] = vector.shape_cast %[[A1]] : vector<2x8xi8> to vector<16xi8> -// CHECK-DAG: %[[D2:.*]] = vector.shape_cast %[[A2]] : vector<2x2xi32> to vector<4xi32> -// CHECK-DAG: %[[D3:.*]] = arm_neon.intr.smmla %[[D2]], %[[D0]], %[[D1]] : vector<16xi8> to vector<4xi32> -// CHECK-DAG: %[[D4:.*]] = vector.shape_cast %[[D3]] : vector<4xi32> to vector<2x2xi32> -func.func @vector_arm_neon_same_types(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { +// CHECK-LABEL: vector_arm_neon_implicit_extsi +// CHECK-SAME: %[[LHS:.+]]: vector<2x8xi8>, %[[RHS:.+]]: vector<2x8xi8>, %[[ACC:.+]]: vector<2x2xi32> +// CHECK: %[[L:.+]] = vector.shape_cast %[[LHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[R:.+]] = vector.shape_cast %[[RHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[A:.+]] = vector.shape_cast %[[ACC]] : vector<2x2xi32> to vector<4xi32> +// CHECK: %[[M:.+]] = arm_neon.intr.smmla %[[A]], %[[L]], %[[R]] : vector<16xi8> to vector<4xi32> +// CHECK: %{{.+}} = vector.shape_cast %[[M]] : vector<4xi32> to vector<2x2xi32> +func.func @vector_arm_neon_implicit_extsi(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { + %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs, %rhs, %acc : vector<2x8xi8>, vector<2x8xi8> into vector<2x2xi32> + return %res : vector<2x2xi32> +} + +// ----- + +// CHECK-LABEL: vector_arm_neon_signed_signed +// CHECK-SAME: %[[LHS:.+]]: vector<2x8xi8>, %[[RHS:.+]]: 
vector<2x8xi8>, %[[ACC:.+]]: vector<2x2xi32> +// CHECK: %[[L:.+]] = vector.shape_cast %[[LHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[R:.+]] = vector.shape_cast %[[RHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[A:.+]] = vector.shape_cast %[[ACC]] : vector<2x2xi32> to vector<4xi32> +// CHECK: %[[M:.+]] = arm_neon.intr.smmla %[[A]], %[[L]], %[[R]] : vector<16xi8> to vector<4xi32> +// CHECK: %{{.+}} = vector.shape_cast %[[M]] : vector<4xi32> to vector<2x2xi32> +func.func @vector_arm_neon_signed_signed(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { %lhs_extsi = arith.extsi %lhs : vector<2x8xi8> to vector<2x8xi32> %rhs_extsi = arith.extsi %rhs : vector<2x8xi8> to vector<2x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> @@ -33,11 +47,51 @@ func.func @vector_arm_neon_same_types(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8> // ----- -// CHECK-LABEL: vector_arm_neon_without_extsi -// CHECK-SAME: %[[A0:.*]]: vector<2x8xi32>, %[[A1:.*]]: vector<2x8xi32>, %[[A2:.*]]: vector<2x2xi32> -// CHECK-DAG: %[[D0:.*]] = vector.contract -func.func @vector_arm_neon_without_extsi(%lhs: vector<2x8xi32>, %rhs: vector<2x8xi32>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { - %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs, %rhs, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> +// CHECK-LABEL: vector_arm_neon_unsigned_signed +// CHECK-SAME: %[[LHS:.+]]: vector<2x8xi8>, %[[RHS:.+]]: vector<2x8xi8>, %[[ACC:.+]]: vector<2x2xi32> +// CHECK: %[[L:.+]] = vector.shape_cast 
%[[LHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[R:.+]] = vector.shape_cast %[[RHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[A:.+]] = vector.shape_cast %[[ACC]] : vector<2x2xi32> to vector<4xi32> +// CHECK: %[[M:.+]] = arm_neon.intr.usmmla %[[A]], %[[L]], %[[R]] : vector<16xi8> to vector<4xi32> +// CHECK: %{{.+}} = vector.shape_cast %[[M]] : vector<4xi32> to vector<2x2xi32> +func.func @vector_arm_neon_unsigned_signed(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { + %lhs_extsi = arith.extui %lhs : vector<2x8xi8> to vector<2x8xi32> + %rhs_extsi = arith.extsi %rhs : vector<2x8xi8> to vector<2x8xi32> + %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> + return %res : vector<2x2xi32> +} + +// ----- + +// CHECK-LABEL: vector_arm_neon_unsigned_unsigned +// CHECK-SAME: %[[LHS:.+]]: vector<2x8xi8>, %[[RHS:.+]]: vector<2x8xi8>, %[[ACC:.+]]: vector<2x2xi32> +// CHECK: %[[L:.+]] = vector.shape_cast %[[LHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[R:.+]] = vector.shape_cast %[[RHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[A:.+]] = vector.shape_cast %[[ACC]] : vector<2x2xi32> to vector<4xi32> +// CHECK: %[[M:.+]] = arm_neon.intr.ummla %[[A]], %[[L]], %[[R]] : vector<16xi8> to vector<4xi32> +// CHECK: %{{.+}} = vector.shape_cast %[[M]] : vector<4xi32> to vector<2x2xi32> +func.func @vector_arm_neon_unsigned_unsigned(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { + %lhs_extsi = arith.extui %lhs : vector<2x8xi8> to vector<2x8xi32> + %rhs_extsi = arith.extui %rhs : vector<2x8xi8> to vector<2x8xi32> + %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> 
(d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> + return %res : vector<2x2xi32> +} + +// ----- + +// CHECK-LABEL: vector_arm_neon_signed_unsigned +// CHECK-SAME: %[[LHS:.+]]: vector<2x8xi8>, %[[RHS:.+]]: vector<2x8xi8>, %[[ACC:.+]]: vector<2x2xi32> +// CHECK: %[[ACC_T:.+]] = vector.transpose %[[ACC]], [1, 0] : vector<2x2xi32> to vector<2x2xi32> +// CHECK: %[[L:.+]] = vector.shape_cast %[[LHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[R:.+]] = vector.shape_cast %[[RHS]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[A:.+]] = vector.shape_cast %[[ACC_T]] : vector<2x2xi32> to vector<4xi32> +// CHECK: %[[M:.+]] = arm_neon.intr.usmmla %[[A]], %[[R]], %[[L]] : vector<16xi8> to vector<4xi32> +// CHECK: %[[OUT_T:.+]] = vector.shape_cast %[[M]] : vector<4xi32> to vector<2x2xi32> +// CHECK: %{{.+}} = vector.transpose %[[OUT_T]], [1, 0] : vector<2x2xi32> to vector<2x2xi32> +func.func @vector_arm_neon_signed_unsigned(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { + %lhs_extsi = arith.extsi %lhs : vector<2x8xi8> to vector<2x8xi32> + %rhs_extsi = arith.extui %rhs : vector<2x8xi8> to vector<2x8xi32> + %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> return %res : vector<2x2xi32> } diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir index d56df9814f173..6cdf576272ebc 100644 --- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir +++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir @@ -491,51 +491,6 @@ func.func 
@illegal_transpose_no_defining_source_op(%vec: vector<[4]x1xf32>) -> v // ----- -// CHECK-LABEL: @illegal_shape_cast_to_transpose_2d( -// CHECK-SAME: %[[VEC:.*]]: vector<[4]x1xf32>) -func.func @illegal_shape_cast_to_transpose_2d(%vec: vector<[4]x1xf32>) -> vector<1x[4]xf32> { - // CHECK: vector.transpose %[[VEC]], [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32> - %0 = vector.shape_cast %vec : vector<[4]x1xf32> to vector<1x[4]xf32> - return %0 : vector<1x[4]xf32> -} - -// ----- - -// CHECK-LABEL: @illegal_shape_cast_to_transpose_1d( -// CHECK-SAME: %[[VEC:.*]]: vector<[4]x1xf32>) -func.func @illegal_shape_cast_to_transpose_1d(%vec: vector<[4]x1xf32>) -> vector<[4]xf32> { - // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[VEC]], [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32> - // CHECK: vector.shape_cast %[[TRANSPOSE]] : vector<1x[4]xf32> to vector<[4]xf32> - %0 = vector.shape_cast %vec : vector<[4]x1xf32> to vector<[4]xf32> - return %0 : vector<[4]xf32> -} - -// ----- - -// CHECK-LABEL: @lift_illegal_2d_shape_cast_to_memory -func.func @lift_illegal_2d_shape_cast_to_memory(%a: index, %b: index, %memref: memref) -> vector<1x[4]xf32> { - // CHECK: vector.transfer_read {{.*}} : memref, vector<1x[4]xf32> - // CHECK-NOT: vector.shape_cast - %pad = arith.constant 0.0 : f32 - %illegalRead = vector.transfer_read %memref[%a, %b], %pad {in_bounds = [false, true]}: memref, vector<[4]x1xf32> - %cast = vector.shape_cast %illegalRead : vector<[4]x1xf32> to vector<1x[4]xf32> - return %cast : vector<1x[4]xf32> -} - -// ----- - -// CHECK-LABEL: @lift_illegal_1d_shape_cast_to_memory -func.func @lift_illegal_1d_shape_cast_to_memory(%a: index, %b: index, %memref: memref) -> vector<[4]xf32> { - // CHECK: vector.transfer_read {{.*}} : memref, vector<1x[4]xf32> - // CHECK-NOT: vector.shape_cast {{.*}} : vector<[4]x1xf32> to vector<[4]xf32> - %pad = arith.constant 0.0 : f32 - %illegalRead = vector.transfer_read %memref[%a, %b], %pad {in_bounds = [false, true]}: memref, vector<[4]x1xf32> - 
%cast = vector.shape_cast %illegalRead : vector<[4]x1xf32> to vector<[4]xf32> - return %cast : vector<[4]xf32> -} - -// ----- - // CHECK-LABEL: @multi_tile_splat func.func @multi_tile_splat() -> vector<[8]x[8]xi32> { @@ -656,3 +611,59 @@ func.func @vector_mask_without_maskable_op(%mask: vector<16x2xi1>, %vec: vector< %0 = vector.mask %mask { vector.yield %vec : vector<16x16xf32> } : vector<16x2xi1> -> vector<16x16xf32> return %0 : vector<16x16xf32> } + +// ----- + +//============================================================================= +// 1D examples - to be moved to the SVE dialect +//============================================================================= + +/// TODO: Handle in_bounds + +// CHECK-LABEL: func.func @xfer_read_scalable_column( +// CHECK-SAME: %[[IDX_0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[IDX_1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[PAD:.*]]: f32, +// CHECK-SAME: %[[SRC:.*]]: memref) -> vector<[4]x1xf32> { +func.func @xfer_read_scalable_column(%a: index, %b: index, %pad: f32, %src: memref) -> (vector<[4]x1xf32>) { + // CHECK: %[[INIT:.*]] = arith.constant dense<0.000000e+00> : vector<[4]xf32> + // CHECK: %[[STEP:.*]] = arith.constant 1 : index + // CHECK: %[[C4:.*]] = arith.constant 4 : index + // CHECK: %[[LB:.*]] = arith.constant 0 : index + // CHECK: %[[VSCALE:.*]] = vector.vscale + // CHECK: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index + + // + // CHECK: %[[SCF:.*]] = scf.for %[[IND_VAR:.*]] = %[[LB]] to %[[C4_VSCALE]] step %[[STEP]] iter_args(%[[SCF_RES:.*]] = %[[INIT]]) -> (vector<[4]xf32>) { + // CHECK: %[[IDX_0_UPDATED:.*]] = arith.addi %[[IND_VAR]], %[[IDX_0]] : index + // CHECK: %[[VAL_10:.*]] = memref.load %[[SRC]][%[[IDX_0_UPDATED]], %[[IDX_1]]] : memref + // CHECK: %[[RES_UPDATED:.*]] = vector.insert %[[VAL_10]], %[[SCF_RES]] [%[[IND_VAR]]] : f32 into vector<[4]xf32> + // CHECK: scf.yield %[[RES_UPDATED]] : vector<[4]xf32> + // CHECK: } + + // + // CHECK: %[[SC:.*]] = vector.shape_cast %[[SCF]] : 
vector<[4]xf32> to vector<[4]x1xf32> + // CHECK: return %[[SC]] + %read = vector.transfer_read %src[%a, %b], %pad : memref, vector<[4]x1xf32> + return %read : vector<[4]x1xf32> +} + +// ----- + +// CHECK-LABEL: func.func @negative_xfer_read_scalable_column_x2 +func.func @negative_xfer_read_scalable_column_x2(%a: index, %b: index, %pad: f32, %src: memref) -> (vector<[4]x2xf32>) { + // CHECK-NOT: scf.for + // CHECK-NOT: memref.load + %read = vector.transfer_read %src[%a, %b], %pad : memref, vector<[4]x2xf32> + return %read : vector<[4]x2xf32> +} + +// ----- + +// CHECK-LABEL: func.func @negative_xfer_read_scalable_column_scalable_trailing_dim +func.func @negative_xfer_read_scalable_column_scalable_trailing_dim(%a: index, %b: index, %pad: f32, %src: memref) -> (vector<4x[1]xf32>) { + // CHECK-NOT: scf.for + // CHECK-NOT: memref.load + %read = vector.transfer_read %src[%a, %b], %pad : memref, vector<4x[1]xf32> + return %read : vector<4x[1]xf32> +} diff --git a/mlir/test/Dialect/ArmSVE/legalize-transfer-read.mlir b/mlir/test/Dialect/ArmSVE/legalize-transfer-read.mlir new file mode 100644 index 0000000000000..5f923cdafb956 --- /dev/null +++ b/mlir/test/Dialect/ArmSVE/legalize-transfer-read.mlir @@ -0,0 +1,257 @@ +// RUN: mlir-opt --arm-sve-legalize-vector-storage --split-input-file %s | FileCheck %s + + +// Test the `LegalizeTransferRead` pattern +// (mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp) + +// ----- + +// This is the base case, unremarkable in any way, except that it's our main +// motivating example and use case. 
+ +// CHECK-LABEL: @base_case +// CHECK-SAME: %[[I:.+]]: index, %[[J:.+]]: index, %[[M:.+]]: +// CHECK: %[[PAD:.+]] = arith.constant 0 : i8 +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %[[M]] +// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]] +// CHECK-SAME: : memref into memref +// CHECK-NEXT: %[[T0:.+]] = vector.transfer_read %[[COLLAPSE]][%[[I]], %[[J]], %[[C0]]], %[[PAD]] {in_bounds = [true]} +// CHECK-SAME: : memref, vector<[32]xi8> +// CHECK-NEXT: %[[T1:.+]] = vector.shape_cast %[[T0]] : vector<[32]xi8> to vector<[4]x8xi8> +// CHECK-NEXT: return %[[T1]] : vector<[4]x8xi8> + +func.func @base_case(%i : index, %j : index, %M : memref) -> vector<[4]x8xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<[4]x8xi8> + + return %A : vector<[4]x8xi8> +} + +// ----- + +// Test the case where the scalable dimension is not the second-to-last. + +// CHECK-LABEL: @with_3d_vector +// CHECK-SAME: %[[I:.+]]: index, %[[J:.+]]: index, %[[M:.+]]: +// CHECK: %[[PAD:.+]] = arith.constant 0 : i8 +// CHECK: %[[COLLAPSED:.+]] = memref.collapse_shape %[[M]] +// CHECK-SAME{LITERAL}: [[0], [1, 2, 3]] +// CHECK-SAME: : memref into memref +// CHECK-NEXT: %[[T0:.+]] = vector.transfer_read %[[COLLAPSED]][%[[I]], %[[J]]], %[[PAD]] {in_bounds = [true]} +// CHECK-SAME: : memref, vector<[64]xi8> +// CHECK-NEXT: %[[T1:.+]] = vector.shape_cast %[[T0]] : vector<[64]xi8> to vector<[4]x2x8xi8> +// CHECK-NEXT: return %[[T1]] : vector<[4]x2x8xi8> + +func.func @with_3d_vector(%i : index, %j : index, %M : memref) -> vector<[4]x2x8xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref, vector<[4]x2x8xi8> + + return %A : vector<[4]x2x8xi8> +} + +// ----- + +// Test the case when the vector is already LLVM-legal (fixed). 
+ +// CHECK-LABEL: @negative_vector_legal_fixed +// CHECK-NOT: memref.collapse + +func.func @negative_vector_legal_fixed(%i : index, %j : index, %M : memref) -> vector<8x8xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<8x8xi8> + + return %A : vector<8x8xi8> +} + +// ----- + +// Test the case when the vector is already LLVM-legal (single-dimension scalable). + +// CHECK-LABEL: @negative_vector_legal_1d_scalable +// CHECK-NOT: memref.collapse + +func.func @negative_vector_legal_1d_scalable(%i : index, %j : index, %M : memref) -> vector<[8]xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true]} : memref, vector<[8]xi8> + + return %A : vector<[8]xi8> +} + +// ----- + +// Test the case when the vector is already LLVM-legal (single trailing +// scalable dimension). + +// CHECK-LABEL: @negative_vector_legal_trailing_scalable_dim +// CHECK-NOT: memref.collapse + +func.func @negative_vector_legal_trailing_scalable_dim(%i : index, %j : index, %M : memref) -> vector<8x[8]xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<8x[8]xi8> + + return %A : vector<8x[8]xi8> +} + +// ----- + +// Test the case of unsupported vector type (more than one scalable dimension) + +// CHECK-LABEL: @negative_vector_type_two_scalable_dims +// CHECK-NOT: memref.collapse + +func.func @negative_vector_type_two_scalable_dims(%i : index, %j : index, %M : memref) -> vector<[8]x[8]x8xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref, vector<[8]x[8]x8xi8> + + return %A : vector<[8]x[8]x8xi8> +} + +// ----- + +// Test the case of reading from a tensor 
- not supported, since the +// transform reasons about memory layouts. + +// CHECK-LABEL: @negative_tensor_transfer +// CHECK-NOT: memref.collapse + +func.func @negative_tensor_transfer(%i : index, %j : index, %M : tensor) -> vector<[4]x8xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true]} : tensor, vector<[4]x8xi8> + + return %A : vector<[4]x8xi8> +} + +// ----- + +// Test the case when the transfer is discontiguous because the memref +// is discontiguous. +// There are other ways to make a memref discontiguous. The transformation +// is not concerned with the particular reason a memref is discontiguous, but +// only with the fact. Therefore there are no variations with the memref made +// discontiguous by some other mechanism. + +// CHECK-LABEL: @negative_discontig_mem +// CHECK-NOT: memref.collapse + +#strides = strided<[?, ?, 16, 1]> + +func.func @negative_discontig_mem(%i : index, %j : index, %M : memref) -> vector<[4]x8xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<[4]x8xi8> + + return %A : vector<[4]x8xi8> +} + +// ----- + +// Test the case when the transformation is not applied because of +// a non-trivial permutation map (broadcast). + +// CHECK-LABEL: @negative_broadcast +// CHECK-NOT: memref.collapse + +#perm = affine_map<(i, j, k, p) -> (k, 0)> + +func.func @negative_broadcast(%i : index, %j : index, %M : memref) -> vector<[4]x8xi8> { + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {permutation_map = #perm, in_bounds = [true, true] } : memref, vector<[4]x8xi8> + + return %A : vector<[4]x8xi8> +} + +// ----- + +// Test the case of a masked read - not supported right now. 
+// (see mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp) + +// CHECK-LABEL: @negative_masked +// CHECK-NOT: memref.collapse + +func.func @negative_masked( + %i : index, %j : index, + %M : memref, %mask : vector<[4]x8xi1>) -> vector<[4]x8xi8> { + + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.mask %mask { + vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true] } : memref, vector<[4]x8xi8> + } : vector<[4]x8xi1> -> vector<[4]x8xi8> + + return %A : vector<[4]x8xi8> +} + +// ----- + +// Test case with a mask operand - not supported right now. +// (see mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp) + +// CHECK-LABEL: @negative_with_mask +// CHECK-NOT: memref.collapse + +func.func @negative_with_mask( + %i : index, %j : index, + %M : memref, %mask : vector<[4]x8xi1>) -> vector<[4]x8xi8> { + + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad, %mask {in_bounds = [true, true] } : memref, vector<[4]x8xi8> + + return %A : vector<[4]x8xi8> +} + +// ----- + +// Test the case when the dimensions to collapse (excluding the scalable one) +// of the vector and the memref do not match (static non matching dimension). + +// CHECK-LABEL: @negative_non_matching_dim_static +// CHECK-NOT: memref.collapse + +func.func @negative_non_matching_dim_static(%i : index, %j : index, %M : memref) -> vector<[4]x4xi8> { + + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true] } : memref, vector<[4]x4xi8> + + return %A : vector<[4]x4xi8> +} + +// ----- + +// Test the case when the dimensions to collapse (excluding the scalable one) +// of the vector and the memref do not match (dynamic non matching dimension). 
+ +// CHECK-LABEL: @negative_non_matching_dim_dynamic +// CHECK-NOT: memref.collapse + +func.func @negative_non_matching_dim_dynamic(%i : index, %j : index, %M : memref) -> vector<[4]x4xi8> { + + %c0 = arith.constant 0 : index + %pad = arith.constant 123 : i8 + + %A = vector.transfer_read %M[%i, %j, %c0, %c0], %pad {in_bounds = [true, true] } : memref, vector<[4]x4xi8> + + return %A : vector<[4]x4xi8> +} diff --git a/mlir/test/Dialect/Linalg/hoisting.mlir b/mlir/test/Dialect/Linalg/hoisting.mlir index 318edca73cce1..8be4e1b79c52c 100644 --- a/mlir/test/Dialect/Linalg/hoisting.mlir +++ b/mlir/test/Dialect/Linalg/hoisting.mlir @@ -1,76 +1,210 @@ // RUN: mlir-opt -transform-interpreter -canonicalize --split-input-file --allow-unregistered-dialect %s | FileCheck %s -// CHECK-LABEL: func @hoist_vector_transfer_pairs( -// CHECK-SAME: %[[MEMREF0:[a-zA-Z0-9]*]]: memref, -// CHECK-SAME: %[[MEMREF1:[a-zA-Z0-9]*]]: memref, -// CHECK-SAME: %[[MEMREF2:[a-zA-Z0-9]*]]: memref, -// CHECK-SAME: %[[MEMREF3:[a-zA-Z0-9]*]]: memref, -// CHECK-SAME: %[[MEMREF4:[a-zA-Z0-9]*]]: memref, -// CHECK-SAME: %[[MEMREF5:[a-zA-Z0-9]*]]: memref, -// CHECK-SAME: %[[VAL:[a-zA-Z0-9]*]]: index, -// CHECK-SAME: %[[LB:[a-zA-Z0-9]*]]: index, -// CHECK-SAME: %[[UB:[a-zA-Z0-9]*]]: index, -// CHECK-SAME: %[[STEP:[a-zA-Z0-9]*]]: index, -// CHECK-SAME: %[[CMP:[a-zA-Z0-9]*]]: i1 -func.func @hoist_vector_transfer_pairs( - %memref0: memref, %memref1: memref, %memref2: memref, - %memref3: memref, %memref4: memref, %memref5: memref, - %val: index, %lb : index, %ub : index, %step: index, %cmp: i1) { +///---------------------------------------------------------------------------------------- +/// Tests for vector.transfer_read + vector.transfer_write pairs +/// +/// * Nested in double loops +// * Indices depend on induction variables +///---------------------------------------------------------------------------------------- + +// CHECK-LABEL: func @mem_use_outside +// CHECK-SAME: %[[MEM:[a-zA-Z0-9]+]]: memref, +// 
CHECK-SAME: %[[LB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[UB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[STEP:[a-zA-Z0-9]+]]: index) +func.func @mem_use_outside(%mem: memref, %lb : index, %ub : index, %step: index) { + %pad = arith.constant 0.0 : f32 + +// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: scf.for %[[I:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: %[[READ:.*]] = vector.transfer_read %[[MEM]][%[[I]], %[[I]]], %[[PAD]] : memref, vector<1xf32> +// CHECK: %[[SCF:.*]] = scf.for %[[J:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] iter_args(%[[VAL_5:.*]] = %[[READ]]) -> (vector<1xf32>) { +// CHECK: %[[USE:.*]] = "val_use"(%[[VAL_5]]) : (vector<1xf32>) -> vector<1xf32> +// CHECK: scf.yield %[[USE]] : vector<1xf32> +// CHECK: } +// CHECK: vector.transfer_write %[[SCF]], %[[MEM]][%[[I]], %[[I]]] : vector<1xf32>, memref +// CHECK: "mem_use"(%[[MEM]]) : (memref) -> () +// CHECK: } + scf.for %i = %lb to %ub step %step { + scf.for %j = %lb to %ub step %step { + %read = vector.transfer_read %mem[%i, %i], %pad: memref, vector<1xf32> + %use = "val_use"(%read) : (vector<1xf32>) -> vector<1xf32> + vector.transfer_write %use, %mem[%i, %i] : vector<1xf32>, memref + } + } + "mem_use"(%mem) : (memref) -> () + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @mem_use_inside_outer_loop +// CHECK-SAME: %[[MEM:[a-zA-Z0-9]+]]: memref, +// CHECK-SAME: %[[LB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[UB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[STEP:[a-zA-Z0-9]+]]: index) +func.func @mem_use_inside_outer_loop(%mem: memref, %lb : index, %ub : index, %step: index) { + %pad = arith.constant 
0.0 : f32 + +// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: scf.for %[[I:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: %[[READ:.*]] = vector.transfer_read %[[MEM]]{{\[}}%[[I]], %[[I]]], %[[PAD]] : memref, vector<1xf32> +// CHECK: %[[SCF:.*]] = scf.for %[[J:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] iter_args(%[[VAL_5:.*]] = %[[READ]]) -> (vector<1xf32>) { +// CHECK: %[[USE:.*]] = "val_use"(%[[VAL_5]]) : (vector<1xf32>) -> vector<1xf32> +// CHECK: scf.yield %[[USE]] : vector<1xf32> +// CHECK: } +// CHECK: vector.transfer_write %[[SCF]], %[[MEM]]{{\[}}%[[I]], %[[I]]] : vector<1xf32>, memref +// CHECK: "mem_use"(%[[MEM]]) : (memref) -> () +// CHECK: } + scf.for %i = %lb to %ub step %step { + scf.for %j = %lb to %ub step %step { + %read = vector.transfer_read %mem[%i, %i], %pad: memref, vector<1xf32> + %use = "val_use"(%read) : (vector<1xf32>) -> vector<1xf32> + vector.transfer_write %use, %mem[%i, %i] : vector<1xf32>, memref + } + "mem_use"(%mem) : (memref) -> () + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +///---------------------------------------------------------------------------------------- +/// Tests for vector.transfer_read + vector.transfer_write pairs +/// +/// * Nested in double loops +// * Indices are constant +///---------------------------------------------------------------------------------------- + +// CHECK-LABEL: func @negative_mem_use_inside_inner_loop_before_write +// CHECK-SAME: %[[MEM:[a-zA-Z0-9]+]]: memref, +// CHECK-SAME: %[[LB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[UB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[STEP:[a-zA-Z0-9]+]]: index) +func.func 
@negative_mem_use_inside_inner_loop_before_write(%mem: memref, %lb : index, %ub : index, %step: index) { %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 + %pad = arith.constant 0.0 : f32 + +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: scf.for %[[I:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: scf.for %[[J:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: %[[READ:.*]] = vector.transfer_read %[[MEM]][%[[C0]], %[[C0]]], %[[PAD]] : memref, vector<1xf32> +// CHECK: %[[USE:.*]] = "val_use"(%[[READ]]) : (vector<1xf32>) -> vector<1xf32> +// CHECK: "mem_use"(%[[MEM]]) : (memref) -> () +// CHECK: vector.transfer_write %[[USE]], %[[MEM]][%[[C0]], %[[C0]]] : vector<1xf32>, memref +// CHECK: } +// CHECK: } + scf.for %i = %lb to %ub step %step { + scf.for %j = %lb to %ub step %step { + %read = vector.transfer_read %mem[%c0, %c0], %pad: memref, vector<1xf32> + %use = "val_use"(%read) : (vector<1xf32>) -> vector<1xf32> + "mem_use"(%mem) : (memref) -> () + vector.transfer_write %use, %mem[%c0, %c0] : vector<1xf32>, memref + } + } + return +} -// CHECK: vector.transfer_read %{{.*}} : memref, vector<1xf32> -// CHECK: scf.for %[[I:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] iter_args({{.*}}) -> (vector<1xf32>) { -// CHECK: vector.transfer_read %{{.*}} : memref, vector<2xf32> -// CHECK: scf.for %[[J:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] iter_args({{.*}}) -> (vector<1xf32>, vector<2xf32>) { -// CHECK: vector.transfer_read %{{.*}} : memref, vector<3xf32> -// CHECK: vector.transfer_read %{{.*}} : memref, vector<4xf32> -// CHECK: "some_crippling_use"(%[[MEMREF4]]) : (memref) -> () -// CHECK: vector.transfer_read %{{.*}} : memref, vector<5xf32> -// CHECK: "some_use"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32> -// CHECK: "some_use"(%{{.*}}) : (vector<2xf32>) -> vector<2xf32> -// CHECK: "some_use"(%[[MEMREF2]], %{{.*}}) : (memref, vector<3xf32>) -> vector<3xf32> -// CHECK: "some_use"(%{{.*}}) 
: (vector<4xf32>) -> vector<4xf32> -// CHECK: "some_use"(%{{.*}}) : (vector<5xf32>) -> vector<5xf32> -// CHECK: vector.transfer_write %{{.*}} : vector<3xf32>, memref -// CHECK: vector.transfer_write %{{.*}} : vector<4xf32>, memref -// CHECK: vector.transfer_write %{{.*}} : vector<5xf32>, memref -// CHECK: "some_crippling_use"(%[[MEMREF3]]) : (memref) -> () -// CHECK: scf.yield {{.*}} : vector<1xf32>, vector<2xf32> +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @negative_mem_use_inside_inner_loop_after_write +// CHECK-SAME: %[[MEM:[a-zA-Z0-9]+]]: memref, +// CHECK-SAME: %[[LB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[UB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[STEP:[a-zA-Z0-9]+]]: index) +func.func @negative_mem_use_inside_inner_loop_after_write(%mem: memref, %lb : index, %ub : index, %step: index) { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f32 + +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: scf.for %[[I:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: scf.for %[[J:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: %[[READ:.*]] = vector.transfer_read %[[MEM]][%[[C0]], %[[C0]]], %[[PAD]] : memref, vector<1xf32> +// CHECK: %[[USE:.*]] = "val_use"(%[[READ]]) : (vector<1xf32>) -> vector<1xf32> +// CHECK: vector.transfer_write %[[USE]], %[[MEM]][%[[C0]], %[[C0]]] : vector<1xf32>, memref +// CHECK: "mem_use"(%[[MEM]]) : (memref) -> () +// CHECK: } +// CHECK: } + scf.for %i = %lb to %ub step %step { + scf.for %j = %lb to %ub step %step { + %r3 = vector.transfer_read %mem[%c0, %c0], %pad: memref, 
vector<1xf32> + %u3 = "val_use"(%r3) : (vector<1xf32>) -> vector<1xf32> + vector.transfer_write %u3, %mem[%c0, %c0] : vector<1xf32>, memref + "mem_use"(%mem) : (memref) -> () + } + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @negative_mem_use_inside_inner_loop_before_read +// CHECK-SAME: %[[MEM:[a-zA-Z0-9]+]]: memref, +// CHECK-SAME: %[[LB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[UB:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[STEP:[a-zA-Z0-9]+]]: index) +func.func @negative_mem_use_inside_inner_loop_before_read(%mem: memref, %lb : index, %ub : index, %step: index) { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f32 + +// CHECK: scf.for %[[I:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: scf.for %[[J:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: "mem_use"(%[[MEM]]) : (memref) -> () +// CHECK: vector.transfer_read %{{.*}} : memref, vector<1xf32> +// CHECK: "val_use"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32> +// CHECK: vector.transfer_write %{{.*}} : vector<1xf32>, memref // CHECK: } -// CHECK: vector.transfer_write %{{.*}} : vector<2xf32>, memref -// CHECK: "unrelated_use"(%[[MEMREF0]]) : (memref) -> () -// CHECK: scf.yield {{.*}} : vector<1xf32> // CHECK: } -// CHECK: vector.transfer_write %{{.*}} : vector<1xf32>, memref -// CHECK: "unrelated_use"(%[[MEMREF1]]) : (memref) -> () scf.for %i = %lb to %ub step %step { scf.for %j = %lb to %ub step %step { - %r0 = vector.transfer_read %memref1[%c0, %c0], %cst: memref, vector<1xf32> - %r1 = vector.transfer_read %memref0[%i, %i], %cst: memref, vector<2xf32> - %r2 = vector.transfer_read %memref2[%c0, %c0], 
%cst: memref, vector<3xf32> - %r3 = vector.transfer_read %memref3[%c0, %c0], %cst: memref, vector<4xf32> - "some_crippling_use"(%memref4) : (memref) -> () - %r4 = vector.transfer_read %memref4[%c0, %c0], %cst: memref, vector<5xf32> - %r5 = vector.transfer_read %memref5[%c0, %c0], %cst: memref, vector<6xf32> - "some_crippling_use"(%memref5) : (memref) -> () - %u0 = "some_use"(%r0) : (vector<1xf32>) -> vector<1xf32> - %u1 = "some_use"(%r1) : (vector<2xf32>) -> vector<2xf32> - %u2 = "some_use"(%memref2, %r2) : (memref, vector<3xf32>) -> vector<3xf32> - %u3 = "some_use"(%r3) : (vector<4xf32>) -> vector<4xf32> - %u4 = "some_use"(%r4) : (vector<5xf32>) -> vector<5xf32> - %u5 = "some_use"(%r5) : (vector<6xf32>) -> vector<6xf32> - vector.transfer_write %u0, %memref1[%c0, %c0] : vector<1xf32>, memref - vector.transfer_write %u1, %memref0[%i, %i] : vector<2xf32>, memref - vector.transfer_write %u2, %memref2[%c0, %c0] : vector<3xf32>, memref - vector.transfer_write %u3, %memref3[%c0, %c0] : vector<4xf32>, memref - vector.transfer_write %u4, %memref4[%c0, %c0] : vector<5xf32>, memref - vector.transfer_write %u5, %memref5[%c0, %c0] : vector<6xf32>, memref - "some_crippling_use"(%memref3) : (memref) -> () + "mem_use"(%mem) : (memref) -> () + %read = vector.transfer_read %mem[%c0, %c0], %pad: memref, vector<1xf32> + %use = "val_use"(%read) : (vector<1xf32>) -> vector<1xf32> + vector.transfer_write %use, %mem[%c0, %c0] : vector<1xf32>, memref } - "unrelated_use"(%memref0) : (memref) -> () } - "unrelated_use"(%memref1) : (memref) -> () return } @@ -86,6 +220,12 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Other tests +/// +/// TODO: Document +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: func @hoist_vector_transfer_pairs_disjoint( // CHECK-SAME: %[[MEMREF0:[a-zA-Z0-9]*]]: memref, // CHECK-SAME: 
%[[MEMREF1:[a-zA-Z0-9]*]]: memref, diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index ca40301f04fa1..cbc863699ba9e 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -1165,7 +1165,7 @@ func.func @mixed_semantics(%a: tensor, %b: tensor, %c: memref< func.func @winograd_filter_transform_height(%arg0: tensor<2x4x3x5xf32>, %arg1: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { // expected-error @+1 {{expect filter height either equals to r or 1}} - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x4x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor<2x4x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> return %0 : tensor<6x6x5x2xf32> } @@ -1173,7 +1173,7 @@ func.func @winograd_filter_transform_height(%arg0: tensor<2x4x3x5xf32>, %arg1: t func.func @winograd_filter_transform_width(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { // expected-error @+1 {{expect filter width either equals to r or 1}} - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x3x4x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor<2x3x4x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> return %0 : tensor<6x6x5x2xf32> } @@ -1181,7 +1181,7 @@ func.func @winograd_filter_transform_width(%arg0: tensor<2x3x4x5xf32>, %arg1: te func.func @winograd_filter_transform(%arg0: tensor<2x1x1x5xf32>, %arg1: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { // expected-error @+1 {{expect either filter height or width equals to r}} - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x1x1x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor<2x1x1x5xf32>) outs(%arg1 : 
tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> return %0 : tensor<6x6x5x2xf32> } @@ -1189,7 +1189,7 @@ func.func @winograd_filter_transform(%arg0: tensor<2x1x1x5xf32>, %arg1: tensor<6 func.func @winograd_filter_dyn(%arg0: tensor, %arg1: tensor<6x5x?x?xf32>) -> tensor<6x5x?x?xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x5x?x?xf32>) -> tensor<6x5x?x?xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x5x?x?xf32>) -> tensor<6x5x?x?xf32> return %0 : tensor<6x5x?x?xf32> } @@ -1197,7 +1197,7 @@ func.func @winograd_filter_dyn(%arg0: tensor, %arg1: tensor<6x5x?x? func.func @winograd_input_transform_height(%arg0: tensor<2x13x14x5xf32>, %arg1: tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x13x14x5xf32>) outs(%arg1 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x13x14x5xf32>) outs(%arg1 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> return %0 : tensor<6x6x3x3x2x5xf32> } @@ -1205,7 +1205,7 @@ func.func @winograd_input_transform_height(%arg0: tensor<2x13x14x5xf32>, %arg1: func.func @winograd_input_transform_width(%arg0: tensor<2x14x13x5xf32>, %arg1: tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x14x13x5xf32>) outs(%arg1 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x14x13x5xf32>) outs(%arg1 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> return %0 : tensor<6x6x3x3x2x5xf32> } @@ -1213,7 +1213,7 @@ func.func @winograd_input_transform_width(%arg0: tensor<2x14x13x5xf32>, %arg1: t func.func 
@winograd_input_transform_output_tileH(%arg0: tensor<2x14x14x5xf32>, %arg1: tensor<6x6x2x3x2x5xf32>) -> tensor<6x6x2x3x2x5xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<6x6x2x3x2x5xf32>) -> tensor<6x6x2x3x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<6x6x2x3x2x5xf32>) -> tensor<6x6x2x3x2x5xf32> return %0 : tensor<6x6x2x3x2x5xf32> } @@ -1221,7 +1221,7 @@ func.func @winograd_input_transform_output_tileH(%arg0: tensor<2x14x14x5xf32>, % func.func @winograd_input_transform_output_tileW(%arg0: tensor<2x14x14x5xf32>, %arg1: tensor<6x6x3x2x2x5xf32>) -> tensor<6x6x3x2x2x5xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<6x6x3x2x2x5xf32>) -> tensor<6x6x3x2x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<6x6x3x2x2x5xf32>) -> tensor<6x6x3x2x2x5xf32> return %0 : tensor<6x6x3x2x2x5xf32> } @@ -1229,7 +1229,7 @@ func.func @winograd_input_transform_output_tileW(%arg0: tensor<2x14x14x5xf32>, % func.func @winograd_input_transform_output_height(%arg0: tensor<2x14x14x5xf32>, %arg1: tensor<5x6x3x3x2x5xf32>) -> tensor<5x6x3x3x2x5xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<5x6x3x3x2x5xf32>) -> tensor<5x6x3x3x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<5x6x3x3x2x5xf32>) -> tensor<5x6x3x3x2x5xf32> return %0 : tensor<5x6x3x3x2x5xf32> } @@ -1237,7 +1237,7 @@ func.func @winograd_input_transform_output_height(%arg0: tensor<2x14x14x5xf32>, func.func @winograd_input_transform_output_width(%arg0: tensor<2x14x14x5xf32>, %arg1: 
tensor<6x5x3x3x2x5xf32>) -> tensor<6x5x3x3x2x5xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<6x5x3x3x2x5xf32>) -> tensor<6x5x3x3x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x14x14x5xf32>) outs(%arg1 : tensor<6x5x3x3x2x5xf32>) -> tensor<6x5x3x3x2x5xf32> return %0 : tensor<6x5x3x3x2x5xf32> } @@ -1245,7 +1245,7 @@ func.func @winograd_input_transform_output_width(%arg0: tensor<2x14x14x5xf32>, % func.func @winograd_input_dyn(%arg0: tensor, %arg1: tensor<6x5x?x?x?x?xf32>) -> tensor<6x5x?x?x?x?xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x5x?x?x?x?xf32>) -> tensor<6x5x?x?x?x?xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x5x?x?x?x?xf32>) -> tensor<6x5x?x?x?x?xf32> return %0 : tensor<6x5x?x?x?x?xf32> } @@ -1253,7 +1253,7 @@ func.func @winograd_input_dyn(%arg0: tensor, %arg1: tensor<6x5x?x?x func.func @winograd_output_transform_input_height(%arg0: tensor<5x6x3x3x2x2xf32>, %arg1: tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> { // expected-error @+1 {{expect input height equals to input tile size}} - %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<5x6x3x3x2x2xf32>) outs(%arg1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<5x6x3x3x2x2xf32>) outs(%arg1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> return %0 : tensor<2x12x12x2xf32> } @@ -1261,7 +1261,7 @@ func.func @winograd_output_transform_input_height(%arg0: tensor<5x6x3x3x2x2xf32> func.func @winograd_output_transform_input_width(%arg0: tensor<6x5x3x3x2x2xf32>, %arg1: tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> { // expected-error @+1 {{expect input width equals to input tile size}} - %0 = 
linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x5x3x3x2x2xf32>) outs(%arg1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<6x5x3x3x2x2xf32>) outs(%arg1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> return %0 : tensor<2x12x12x2xf32> } @@ -1269,7 +1269,7 @@ func.func @winograd_output_transform_input_width(%arg0: tensor<6x5x3x3x2x2xf32>, func.func @winograd_output_transform_output_height(%arg0: tensor<6x6x3x3x2x2xf32>, %arg1: tensor<2x11x12x2xf32>) -> tensor<2x11x12x2xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x3x3x2x2xf32>) outs(%arg1 : tensor<2x11x12x2xf32>) -> tensor<2x11x12x2xf32> + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<6x6x3x3x2x2xf32>) outs(%arg1 : tensor<2x11x12x2xf32>) -> tensor<2x11x12x2xf32> return %0 : tensor<2x11x12x2xf32> } @@ -1277,7 +1277,7 @@ func.func @winograd_output_transform_output_height(%arg0: tensor<6x6x3x3x2x2xf32 func.func @winograd_output_transform_output_width(%arg0: tensor<6x6x3x3x2x2xf32>, %arg1: tensor<2x12x11x2xf32>) -> tensor<2x12x11x2xf32> { // expected-error @+1 {{the output shape is not expected}} - %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x3x3x2x2xf32>) outs(%arg1 : tensor<2x12x11x2xf32>) -> tensor<2x12x11x2xf32> + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<6x6x3x3x2x2xf32>) outs(%arg1 : tensor<2x12x11x2xf32>) -> tensor<2x12x11x2xf32> return %0 : tensor<2x12x11x2xf32> } diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir index dc556761b09e5..4edbc6eda3eae 100644 --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -630,52 +630,52 @@ func.func @softmax(%arg0: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> { func.func @winograd(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32>, 
%arg2: tensor<1xf32>, %arg3: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> { %0 = tensor.empty() : tensor<6x6x5x2xf32> - %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %1 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> %2 = tensor.empty() : tensor<6x6x1x1x2x5xf32> - %3 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x6x6x5xf32>) outs(%2 : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> + %3 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x6x6x5xf32>) outs(%2 : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf32> into tensor<36x2x5xf32> %4 = tensor.empty() : tensor<36x2x2xf32> %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%4 : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> - %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x1x1x2x2xf32>) outs(%arg3 : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> + %6 = linalg.winograd_output_transform fmr(F_4_3) ins(%expanded : tensor<6x6x1x1x2x2xf32>) outs(%arg3 : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> return %6 : tensor<2x4x4x2xf32> } // CHECK-LABEL: func @winograd -// CHECK: linalg.winograd_filter_transform m(4) r(3) -// CHECK: linalg.winograd_input_transform m(4) r(3) -// CHECK: linalg.winograd_output_transform m(4) r(3) +// CHECK: linalg.winograd_filter_transform fmr(F_4_3) +// CHECK: linalg.winograd_input_transform fmr(F_4_3) +// CHECK: linalg.winograd_output_transform fmr(F_4_3) // ----- func.func 
@winograd_filter_dyn(%arg0: tensor, %arg1: tensor<6x6x?x?xf32>) -> tensor<6x6x?x?xf32> { - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?xf32>) -> tensor<6x6x?x?xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?xf32>) -> tensor<6x6x?x?xf32> return %0 : tensor<6x6x?x?xf32> } // CHECK-LABEL: func @winograd_filter_dyn -// CHECK: linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?xf32>) -> tensor<6x6x?x?xf32> +// CHECK: linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?xf32>) -> tensor<6x6x?x?xf32> // ----- func.func @winograd_input_dyn(%arg0: tensor, %arg1: tensor<6x6x?x?x?x?xf32>) -> tensor<6x6x?x?x?x?xf32> { - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?x?x?xf32>) -> tensor<6x6x?x?x?x?xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?x?x?xf32>) -> tensor<6x6x?x?x?x?xf32> return %0 : tensor<6x6x?x?x?x?xf32> } // CHECK-LABEL: func @winograd_input_dyn -// CHECK: linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?x?x?xf32>) -> tensor<6x6x?x?x?x?xf32> +// CHECK: linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor) outs(%arg1 : tensor<6x6x?x?x?x?xf32>) -> tensor<6x6x?x?x?x?xf32> // ----- func.func @winograd_output_dyn(%arg0: tensor<6x6x?x?x?x?xf32>, %arg1: tensor) -> tensor { - %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x?x?x?x?xf32>) outs(%arg1 : tensor) -> tensor + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<6x6x?x?x?x?xf32>) outs(%arg1 : tensor) -> tensor return %0 : tensor } // CHECK-LABEL: func @winograd_output_dyn -// CHECK: linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x?x?x?x?xf32>) outs(%arg1 : tensor) -> tensor +// CHECK: linalg.winograd_output_transform fmr(F_4_3) 
ins(%arg0 : tensor<6x6x?x?x?x?xf32>) outs(%arg1 : tensor) -> tensor // ----- diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir index 572a2ae70e0a4..5bdb5073ee865 100644 --- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir @@ -653,6 +653,7 @@ module { %5 = affine.min #map2(%i)[%d0, %idx] %6 = tensor.extract_slice %o[%4] [%5] [1] : tensor to tensor + // CHECK: linalg.generic // CHECK: %[[T1:.*]] = linalg.generic {{.*}} // CHECK: %[[T2:.*]] = linalg.generic {{.*}} %7 = tensor.extract_slice %1[%4] [%5] [1] : tensor to tensor diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir index cdc4b8a72a276..445ded4bfcafb 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -2,15 +2,15 @@ func.func @conv2d(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { %0 = tensor.empty() : tensor<6x6x5x2xf32> - %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %1 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> %2 = tensor.empty() : tensor<6x6x2x2x2x5xf32> - %3 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%2 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> + %3 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%2 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 
3, 4], [5]] : tensor<6x6x2x2x2x5xf32> into tensor<36x8x5xf32> %4 = tensor.empty() : tensor<36x8x2xf32> %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x8x5xf32>, tensor<36x5x2xf32>) outs(%4 : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> - %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x2x2x2x2xf32>) outs(%arg2 : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> + %6 = linalg.winograd_output_transform fmr(F_4_3) ins(%expanded : tensor<6x6x2x2x2x2xf32>) outs(%arg2 : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> return %6 : tensor<2x8x8x2xf32> } @@ -123,13 +123,13 @@ module attributes {transform.with_named_sequence} { func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<6x6x5x2xf32> - %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %1 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> %padded = tensor.pad %arg0 low[0, 0, 0, 0] high[0, 3, 3, 0] { ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index): tensor.yield %cst : f32 } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> %2 = tensor.empty() : tensor<6x6x3x3x2x5xf32> - %3 = linalg.winograd_input_transform m(4) r(3) ins(%padded : tensor<2x14x14x5xf32>) outs(%2 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> + %3 = linalg.winograd_input_transform fmr(F_4_3) ins(%padded : tensor<2x14x14x5xf32>) outs(%2 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], 
[5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> %4 = tensor.empty() : tensor<36x18x2xf32> @@ -140,7 +140,7 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index): tensor.yield %cst : f32 } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> - %7 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %7 = linalg.winograd_output_transform fmr(F_4_3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> %extracted_slice = tensor.extract_slice %7[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> return %extracted_slice : tensor<2x9x9x2xf32> } @@ -259,16 +259,16 @@ module attributes {transform.with_named_sequence} { func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32>, %arg2: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<6x1x5x2xf32> - %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> + %1 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> %2 = tensor.empty() : tensor<6x1x1x1x2x5xf32> - %3 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x6x1x5xf32>) outs(%2 : tensor<6x1x1x1x2x5xf32>) -> tensor<6x1x1x1x2x5xf32> + %3 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x6x1x5xf32>) outs(%2 : tensor<6x1x1x1x2x5xf32>) -> tensor<6x1x1x1x2x5xf32> %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> %4 = tensor.empty() : tensor<6x2x2xf32> %5 = linalg.fill 
ins(%cst : f32) outs(%4 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> %6 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%5 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> %expanded = tensor.expand_shape %6 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> - %7 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg2 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> + %7 = linalg.winograd_output_transform fmr(F_4_3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg2 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> return %7 : tensor<2x4x1x2xf32> } @@ -350,16 +350,16 @@ module attributes {transform.with_named_sequence} { func.func @conv2d_mx1_rx1_2(%arg0: tensor<2x6x2x5xf32>, %arg1: tensor<2x3x1x5xf32>, %arg2: tensor<2x4x2x2xf32>) -> tensor<2x4x2x2xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<6x1x5x2xf32> - %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> + %1 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> %2 = tensor.empty() : tensor<6x1x1x2x2x5xf32> - %3 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x6x2x5xf32>) outs(%2 : tensor<6x1x1x2x2x5xf32>) -> tensor<6x1x1x2x2x5xf32> + %3 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x6x2x5xf32>) outs(%2 : tensor<6x1x1x2x2x5xf32>) -> tensor<6x1x1x2x2x5xf32> %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x2x2x5xf32> into tensor<6x4x5xf32> %4 = tensor.empty() : tensor<6x4x2xf32> %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<6x4x2xf32>) -> tensor<6x4x2xf32> %6 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x4x5xf32>, 
tensor<6x5x2xf32>) outs(%5 : tensor<6x4x2xf32>) -> tensor<6x4x2xf32> %expanded = tensor.expand_shape %6 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 2, 2, 2] : tensor<6x4x2xf32> into tensor<6x1x1x2x2x2xf32> - %7 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x2x2x2xf32>) outs(%arg2 : tensor<2x4x2x2xf32>) -> tensor<2x4x2x2xf32> + %7 = linalg.winograd_output_transform fmr(F_4_3) ins(%expanded : tensor<6x1x1x2x2x2xf32>) outs(%arg2 : tensor<2x4x2x2xf32>) -> tensor<2x4x2x2xf32> return %7 : tensor<2x4x2x2xf32> } diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir index fc6424fd4c812..beb8d0b125738 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s -transform-interpreter --split-input-file | FileCheck %s func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x3x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor<2x3x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> return %0 : tensor<6x6x5x2xf32> } @@ -25,13 +25,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> // CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG5]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x1x1xf32> -// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x1xf32>) 
outs(%[[EXTRACTED_SLICE_2]] : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32> +// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S3]] into %[[ARG5]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x1x1xf32> into tensor<6x6x5x2xf32> // ----- func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x3x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor<2x3x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> return %0 : tensor<6x6x5x2xf32> } @@ -58,12 +58,12 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S3:.*]] = affine.min #[[$MAP0]](%[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, %[[S3]]] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x?xf32> // CHECK: %[[EXTRACTED_SLICE_3:.*]] = tensor.extract_slice %[[ARG5]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, %[[S3]], 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x?x1xf32> -// CHECK: %[[S4:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x?xf32>) outs(%[[EXTRACTED_SLICE_3]] : tensor<6x6x?x1xf32>) -> tensor<6x6x?x1xf32> +// CHECK: %[[S4:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x?xf32>) outs(%[[EXTRACTED_SLICE_3]] : tensor<6x6x?x1xf32>) -> tensor<6x6x?x1xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S4]] into %[[ARG5]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, %[[S3]], 1] [1, 1, 1, 1] : tensor<6x6x?x1xf32> into tensor<6x6x5x2xf32> // ----- func.func @tile_winograd_filter(%arg0: tensor<2x3x1x5xf32>, %arg1: 
tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> { - %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x3x1x5xf32>) outs(%arg1 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> + %0 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg0 : tensor<2x3x1x5xf32>) outs(%arg1 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> return %0 : tensor<6x1x5x2xf32> } @@ -87,13 +87,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> // CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG5]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x5x2xf32> to tensor<6x1x1x1xf32> -// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x1x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x1x1x1xf32>) -> tensor<6x1x1x1xf32> +// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x1x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x1x1x1xf32>) -> tensor<6x1x1x1xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S3]] into %[[ARG5]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x1x1xf32> into tensor<6x1x5x2xf32> // ----- func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> return %0 : tensor<6x6x2x2x2x5xf32> } @@ -123,13 +123,13 @@ module attributes {transform.with_named_sequence} { // CHECK: 
%[[S6:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x5xf32> // CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG5]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> -// CHECK: %[[S7:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x5xf32>) outs(%[[EXTRACTED_SLICE_5]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> +// CHECK: %[[S7:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x5xf32>) outs(%[[EXTRACTED_SLICE_5]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S7]] into %[[ARG5]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x2x2x2x5xf32> // ----- func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> return %0 : tensor<6x6x2x2x2x5xf32> } @@ -167,13 +167,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S5]], %[[S6]], %[[ARG8]]] [1, %[[S7]], %[[S8]], 1] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<1x?x?x1xf32> // CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG9]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to 
tensor<6x6x1x1x1x1xf32> -// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x?x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<6x6x1x1x1x1xf32>) -> tensor<6x6x1x1x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<1x?x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<6x6x1x1x1x1xf32>) -> tensor<6x6x1x1x1x1xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG9]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x1x1xf32> into tensor<6x6x2x2x2x5xf32> // ----- func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> return %0 : tensor<6x6x2x2x2x5xf32> } @@ -213,13 +213,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S9:.*]] = affine.apply #[[$MAP2]]() // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S6]], %[[S7]], %[[ARG8]]] [2, %[[S8]], %[[S9]], %[[S5]]] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x?xf32> // CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG9]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, 2, %[[S5]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x2x2x2x?xf32> -// CHECK: %[[S10:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x?xf32>) outs(%[[EXTRACTED_SLICE_12]] : tensor<6x6x2x2x2x?xf32>) -> tensor<6x6x2x2x2x?xf32> +// CHECK: %[[S10:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x?xf32>) outs(%[[EXTRACTED_SLICE_12]] : 
tensor<6x6x2x2x2x?xf32>) -> tensor<6x6x2x2x2x?xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S10]] into %[[ARG9]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, 2, %[[S5]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x?xf32> into tensor<6x6x2x2x2x5xf32> // ----- func.func @tile_winograd_input(%arg0: tensor<2x1x10x5xf32>, %arg1: tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> { - %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x1x10x5xf32>) outs(%arg1 : tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> + %0 = linalg.winograd_input_transform fmr(F_4_3) ins(%arg0 : tensor<2x1x10x5xf32>) outs(%arg1 : tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> return %0 : tensor<1x6x1x2x2x5xf32> } @@ -258,13 +258,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]]() // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S5]], %[[S6]], %[[ARG8]]] [1, 1, %[[S8]], 1] [1, 1, 1, 1] : tensor<2x1x10x5xf32> to tensor<1x1x?x1xf32> // CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG9]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x2x2x5xf32> to tensor<1x6x1x1x1x1xf32> -// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x1x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<1x6x1x1x1x1xf32>) -> tensor<1x6x1x1x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<1x1x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<1x6x1x1x1x1xf32>) -> tensor<1x6x1x1x1x1xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG9]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x1x1x1xf32> into tensor<1x6x1x2x2x5xf32> // ----- func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x2x2xf32>, %arg1: tensor<2x8x8x2xf32>) -> 
tensor<2x8x8x2xf32> { - %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x2x2x2x2xf32>) outs(%arg1 : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<6x6x2x2x2x2xf32>) outs(%arg1 : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> return %0 : tensor<2x8x8x2xf32> } @@ -298,7 +298,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x3x5xf32>, %arg1: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> { - %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x2x2x3x5xf32>) outs(%arg1 : tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<6x6x2x2x3x5xf32>) outs(%arg1 : tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> return %0 : tensor<3x8x8x5xf32> } @@ -346,7 +346,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @tile_winograd_output(%arg0 : tensor<6x1x2x1x3x5xf32>, %arg1: tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> { - %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x1x2x1x3x5xf32>) outs(%arg1 : tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> + %0 = linalg.winograd_output_transform fmr(F_4_3) ins(%arg0 : tensor<6x1x2x1x3x5xf32>) outs(%arg1 : tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> return %0 : tensor<3x8x1x5xf32> } @@ -385,4 +385,4 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S7:.*]] = affine.apply #[[$MAP2]]() // CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]]() // CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG12]][%[[ARG6]], %[[S5]], %[[S6]], %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> -// CHECK: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_output_transform 
fmr(F_4_3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> diff --git a/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir b/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir index 1de861e653005..e0ead54c956fc 100644 --- a/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir +++ b/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -transform-interpreter -canonicalize --split-input-file -verify-diagnostics| FileCheck %s +// RUN: mlir-opt %s -allow-unregistered-dialect -transform-interpreter -canonicalize --split-input-file -verify-diagnostics| FileCheck %s func.func @conv2d(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<1xf32>, %arg3: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { %0 = linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<2x10x10x5xf32>, tensor<2x3x3x5xf32>) outs(%arg3 : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> @@ -8,16 +8,16 @@ func.func @conv2d(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.winograd_conv2d %0 { m = 4, r = 3 } : (!transform.any_op) -> (!transform.any_op) + %1 = transform.structured.winograd_conv2d %0 { fmr = 1: i32 } : (!transform.any_op) -> (!transform.any_op) transform.yield } } // CHECK-LABEL: func.func @conv2d -// CHECK: linalg.winograd_filter_transform m(4) r(3) -// CHECK: linalg.winograd_input_transform m(4) r(3) +// CHECK: linalg.winograd_filter_transform fmr(F_4_3) +// CHECK: linalg.winograd_input_transform fmr(F_4_3) // CHECK: linalg.batch_matmul -// CHECK: linalg.winograd_output_transform m(4) 
r(3) +// CHECK: linalg.winograd_output_transform fmr(F_4_3) // ----- @@ -29,19 +29,19 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.winograd_conv2d %0 { m = 4, r = 3 } : (!transform.any_op) -> (!transform.any_op) + %1 = transform.structured.winograd_conv2d %0 { fmr = 1: i32 } : (!transform.any_op) -> (!transform.any_op) transform.yield } } // CHECK-LABEL: func.func @conv2d_unaligned -// CHECK: linalg.winograd_filter_transform m(4) r(3) +// CHECK: linalg.winograd_filter_transform fmr(F_4_3) // CHECK: tensor.pad // CHECK-SAME: low[0, 0, 0, 0] high[0, 3, 3, 0] -// CHECK: linalg.winograd_input_transform m(4) r(3) +// CHECK: linalg.winograd_input_transform fmr(F_4_3) // CHECK: tensor.pad // CHECK-SAME: low[0, 0, 0, 0] high[0, 3, 3, 0] -// CHECK: linalg.winograd_output_transform m(4) r(3) +// CHECK: linalg.winograd_output_transform fmr(F_4_3) // ----- @@ -54,7 +54,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_hwcf"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @+1 {{this operation is not supported to convert to Winograd Conv2D}} - %1 = transform.structured.winograd_conv2d %0 { m = 4, r = 3 } : (!transform.any_op) -> (!transform.any_op) + %1 = transform.structured.winograd_conv2d %0 { fmr = 1: i32 } : (!transform.any_op) -> (!transform.any_op) transform.yield } } @@ -70,7 +70,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match 
ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @+1 {{apply Winograd Conv2D failed}} - %1 = transform.structured.winograd_conv2d %0 { m = 4, r = 3 } : (!transform.any_op) -> (!transform.any_op) + %1 = transform.structured.winograd_conv2d %0 { fmr = 1: i32 } : (!transform.any_op) -> (!transform.any_op) transform.yield } } @@ -86,7 +86,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @+1 {{apply Winograd Conv2D failed}} - %1 = transform.structured.winograd_conv2d %0 { m = 4, r = 3 } : (!transform.any_op) -> (!transform.any_op) + %1 = transform.structured.winograd_conv2d %0 { fmr = 1: i32 } : (!transform.any_op) -> (!transform.any_op) transform.yield } } diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir index 16d06a7473272..c7b0bd51308ba 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir @@ -3,13 +3,13 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { %cst = arith.constant 0.000000e+00 : f32 %2 = tensor.empty() : tensor<6x6x5x2xf32> - %3 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%2 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %3 = linalg.winograd_filter_transform fmr(F_4_3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%2 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> %padded = tensor.pad %arg0 low[0, 0, 0, 0] high[0, 3, 3, 0] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> %4 = tensor.empty() : tensor<6x6x3x3x2x5xf32> - %5 = 
linalg.winograd_input_transform m(4) r(3) ins(%padded : tensor<2x14x14x5xf32>) outs(%4 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> + %5 = linalg.winograd_input_transform fmr(F_4_3) ins(%padded : tensor<2x14x14x5xf32>) outs(%4 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> %collapsed = tensor.collapse_shape %3 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %5 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> %6 = tensor.empty() : tensor<36x18x2xf32> @@ -20,7 +20,7 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> - %9 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %9 = linalg.winograd_output_transform fmr(F_4_3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> %extracted_slice = tensor.extract_slice %9[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> return %extracted_slice : tensor<2x9x9x2xf32> } diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d.mlir index 0040d81a2d24e..e80fa6b4af944 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d.mlir @@ -9,16 +9,16 @@ func.func @conv2d_4x4_3x3(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32> // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> { // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> -// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) 
r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> +// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[ARG0]] : tensor<2x6x6x5xf32>) outs(%[[S4]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[ARG0]] : tensor<2x6x6x5xf32>) outs(%[[S4]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf32> into tensor<36x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x2x2xf32> // CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> // CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> // CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform fmr(F_4_3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> // CHECK-NEXT: return %[[S9]] : tensor<2x4x4x2xf32> // CHECK-NEXT: } @@ -33,16 +33,16 @@ func.func @conv2d_2x2_5x5(%arg0: 
tensor<2x6x6x5xf32>, %arg1: tensor<2x5x5x5xf32> // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf32>, %[[ARG1:.*]]: tensor<2x5x5x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> { // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> -// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(2) r(5) ins(%[[ARG1]] : tensor<2x5x5x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> +// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform fmr(F_2_5) ins(%[[ARG1]] : tensor<2x5x5x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform m(2) r(5) ins(%[[ARG0]] : tensor<2x6x6x5xf32>) outs(%[[S4]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform fmr(F_2_5) ins(%[[ARG0]] : tensor<2x6x6x5xf32>) outs(%[[S4]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf32> into tensor<36x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x2x2xf32> // CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> // CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> // CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(2) 
r(5) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform fmr(F_2_5) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> // CHECK-NEXT: return %[[S9]] : tensor<2x2x2x2xf32> // CHECK-NEXT: } @@ -57,16 +57,16 @@ func.func @conv2d_1x4_1x3(%arg0: tensor<2x1x6x5xf32>, %arg1: tensor<2x1x3x5xf32> // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x1x6x5xf32>, %[[ARG1:.*]]: tensor<2x1x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> { // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<1x6x5x2xf32> -// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x1x3x5xf32>) outs(%[[S2]] : tensor<1x6x5x2xf32>) -> tensor<1x6x5x2xf32> +// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[ARG1]] : tensor<2x1x3x5xf32>) outs(%[[S2]] : tensor<1x6x5x2xf32>) -> tensor<1x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<1x6x1x1x2x5xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[ARG0]] : tensor<2x1x6x5xf32>) outs(%[[S4]] : tensor<1x6x1x1x2x5xf32>) -> tensor<1x6x1x1x2x5xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[ARG0]] : tensor<2x1x6x5xf32>) outs(%[[S4]] : tensor<1x6x1x1x2x5xf32>) -> tensor<1x6x1x1x2x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<1x6x5x2xf32> into tensor<6x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<1x6x1x1x2x5xf32> into tensor<6x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<6x2x2xf32> // CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> // CHECK-NEXT: 
%[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S7]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> // CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [1, 6, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<1x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<1x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform fmr(F_4_3) ins(%[[EXPANDED]] : tensor<1x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> // CHECK-NEXT: return %[[S9]] : tensor<2x1x4x2xf32> // CHECK-NEXT: } @@ -81,16 +81,16 @@ func.func @conv2d_4x1_3x1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x1x5x2xf32> -// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x1x5xf32>) outs(%[[S2]] : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> +// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[ARG1]] : tensor<2x3x1x5xf32>) outs(%[[S2]] : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[ARG0]] : tensor<2x6x1x5xf32>) outs(%[[S4]] : tensor<6x1x1x1x2x5xf32>) -> tensor<6x1x1x1x2x5xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[ARG0]] : tensor<2x6x1x5xf32>) outs(%[[S4]] : tensor<6x1x1x1x2x5xf32>) -> tensor<6x1x1x1x2x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 
1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<6x2x2xf32> // CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> // CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S7]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> // CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x1x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform fmr(F_4_3) ins(%[[EXPANDED]] : tensor<6x1x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> // CHECK-NEXT: return %[[S9]] : tensor<2x4x1x2xf32> // CHECK-NEXT: } @@ -105,16 +105,16 @@ func.func @conv2d_aligned(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf3 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> -// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> +// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: 
%[[S5:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[ARG0]] : tensor<2x10x10x5xf32>) outs(%[[S4]] : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[ARG0]] : tensor<2x10x10x5xf32>) outs(%[[S4]] : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x2x2x2x5xf32> into tensor<36x8x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x8x2xf32> // CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> // CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x8x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> // CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x2x2x2x2xf32>) outs(%[[ARG3]] : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform fmr(F_4_3) ins(%[[EXPANDED]] : tensor<6x6x2x2x2x2xf32>) outs(%[[ARG3]] : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> // CHECK-NEXT: return %[[S9]] : tensor<2x8x8x2xf32> // CHECK-NEXT: } @@ -129,13 +129,13 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> -// 
CHECK-NEXT: %[[S1:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S0]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> +// CHECK-NEXT: %[[S1:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S0]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] { // CHECK-NEXT: ^bb0 // CHECK-NEXT: tensor.yield %[[CST]] : f32 // CHECK-NEXT: } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> -// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[PADDED]] : tensor<2x14x14x5xf32>) outs(%[[S2]] : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[PADDED]] : tensor<2x14x14x5xf32>) outs(%[[S2]] : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %3 {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x18x2xf32> @@ -146,7 +146,7 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x // CHECK-NEXT: ^bb0 // CHECK-NEXT: tensor.yield %[[CST]] : f32 // CHECK-NEXT: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x3x3x2x2xf32>) outs(%[[PADDED_1]] : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform fmr(F_4_3) ins(%[[EXPANDED]] : tensor<6x6x3x3x2x2xf32>) outs(%[[PADDED_1]] : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> // CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S7]][0, 0, 
0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> // CHECK-NEXT: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> // CHECK-NEXT: } @@ -162,16 +162,16 @@ func.func @conv2d_type_promotion(%arg0: tensor<2x6x6x5xf16>, %arg1: tensor<2x3x3 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf16>, %[[ARG1:.*]]: tensor<2x3x3x5xf16>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf16> -// CHECK-NEXT: %[[S1:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf16>) outs(%[[S0]] : tensor<6x6x5x2xf16>) -> tensor<6x6x5x2xf16> +// CHECK-NEXT: %[[S1:.*]] = linalg.winograd_filter_transform fmr(F_4_3) ins(%[[ARG1]] : tensor<2x3x3x5xf16>) outs(%[[S0]] : tensor<6x6x5x2xf16>) -> tensor<6x6x5x2xf16> // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf16> -// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[ARG0]] : tensor<2x6x6x5xf16>) outs(%[[S2]] : tensor<6x6x1x1x2x5xf16>) -> tensor<6x6x1x1x2x5xf16> +// CHECK-NEXT: %[[S3:.*]] = linalg.winograd_input_transform fmr(F_4_3) ins(%[[ARG0]] : tensor<2x6x6x5xf16>) outs(%[[S2]] : tensor<6x6x1x1x2x5xf16>) -> tensor<6x6x1x1x2x5xf16> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf16> into tensor<36x5x2xf16> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf16> into tensor<36x2x5xf16> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x2x2xf32> // CHECK-NEXT: %[[S5:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S4]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> // CHECK-NEXT: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf16>, tensor<36x5x2xf16>) outs(%[[S5]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> // CHECK-NEXT: %[[EXPANDED:.*]] = 
tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform fmr(F_4_3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> // CHECK-NEXT: return %[[S7]] : tensor<2x4x4x2xf32> // CHECK-NEXT: } diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir index f908efb638446..8e394b2ac04c8 100644 --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -342,6 +342,16 @@ memref.global "priate" constant @memref5 : memref<2xf32> = uninitialized // ----- +// expected-error @+1 {{op initial value element expected to be of type 'f16', but was of type 'f32'}} +"memref.global"() <{constant, initial_value = dense<1.000000e+00> : tensor<1xf32>, sym_name = "memref6", sym_visibility = "private", type = memref<1xf16>}> : () -> () + +// ----- + +// expected-error @+1 {{op initial value shape expected to be 1, 2 but was 2, 2}} +"memref.global"() <{constant, initial_value = dense<1.000000e+00> : tensor<2x2xf16>, sym_name = "memref7", sym_visibility = "private", type = memref<1x2xf16>}> : () -> () + +// ----- + func.func @nonexistent_global_memref() { // expected-error @+1 {{'gv' does not reference a valid global memref}} %0 = memref.get_global @gv : memref<3xf32> diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index 13fdf3cf13510..e11de7bec2d0a 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -174,6 +174,9 @@ memref.global "private" @memref3 : memref<2xf32> = uninitialized // CHECK-LABEL: memref.global "private" constant @memref4 : memref<2xf32> = uninitialized memref.global 
"private" constant @memref4 : memref<2xf32> = uninitialized +// CHECK-LABEL: memref.global "private" constant @memref5 : memref<1xf16, 42 : i32> = dense<1.000000e+00> +"memref.global"() <{constant, initial_value = dense<1.000000e+00> : tensor<1xf16>, sym_name = "memref5", sym_visibility = "private", type = memref<1xf16, 42 : i32>}> : () -> () + // CHECK-LABEL: func @read_global_memref func.func @read_global_memref() { %0 = memref.get_global @memref0 : memref<2xf32> diff --git a/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir b/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir index c84aea6609665..f1e1c5e896c66 100644 --- a/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir +++ b/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir @@ -165,6 +165,25 @@ func.func @shape_cast_of_transpose(%arg : vector<1x4x4x1x1xi8>) -> vector<4x4xi8 // ----- +// In this test, the permutation maps the non-unit dimensions (1 and 2) as follows: +// 1 -> 0 +// 2 -> 4 +// Because 0 < 4, this permutation is order preserving and effectively a shape_cast. 
+// (same as the example above, but one of the dims is scalable) +// CHECK-LABEL: @shape_cast_of_transpose_scalable +// CHECK-SAME: %[[ARG:.*]]: vector<1x[4]x4x1x1xi8>) -> vector<[4]x4xi8> { +// CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG]] : +// CHECK-SAME: vector<1x[4]x4x1x1xi8> to vector<[4]x4xi8> +// CHECK: return %[[SHAPE_CAST]] : vector<[4]x4xi8> +func.func @shape_cast_of_transpose_scalable(%arg : vector<1x[4]x4x1x1xi8>) -> vector<[4]x4xi8> { + %0 = vector.transpose %arg, [1, 0, 3, 4, 2] + : vector<1x[4]x4x1x1xi8> to vector<[4]x1x1x1x4xi8> + %1 = vector.shape_cast %0 : vector<[4]x1x1x1x4xi8> to vector<[4]x4xi8> + return %1 : vector<[4]x4xi8> +} + +// ----- + // In this test, the mapping of non-unit dimensions (1 and 2) is as follows: // 1 -> 2 // 2 -> 1 @@ -184,36 +203,10 @@ func.func @negative_shape_cast_of_transpose(%arg : vector<1x4x4x1xi8>) -> vector // ----- -// Currently the conversion shape_cast(transpose) -> shape_cast is disabled for -// scalable vectors because of bad interaction with ConvertIllegalShapeCastOpsToTransposes -// CHECK-LABEL: @negative_shape_cast_of_transpose_scalable -// CHECK: vector.transpose -// CHECK: vector.shape_cast -func.func @negative_shape_cast_of_transpose_scalable(%arg : vector<[4]x1xi8>) -> vector<[4]xi8> { - %0 = vector.transpose %arg, [1, 0] : vector<[4]x1xi8> to vector<1x[4]xi8> - %1 = vector.shape_cast %0 : vector<1x[4]xi8> to vector<[4]xi8> - return %1 : vector<[4]xi8> -} - -// ----- - /// +-------------------------------------------------------------------------- /// Tests of FoldTransposeShapeCast: transpose(shape_cast) -> shape_cast /// +-------------------------------------------------------------------------- -// The conversion transpose(shape_cast) -> shape_cast is not disabled for scalable -// vectors. 
-// CHECK-LABEL: @transpose_of_shape_cast_scalable -// CHECK: vector.shape_cast -// CHECK-SAME: vector<[4]xi8> to vector<[4]x1xi8> -func.func @transpose_of_shape_cast_scalable(%arg : vector<[4]xi8>) -> vector<[4]x1xi8> { - %0 = vector.shape_cast %arg : vector<[4]xi8> to vector<1x[4]xi8> - %1 = vector.transpose %0, [1, 0] : vector<1x[4]xi8> to vector<[4]x1xi8> - return %1 : vector<[4]x1xi8> -} - -// ----- - // A transpose that is 'order preserving' can be treated like a shape_cast. // CHECK-LABEL: @transpose_of_shape_cast // CHECK-SAME: %[[ARG:.*]]: vector<2x3x1x1xi8>) -> vector<6x1x1xi8> { @@ -229,11 +222,26 @@ func.func @transpose_of_shape_cast(%arg : vector<2x3x1x1xi8>) -> vector<6x1x1xi // ----- -// Scalable dimensions should be treated as non-unit dimensions. // CHECK-LABEL: @transpose_of_shape_cast_scalable +// CHECK-SAME: %[[ARG:.*]]: vector<[2]x3x1x1xi8>) -> vector<[6]x1x1xi8> { +// CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG]] : +// CHECK-SAME: vector<[2]x3x1x1xi8> to vector<[6]x1x1xi8> +// CHECK: return %[[SHAPE_CAST]] : vector<[6]x1x1xi8> +func.func @transpose_of_shape_cast_scalable(%arg : vector<[2]x3x1x1xi8>) -> vector<[6]x1x1xi8> { + %0 = vector.shape_cast %arg : vector<[2]x3x1x1xi8> to vector<[6]x1x1xi8> + %1 = vector.transpose %0, [0, 2, 1] + : vector<[6]x1x1xi8> to vector<[6]x1x1xi8> + return %1 : vector<[6]x1x1xi8> +} + +// ----- + +// Scalable 1 dimensions (i.e. [1]) should be treated as non-unit dimensions +// (hence no folding). 
+// CHECK-LABEL: @negative_transpose_of_shape_cast_scalable_unit // CHECK: vector.shape_cast // CHECK: vector.transpose -func.func @transpose_of_shape_cast_scalable_unit(%arg : vector<[1]x4x1xi8>) -> vector<4x[1]xi8> { +func.func @negative_transpose_of_shape_cast_scalable_unit(%arg : vector<[1]x4x1xi8>) -> vector<4x[1]xi8> { %0 = vector.shape_cast %arg : vector<[1]x4x1xi8> to vector<[1]x4xi8> %1 = vector.transpose %0, [1, 0] : vector<[1]x4xi8> to vector<4x[1]xi8> return %1 : vector<4x[1]xi8> diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index ec7cee7b2c641..4935ec8ba8e61 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1975,6 +1975,15 @@ func.func @flat_transpose_scalable(%arg0: vector<[16]xf32>) -> vector<[16]xf32> // ----- +// expected-note @+1 {{prior use here}} +func.func @vector_splat_type_mismatch(%a: f32) { + // expected-error @+1 {{expects different type than prior uses: 'i32' vs 'f32'}} + %0 = vector.splat %a : vector<1xi32> + return +} + +// ----- + //===----------------------------------------------------------------------===// // vector.load //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index c59f7bd001905..0121bcdbbba45 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -149,7 +149,7 @@ func.func @vector_transfer_ops_tensor(%arg0: tensor, } // CHECK-LABEL: @vector_broadcast -func.func @vector_broadcast(%a: f32, %b: vector, %c: vector<16xf32>, %d: vector<1x16xf32>, %e: vector<8x1xf32>) -> vector<8x16xf32> { +func.func @vector_broadcast(%a: f32, %b: vector, %c: vector<16xf32>, %d: vector<1x16xf32>, %e: vector<8x1xf32>, %f: vector<8x1x!llvm.ptr<1>>) { // CHECK: vector.broadcast %{{.*}} : f32 to vector %0 = vector.broadcast %a : f32 to vector // CHECK: vector.broadcast %{{.*}} : vector to vector<4xf32> @@ 
-162,7 +162,9 @@ func.func @vector_broadcast(%a: f32, %b: vector, %c: vector<16xf32>, %d: ve %4 = vector.broadcast %d : vector<1x16xf32> to vector<8x16xf32> // CHECK-NEXT: vector.broadcast %{{.*}} : vector<8x1xf32> to vector<8x16xf32> %5 = vector.broadcast %e : vector<8x1xf32> to vector<8x16xf32> - return %4 : vector<8x16xf32> + // CHECK-NEXT: vector.broadcast %{{.*}} : vector<8x1x!llvm.ptr<1>> to vector<8x16x!llvm.ptr<1>> + %6 = vector.broadcast %f : vector<8x1x!llvm.ptr<1>> to vector<8x16x!llvm.ptr<1>> + return } // CHECK-LABEL: @shuffle0D @@ -959,13 +961,16 @@ func.func @vector_scan(%0: vector<4x8x16x32xf32>) -> vector<4x8x16x32xf32> { } // CHECK-LABEL: func @test_splat_op -// CHECK-SAME: [[S:%arg[0-9]+]]: f32 -func.func @test_splat_op(%s : f32) { - // CHECK: vector.splat [[S]] : vector<8xf32> +// CHECK-SAME: %[[s:.*]]: f32, %[[s2:.*]]: !llvm.ptr<1> +func.func @test_splat_op(%s : f32, %s2 : !llvm.ptr<1>) { + // CHECK: vector.splat %[[s]] : vector<8xf32> %v = vector.splat %s : vector<8xf32> - // CHECK: vector.splat [[S]] : vector<4xf32> + // CHECK: vector.splat %[[s]] : vector<4xf32> %u = "vector.splat"(%s) : (f32) -> vector<4xf32> + + // CHECK: vector.splat %[[s2]] : vector<16x!llvm.ptr<1>> + %w = vector.splat %s2 : vector<16x!llvm.ptr<1>> return } diff --git a/mlir/test/IR/attribute.mlir b/mlir/test/IR/attribute.mlir index 4f280bde1aecc..edb7357e4e04b 100644 --- a/mlir/test/IR/attribute.mlir +++ b/mlir/test/IR/attribute.mlir @@ -454,6 +454,10 @@ func.func @allowed_cases_pass() { %0 = "test.i32_enum_attr"() {attr = 5: i32} : () -> i32 // CHECK: test.i32_enum_attr %1 = "test.i32_enum_attr"() {attr = 10: i32} : () -> i32 + // CHECK: test.i32_enum_attr + %2 = "test.i32_enum_attr"() {attr = 2147483648: i32} : () -> i32 + // CHECK: test.i32_enum_attr + %3 = "test.i32_enum_attr"() {attr = 4294967295: i32} : () -> i32 return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/transfer-read-scalable-non-trailing.mlir 
b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/transfer-read-scalable-non-trailing.mlir new file mode 100644 index 0000000000000..36fdb60d3e7bf --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/transfer-read-scalable-non-trailing.mlir @@ -0,0 +1,79 @@ +// REQUIRES: arm-emulator + +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: --arm-sve-legalize-vector-storage --convert-vector-to-scf --convert-scf-to-cf --convert-vector-to-llvm='enable-arm-sve' \ +// DEFINE: --expand-strided-metadata --lower-affine --convert-to-llvm --finalize-memref-to-llvm --reconcile-unrealized-casts \ +// DEFINE: -o %t + +// DEFINE: %{entry_point} = main + +// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve" \ +// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils + +// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s + +// Test the transfer_read with vector type with a non-trailing scalable +// dimension as transformed by the pattern LegalizeTransferRead. + +func.func @transfer_read_scalable_non_trailing(%vs : i32, %M : memref) { + func.call @setArmVLBits(%vs) : (i32) -> () + + // Read an LLVM-illegal vector + %c0 = arith.constant 0 : index + %c0_i8 = arith.constant 0 : i8 + %A = vector.transfer_read %M[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref, vector<[4]x8xi8> + + // Print the vector, for verification. + %B = vector.shape_cast %A : vector<[4]x8xi8> to vector<[32]xi8> + func.call @printVec(%B) : (vector<[32]xi8>) -> () + + return +} + +func.func @main() { + + %c0 = arith.constant 0 : index + +// Prepare an 8x8 buffer with test data. The test performs two reads +// of a [4]x8 vector from the buffer. One read, with vector length 128 bits, +// reads the first half the buffer. The other read, with vector length +// 256 bits, reads the entire buffer. 
+ %T = arith.constant dense<[[11, 12, 13, 14, 15, 16, 17, 18], + [21, 22, 23, 24, 25, 26, 27, 28], + [31, 32, 33, 34, 35, 36, 37, 38], + [41, 42, 43, 44, 45, 46, 47, 48], + [51, 52, 53, 54, 55, 56, 57, 58], + [61, 62, 63, 64, 65, 66, 67, 68], + [71, 72, 73, 74, 75, 76, 77, 78], + [81, 82, 83, 84, 85, 86, 87, 88]]> : vector<8x8xi8> + + %M = memref.alloca() : memref<8x8xi8> + vector.transfer_write %T, %M[%c0, %c0] : vector<8x8xi8>, memref<8x8xi8> + %MM = memref.cast %M : memref<8x8xi8> to memref + +// CHECK-LABEL: Result(VL128): +// CHECK:( 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28 ) +// CHECK:( 31, 32, 33, 34, 35, 36, 37, 38, 41, 42, 43, 44, 45, 46, 47, 48 ) + vector.print str "Result(VL128):\n" + %c128 = arith.constant 128 : i32 + func.call @transfer_read_scalable_non_trailing(%c128, %MM) : (i32, memref) -> () + +// CHECK-LABEL: Result(VL256): +// CHECK: ( 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 41, 42, 43, 44, 45, 46, 47, 48 ) +// CHECK: ( 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73, 74, 75, 76, 77, 78, 81, 82, 83, 84, 85, 86, 87, 88 ) + vector.print str "Result(VL256):\n" + %c256 = arith.constant 256 : i32 + func.call @transfer_read_scalable_non_trailing(%c256, %MM) : (i32, memref) -> () + + return +} + +func.func private @printVec(%v : vector<[32]xi8>) { + %v0 = vector.scalable.extract %v[0] : vector<[16]xi8> from vector<[32]xi8> + %v1 = vector.scalable.extract %v[16] : vector<[16]xi8> from vector<[32]xi8> + vector.print %v0 : vector<[16]xi8> + vector.print %v1 : vector<[16]xi8> + return +} + +func.func private @setArmVLBits(%bits : i32) diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 77e52946b830f..0f69875d596f1 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ 
b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --transform-interpreter --cse --split-input-file %s | FileCheck %s +// RUN: mlir-opt --transform-interpreter --cse --split-input-file --verify-diagnostics %s | FileCheck %s #map = affine_map<(d0) -> (d0)> module { @@ -620,3 +620,294 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func @multi_slice_fusion1(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor, %arg3 : index) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %dim0 = tensor.dim %arg0, %c0 : tensor + %dim1 = tensor.dim %arg0, %c1 : tensor + %loop:2 = scf.forall (%iv0) = (%c0) to (%dim0) step (%arg3) shared_outs(%init0 = %arg1, %init1 = %arg2) -> (tensor, tensor) { + %tilesize = affine.min affine_map<(d0)[s0, s1] -> (s1, s0 - d0)>(%iv0)[%dim0, %arg3] + %arg0_slice = tensor.extract_slice %arg0[%iv0, 0] [%tilesize, %dim1] [1, 1] : tensor to tensor + %init0_slice = tensor.extract_slice %init0[%iv0] [%tilesize] [1] : tensor to tensor + %init1_slice = tensor.extract_slice %init1[%iv0] [%tilesize] [1] : tensor to tensor + %generic:2 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"]} + ins(%arg0_slice : tensor) outs(%init0_slice, %init1_slice : tensor, tensor) { + ^bb0(%b0 : f32, %b1 : f32, %b2 : f32): + %0 = arith.mulf %b0, %b1 : f32 + %1 = arith.addf %b0, %b2 : f32 + linalg.yield %0, %1 : f32, f32 + } -> (tensor, tensor) + scf.forall.in_parallel { + tensor.parallel_insert_slice %generic#0 into %init0[%iv0] [%tilesize] [1] : tensor into tensor + tensor.parallel_insert_slice %generic#1 into %init1[%iv0] [%tilesize] [1] : tensor into tensor + } + } + %empty = tensor.empty(%dim0) : tensor + %result = linalg.generic { + indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], + 
iterator_types = ["parallel"]} + ins(%loop#0, %loop#1 : tensor, tensor) outs(%empty : tensor) { + ^bb0(%b0 : f32, %b1 : f32, %b2 : f32): + %0 = arith.addf %b0, %b1 : f32 + linalg.yield %0 : f32 + } -> tensor + return %result : tensor +} +// CHECK-LABEL: func @multi_slice_fusion1( +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK: %[[C0:.+]] = arith.constant 0 +// CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM0]]) +// CHECK: %[[RESULT:.+]]:3 = scf.forall (%[[IV:.+]]) = +// CHECK-SAME: , %[[INIT:[a-zA-Z0-9]+]] = %[[EMPTY]]) +// CHECK: %[[TILESIZE:.+]] = affine.min +// CHECK-DAG: %[[GENERIC:.+]]:2 = linalg.generic +// CHECK-DAG: %[[INIT_SLICE:.+]] = tensor.extract_slice %[[INIT]][%[[IV]]] [%[[TILESIZE]]] +// CHECK: %[[FUSED:.+]] = linalg.generic +// CHECK-SAME: ins(%[[GENERIC]]#0, %[[GENERIC]]#1 : +// CHECK: tensor.parallel_insert_slice %[[FUSED]] into %[[INIT]][%[[IV]]] [%[[TILESIZE]]] +// CHECK: return %[[RESULT]]#2 + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield0, %yield1 = transform.split_handle %yield : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %a, %b = transform.test.fuse_consumer %yield0, %yield1 in (%loop) + : (!transform.any_op, !transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// Check that when the given operand tiles are inconsistent, tiling fails. 
+ +func.func @multi_slice_fusion2(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor, %arg3 : index) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %dim0 = tensor.dim %arg0, %c0 : tensor + %dim1 = tensor.dim %arg0, %c1 : tensor + %loop:2 = scf.forall (%iv0) = (%c0) to (%dim0) step (%arg3) shared_outs(%init0 = %arg1, %init1 = %arg2) -> (tensor, tensor) { + %tilesize = affine.min affine_map<(d0)[s0, s1] -> (s1, s0 - d0)>(%iv0)[%dim0, %arg3] + %arg0_slice = tensor.extract_slice %arg0[%iv0, 0] [%tilesize, %dim1] [1, 1] : tensor to tensor + %init0_slice = tensor.extract_slice %init0[%iv0] [%tilesize] [1] : tensor to tensor + %generic0 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"]} + ins(%arg0_slice : tensor) outs(%init0_slice : tensor) { + ^bb0(%b0 : f32, %b1 : f32): + %0 = arith.mulf %b0, %b1 : f32 + linalg.yield %0 : f32 + } -> tensor + %init1_slice = tensor.extract_slice %init1[%iv0] [%tilesize] [1] : tensor to tensor + %generic1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"]} + ins(%arg0_slice : tensor) outs(%init1_slice: tensor) { + ^bb0(%b0 : f32, %b1 : f32): + %0 = arith.addf %b0, %b1 : f32 + linalg.yield %0: f32 + } -> tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %generic0 into %init0[%iv0] [%tilesize] [1] : tensor into tensor + tensor.parallel_insert_slice %generic1 into %init1[%iv0] [%tilesize] [1] : tensor into tensor + } + } + %empty = tensor.empty(%dim0) : tensor + %result = linalg.generic { + indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], + iterator_types = ["parallel"]} + ins(%loop#0, %loop#1 : tensor, tensor) outs(%empty : tensor) { + ^bb0(%b0 : f32, %b1 : f32, %b2 : f32): + %0 = arith.addf %b0, %b1 : f32 + linalg.yield %0 : f32 + } -> tensor + return %result 
: tensor +} +// CHECK-LABEL: func @multi_slice_fusion2( +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK: %[[C0:.+]] = arith.constant 0 +// CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM0]]) +// CHECK: %[[RESULT:.+]]:3 = scf.forall (%[[IV:.+]]) = +// CHECK-SAME: , %[[INIT:[a-zA-Z0-9]+]] = %[[EMPTY]]) +// CHECK: %[[TILESIZE:.+]] = affine.min +// CHECK: %[[GENERIC0:.+]] = linalg.generic +// CHECK: %[[GENERIC1:.+]] = linalg.generic +// CHECK-DAG: %[[INIT_SLICE:.+]] = tensor.extract_slice %[[INIT]][%[[IV]]] [%[[TILESIZE]]] +// CHECK: %[[FUSED:.+]] = linalg.generic +// CHECK-SAME: ins(%[[GENERIC0]], %[[GENERIC1]] : +// CHECK: tensor.parallel_insert_slice %[[FUSED]] into %[[INIT]][%[[IV]]] [%[[TILESIZE]]] +// CHECK: return %[[RESULT]]#2 + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield0, %yield1 = transform.split_handle %yield : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %a, %b = transform.test.fuse_consumer %yield0, %yield1 in (%loop) + : (!transform.any_op, !transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +func.func @multi_slice_fusion_with_broadcast(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor, + %arg3 : index, %arg4 : index) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %dim0 = tensor.dim %arg0, %c0 : tensor + %dim1 = tensor.dim %arg0, %c1 : tensor + %dim2 = tensor.dim %arg0, %c2 : tensor + %loop:2 = scf.forall (%iv0, %iv1) = (%c0, %c0) to (%dim0, %dim1) step (%arg3, %arg4) + shared_outs(%init0 = %arg1, %init1 = 
%arg2) -> (tensor, tensor) { + %tilesize0 = affine.min affine_map<(d0)[s0, s1] -> (s1, s0 - d0)>(%iv0)[%dim0, %arg3] + %tilesize1 = affine.min affine_map<(d0)[s0, s1] -> (s1, s0 - d0)>(%iv1)[%dim1, %arg4] + %arg0_slice = tensor.extract_slice %arg0[%iv0, %iv1, 0] [%tilesize0, %tilesize1, %dim2] [1, 1, 1] + : tensor to tensor + %init0_slice = tensor.extract_slice %init0[%iv0, %iv1] [%tilesize0, %tilesize1] [1, 1] + : tensor to tensor + %generic0 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%arg0_slice : tensor) outs(%init0_slice : tensor) { + ^bb0(%b0 : f32, %b1 : f32): + %0 = arith.mulf %b0, %b1 : f32 + linalg.yield %0 : f32 + } -> tensor + %init1_slice = tensor.extract_slice %init1[%iv0] [%tilesize0] [1] : tensor to tensor + %generic1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"]} + ins(%generic0 : tensor) outs(%init1_slice: tensor) { + ^bb0(%b0 : f32, %b1 : f32): + %0 = arith.addf %b0, %b1 : f32 + linalg.yield %0: f32 + } -> tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %generic0 into %init0[%iv0, %iv1] [%tilesize0, %tilesize1] [1, 1] + : tensor into tensor + tensor.parallel_insert_slice %generic1 into %init1[%iv0] [%tilesize0] [1] : tensor into tensor + } + } + %empty = tensor.empty(%dim0, %dim1) : tensor + %result = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%loop#0, %loop#1 : tensor, tensor) outs(%empty : tensor) { + ^bb0(%b0 : f32, %b1 : f32, %b2 : f32): + %0 = arith.addf %b0, %b1 : f32 + linalg.yield %0 : f32 + } -> tensor + return %result : tensor +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : 
!transform.any_op {transform.readonly}) { + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield0, %yield1 = transform.split_handle %yield : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %a, %b = transform.test.fuse_consumer %yield0, %yield1 in (%loop) + : (!transform.any_op, !transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK-LABEL: func @multi_slice_fusion_with_broadcast( +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 +// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] +// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]] +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM0]], %[[DIM1]]) +// CHECK: %[[RESULT:.+]]:3 = scf.forall (%[[IV0:[a-zA-Z0-9]+]], %[[IV1:[a-zA-Z0-9]+]]) = +// CHECK-SAME: , %[[INIT:[a-zA-Z0-9]+]] = %[[EMPTY]]) +// CHECK-DAG: %[[TILESIZE0:.+]] = affine.min {{.+}}(%[[IV0]]) +// CHECK-DAG: %[[TILESIZE1:.+]] = affine.min {{.+}}(%[[IV1]]) +// CHECK: %[[GENERIC0:.+]] = linalg.generic +// CHECK: %[[GENERIC1:.+]] = linalg.generic +// CHECK-DAG: %[[INIT_SLICE:.+]] = tensor.extract_slice %[[INIT]][%[[IV0]], %[[IV1]]] [%[[TILESIZE0]], %[[TILESIZE1]]] +// CHECK: %[[FUSED:.+]] = linalg.generic +// CHECK-SAME: ins(%[[GENERIC0]], %[[GENERIC1]] : +// CHECK: tensor.parallel_insert_slice %[[FUSED]] into %[[INIT]][%[[IV0]], %[[IV1]]] [%[[TILESIZE0]], %[[TILESIZE1]]] +// CHECK: return %[[RESULT]]#2 + +// ----- + +func.func @multi_slice_fusion_invalid(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor, + %arg3 : index, %arg4 : index) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %dim0 = tensor.dim %arg0, %c0 : tensor + %dim1 = tensor.dim %arg0, 
%c1 : tensor + %dim2 = tensor.dim %arg0, %c2 : tensor + %loop:2 = scf.forall (%iv0, %iv1) = (%c0, %c0) to (%dim0, %dim1) step (%arg3, %arg4) + shared_outs(%init0 = %arg1, %init1 = %arg2) -> (tensor, tensor) { + %tilesize0 = affine.min affine_map<(d0)[s0, s1] -> (s1, s0 - d0)>(%iv0)[%dim0, %arg3] + %tilesize1 = affine.min affine_map<(d0)[s0, s1] -> (s1, s0 - d0)>(%iv1)[%dim1, %arg4] + %arg0_slice = tensor.extract_slice %arg0[%iv0, %iv1, 0] [%tilesize0, %tilesize1, %dim2] [1, 1, 1] + : tensor to tensor + %init0_slice = tensor.extract_slice %init0[%iv0, %iv1] [%tilesize0, %tilesize1] [1, 1] + : tensor to tensor + %generic0 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%arg0_slice : tensor) outs(%init0_slice : tensor) { + ^bb0(%b0 : f32, %b1 : f32): + %0 = arith.mulf %b0, %b1 : f32 + linalg.yield %0 : f32 + } -> tensor + %init1_slice = tensor.extract_slice %init1[%iv0, %iv1] [%tilesize0, %tilesize1] [1, 1] + : tensor to tensor + %generic1 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%arg0_slice : tensor) outs(%init1_slice: tensor) { + ^bb0(%b0 : f32, %b1 : f32): + %0 = arith.addf %b0, %b1 : f32 + linalg.yield %0: f32 + } -> tensor + scf.forall.in_parallel { + // expected-error @below {{failed to fuse consumer of slice}} + tensor.parallel_insert_slice %generic0 into %init0[%iv0, %iv1] [%tilesize0, %tilesize1] [1, 1] + : tensor into tensor + tensor.parallel_insert_slice %generic1 into %init1[%iv0, %iv1] [%tilesize0, %tilesize1] [1, 1] + : tensor into tensor + } + } + %empty = tensor.empty(%dim0, %dim1) : tensor + %result = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", 
"parallel"]} + ins(%loop#0, %loop#1 : tensor, tensor) outs(%empty : tensor) { + ^bb0(%b0 : f32, %b1 : f32, %b2 : f32): + %0 = arith.addf %b0, %b1 : f32 + linalg.yield %0 : f32 + } -> tensor + return %result : tensor +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %yield0, %yield1 = transform.split_handle %yield : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %a, %b = transform.test.fuse_consumer %yield0, %yield1 in (%loop) + : (!transform.any_op, !transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll b/mlir/test/Target/LLVMIR/Import/import-failure.ll index a05a2b4bd4507..d48be66f2063e 100644 --- a/mlir/test/Target/LLVMIR/Import/import-failure.ll +++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll @@ -258,22 +258,6 @@ end: ; // ----- -; CHECK: -; CHECK-SAME: warning: expected function_entry_count to be attached to a function -; CHECK: warning: unhandled metadata: !0 = !{!"function_entry_count", i64 42} -define void @cond_br(i1 %arg) { -entry: - br i1 %arg, label %bb1, label %bb2, !prof !0 -bb1: - ret void -bb2: - ret void -} - -!0 = !{!"function_entry_count", i64 42} - -; // ----- - ; CHECK: ; CHECK-SAME: warning: dropped instruction: call void @llvm.experimental.noalias.scope.decl(metadata !0) define void @unused_scope() { diff --git a/mlir/test/Target/LLVMIR/nvvm/elect.mlir b/mlir/test/Target/LLVMIR/nvvm/elect.mlir new file mode 100644 index 0000000000000..3c5cac4b650bb --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/elect.mlir @@ -0,0 +1,20 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | 
FileCheck %s + +// CHECK-LABEL: @test_nvvm_elect_sync +llvm.func @test_nvvm_elect_sync() -> i1 { + // CHECK: %[[RES:.*]] = call { i32, i1 } @llvm.nvvm.elect.sync(i32 -1) + // CHECK-NEXT: %[[PRED:.*]] = extractvalue { i32, i1 } %[[RES]], 1 + // CHECK-NEXT: ret i1 %[[PRED]] + %0 = nvvm.elect.sync -> i1 + llvm.return %0 : i1 +} + +// CHECK-LABEL: @test_nvvm_elect_sync_mask +llvm.func @test_nvvm_elect_sync_mask(%mask : i32) -> i1 { + // CHECK: %[[RES:.*]] = call { i32, i1 } @llvm.nvvm.elect.sync(i32 %0) + // CHECK-NEXT: %[[PRED:.*]] = extractvalue { i32, i1 } %[[RES]], 1 + // CHECK-NEXT: ret i1 %[[PRED]] + %0 = nvvm.elect.sync %mask -> i1 + llvm.return %0 : i1 +} + diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 660d0a22dce9c..f86a04186f512 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -265,15 +265,6 @@ llvm.func @nvvm_vote(%0 : i32, %1 : i1) -> i32 { llvm.return %3 : i32 } -// CHECK-LABEL: @nvvm_elect_sync -llvm.func @nvvm_elect_sync() -> i1 { - // CHECK: %[[RES:.*]] = call { i32, i1 } @llvm.nvvm.elect.sync(i32 -1) - // CHECK-NEXT: %[[PRED:.*]] = extractvalue { i32, i1 } %[[RES]], 1 - // CHECK-NEXT: ret i1 %[[PRED]] - %0 = nvvm.elect.sync -> i1 - llvm.return %0 : i1 -} - // CHECK-LABEL: @nvvm_mma_mn8n8k4_row_col_f32_f32 llvm.func @nvvm_mma_mn8n8k4_row_col_f32_f32(%a0 : vector<2xf16>, %a1 : vector<2xf16>, %b0 : vector<2xf16>, %b1 : vector<2xf16>, diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir index 971bea2068544..e6ea3aaeec656 100644 --- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir @@ -1,15 +1,17 @@ // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s -llvm.func @_QPopenmp_target_data() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = 
"_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr - %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%2 : !llvm.ptr) { - %3 = llvm.mlir.constant(99 : i32) : i32 - llvm.store %3, %1 : i32, !llvm.ptr - omp.terminator +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_data() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%2 : !llvm.ptr) { + %3 = llvm.mlir.constant(99 : i32) : i32 + llvm.store %3, %1 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4] @@ -38,23 +40,25 @@ llvm.func @_QPopenmp_target_data() { // ----- -llvm.func @_QPopenmp_target_data_region(%0 : !llvm.ptr) { - %1 = llvm.mlir.constant(1023 : index) : i64 - %2 = llvm.mlir.constant(0 : index) : i64 - %3 = llvm.mlir.constant(1024 : index) : i64 - %4 = llvm.mlir.constant(1 : index) : i64 - %5 = omp.map.bounds lower_bound(%2 : i64) upper_bound(%1 : i64) extent(%3 : i64) stride(%4 : i64) start_idx(%4 : i64) - %6 = omp.map.info var_ptr(%0 : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(from) capture(ByRef) bounds(%5) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%6 : !llvm.ptr) { - %7 = llvm.mlir.constant(99 : i32) : i32 - %8 = llvm.mlir.constant(1 : i64) : i64 - %9 = llvm.mlir.constant(1 : i64) : i64 - %10 = llvm.mlir.constant(0 : i64) : i64 - %11 = llvm.getelementptr %0[0, %10] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<1024 x i32> - llvm.store %7, %11 : i32, !llvm.ptr - omp.terminator +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func 
@_QPopenmp_target_data_region(%0 : !llvm.ptr) { + %1 = llvm.mlir.constant(1023 : index) : i64 + %2 = llvm.mlir.constant(0 : index) : i64 + %3 = llvm.mlir.constant(1024 : index) : i64 + %4 = llvm.mlir.constant(1 : index) : i64 + %5 = omp.map.bounds lower_bound(%2 : i64) upper_bound(%1 : i64) extent(%3 : i64) stride(%4 : i64) start_idx(%4 : i64) + %6 = omp.map.info var_ptr(%0 : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(from) capture(ByRef) bounds(%5) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%6 : !llvm.ptr) { + %7 = llvm.mlir.constant(99 : i32) : i32 + %8 = llvm.mlir.constant(1 : i64) : i64 + %9 = llvm.mlir.constant(1 : i64) : i64 + %10 = llvm.mlir.constant(0 : i64) : i64 + %11 = llvm.getelementptr %0[0, %10] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<1024 x i32> + llvm.store %7, %11 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4096] @@ -85,50 +89,52 @@ llvm.func @_QPopenmp_target_data_region(%0 : !llvm.ptr) { // ----- -llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr, %3 : !llvm.ptr) { - %4 = llvm.mlir.constant(1 : i64) : i64 - %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr - %6 = llvm.mlir.constant(1 : i64) : i64 - %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr - %8 = llvm.mlir.constant(5 : i32) : i32 - llvm.store %8, %7 : i32, !llvm.ptr - %9 = llvm.mlir.constant(2 : i32) : i32 - llvm.store %9, %5 : i32, !llvm.ptr - %10 = llvm.load %7 : !llvm.ptr -> i32 - %11 = llvm.mlir.constant(10 : i32) : i32 - %12 = llvm.icmp "slt" %10, %11 : i32 - %13 = llvm.load %5 : !llvm.ptr -> i32 - %14 = llvm.mlir.constant(1023 : index) : i64 - %15 = llvm.mlir.constant(0 : index) : i64 - %16 = llvm.mlir.constant(1024 : index) : i64 - %17 = llvm.mlir.constant(1 
: index) : i64 - %18 = omp.map.bounds lower_bound(%15 : i64) upper_bound(%14 : i64) extent(%16 : i64) stride(%17 : i64) start_idx(%17 : i64) - %map1 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(to) capture(ByRef) bounds(%18) -> !llvm.ptr {name = ""} - %19 = llvm.mlir.constant(511 : index) : i64 - %20 = llvm.mlir.constant(0 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = llvm.mlir.constant(1 : index) : i64 - %23 = omp.map.bounds lower_bound(%20 : i64) upper_bound(%19 : i64) extent(%21 : i64) stride(%22 : i64) start_idx(%22 : i64) - %map2 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%23) -> !llvm.ptr {name = ""} - omp.target_enter_data if(%12) device(%13 : i32) map_entries(%map1, %map2 : !llvm.ptr, !llvm.ptr) - %24 = llvm.load %7 : !llvm.ptr -> i32 - %25 = llvm.mlir.constant(10 : i32) : i32 - %26 = llvm.icmp "sgt" %24, %25 : i32 - %27 = llvm.load %5 : !llvm.ptr -> i32 - %28 = llvm.mlir.constant(1023 : index) : i64 - %29 = llvm.mlir.constant(0 : index) : i64 - %30 = llvm.mlir.constant(1024 : index) : i64 - %31 = llvm.mlir.constant(1 : index) : i64 - %32 = omp.map.bounds lower_bound(%29 : i64) upper_bound(%28 : i64) extent(%30 : i64) stride(%31 : i64) start_idx(%31 : i64) - %map3 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(from) capture(ByRef) bounds(%32) -> !llvm.ptr {name = ""} - %33 = llvm.mlir.constant(511 : index) : i64 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(512 : index) : i64 - %36 = llvm.mlir.constant(1 : index) : i64 - %37 = omp.map.bounds lower_bound(%34 : i64) upper_bound(%33 : i64) extent(%35 : i64) stride(%36 : i64) start_idx(%36 : i64) - %map4 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%37) -> !llvm.ptr {name = ""} - omp.target_exit_data if(%26) device(%27 : i32) map_entries(%map3, 
%map4 : !llvm.ptr, !llvm.ptr) - llvm.return +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr, %3 : !llvm.ptr) { + %4 = llvm.mlir.constant(1 : i64) : i64 + %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr + %8 = llvm.mlir.constant(5 : i32) : i32 + llvm.store %8, %7 : i32, !llvm.ptr + %9 = llvm.mlir.constant(2 : i32) : i32 + llvm.store %9, %5 : i32, !llvm.ptr + %10 = llvm.load %7 : !llvm.ptr -> i32 + %11 = llvm.mlir.constant(10 : i32) : i32 + %12 = llvm.icmp "slt" %10, %11 : i32 + %13 = llvm.load %5 : !llvm.ptr -> i32 + %14 = llvm.mlir.constant(1023 : index) : i64 + %15 = llvm.mlir.constant(0 : index) : i64 + %16 = llvm.mlir.constant(1024 : index) : i64 + %17 = llvm.mlir.constant(1 : index) : i64 + %18 = omp.map.bounds lower_bound(%15 : i64) upper_bound(%14 : i64) extent(%16 : i64) stride(%17 : i64) start_idx(%17 : i64) + %map1 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(to) capture(ByRef) bounds(%18) -> !llvm.ptr {name = ""} + %19 = llvm.mlir.constant(511 : index) : i64 + %20 = llvm.mlir.constant(0 : index) : i64 + %21 = llvm.mlir.constant(512 : index) : i64 + %22 = llvm.mlir.constant(1 : index) : i64 + %23 = omp.map.bounds lower_bound(%20 : i64) upper_bound(%19 : i64) extent(%21 : i64) stride(%22 : i64) start_idx(%22 : i64) + %map2 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%23) -> !llvm.ptr {name = ""} + omp.target_enter_data if(%12) device(%13 : i32) map_entries(%map1, %map2 : !llvm.ptr, !llvm.ptr) + %24 = llvm.load %7 : !llvm.ptr -> i32 + %25 = llvm.mlir.constant(10 : i32) : i32 + %26 
= llvm.icmp "sgt" %24, %25 : i32 + %27 = llvm.load %5 : !llvm.ptr -> i32 + %28 = llvm.mlir.constant(1023 : index) : i64 + %29 = llvm.mlir.constant(0 : index) : i64 + %30 = llvm.mlir.constant(1024 : index) : i64 + %31 = llvm.mlir.constant(1 : index) : i64 + %32 = omp.map.bounds lower_bound(%29 : i64) upper_bound(%28 : i64) extent(%30 : i64) stride(%31 : i64) start_idx(%31 : i64) + %map3 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(from) capture(ByRef) bounds(%32) -> !llvm.ptr {name = ""} + %33 = llvm.mlir.constant(511 : index) : i64 + %34 = llvm.mlir.constant(0 : index) : i64 + %35 = llvm.mlir.constant(512 : index) : i64 + %36 = llvm.mlir.constant(1 : index) : i64 + %37 = omp.map.bounds lower_bound(%34 : i64) upper_bound(%33 : i64) extent(%35 : i64) stride(%36 : i64) start_idx(%36 : i64) + %map4 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%37) -> !llvm.ptr {name = ""} + omp.target_exit_data if(%26) device(%27 : i32) map_entries(%map3, %map4 : !llvm.ptr, !llvm.ptr) + llvm.return + } } // CHECK: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4096, i64 2048] @@ -205,18 +211,20 @@ llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr, %3 : !llvm.ptr) { // ----- -llvm.func @_QPopenmp_target_use_dev_ptr() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr - %map1 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map1 : !llvm.ptr) use_device_ptr(%map2 -> %arg0 : !llvm.ptr) { - %1 = llvm.mlir.constant(10 : i32) : i32 - %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr - llvm.store %1, %2 : i32, !llvm.ptr - omp.terminator +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func 
@_QPopenmp_target_use_dev_ptr() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %map1 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map1 : !llvm.ptr) use_device_ptr(%map2 -> %arg0 : !llvm.ptr) { + %1 = llvm.mlir.constant(10 : i32) : i32 + %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr + llvm.store %1, %2 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 8] @@ -249,18 +257,20 @@ llvm.func @_QPopenmp_target_use_dev_ptr() { // ----- -llvm.func @_QPopenmp_target_use_dev_addr() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr - %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr) { - %1 = llvm.mlir.constant(10 : i32) : i32 - %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr - llvm.store %1, %2 : i32, !llvm.ptr - omp.terminator +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_use_dev_addr() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr) { + %1 = llvm.mlir.constant(10 : i32) : i32 + %2 = llvm.load %arg0 : 
!llvm.ptr -> !llvm.ptr + llvm.store %1, %2 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 8] @@ -291,17 +301,19 @@ llvm.func @_QPopenmp_target_use_dev_addr() { // ----- -llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %a = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr - %map = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - %map2 = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr) { - %1 = llvm.mlir.constant(10 : i32) : i32 - llvm.store %1, %arg0 : i32, !llvm.ptr - omp.terminator +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %a = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr + %map = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr) { + %1 = llvm.mlir.constant(10 : i32) : i32 + llvm.store %1, %arg0 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4] @@ -331,23 +343,25 @@ llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() { // ----- -llvm.func @_QPopenmp_target_use_dev_addr_nomap() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr - %1 = llvm.mlir.constant(1 : i64) : i64 - %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr - %map = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = 
""} - %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr) { - %2 = llvm.mlir.constant(10 : i32) : i32 - %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr - llvm.store %2, %3 : i32, !llvm.ptr - %4 = llvm.mlir.constant(20 : i32) : i32 - %5 = llvm.load %b : !llvm.ptr -> !llvm.ptr - llvm.store %4, %5 : i32, !llvm.ptr - omp.terminator +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_use_dev_addr_nomap() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %1 = llvm.mlir.constant(1 : i64) : i64 + %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %map = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr) { + %2 = llvm.mlir.constant(10 : i32) : i32 + %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr + llvm.store %2, %3 : i32, !llvm.ptr + %4 = llvm.mlir.constant(20 : i32) : i32 + %5 = llvm.load %b : !llvm.ptr -> !llvm.ptr + llvm.store %4, %5 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 8, i64 0] @@ -387,25 +401,27 @@ llvm.func @_QPopenmp_target_use_dev_addr_nomap() { // ----- -llvm.func @_QPopenmp_target_use_dev_both() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr - %1 = llvm.mlir.constant(1 : i64) : i64 - %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr - %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - %map1 = omp.map.info var_ptr(%b : !llvm.ptr, 
!llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - %map3 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_addr(%map3 -> %arg0 : !llvm.ptr) use_device_ptr(%map2 -> %arg1 : !llvm.ptr) { - %2 = llvm.mlir.constant(10 : i32) : i32 - %3 = llvm.load %arg1 : !llvm.ptr -> !llvm.ptr - llvm.store %2, %3 : i32, !llvm.ptr - %4 = llvm.mlir.constant(20 : i32) : i32 - %5 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr - llvm.store %4, %5 : i32, !llvm.ptr - omp.terminator +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_use_dev_both() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %1 = llvm.mlir.constant(1 : i64) : i64 + %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %map1 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %map3 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_addr(%map3 -> %arg0 : !llvm.ptr) use_device_ptr(%map2 -> %arg1 : !llvm.ptr) { + %2 = llvm.mlir.constant(10 : i32) : i32 + %3 = llvm.load %arg1 : !llvm.ptr -> !llvm.ptr + llvm.store %2, %3 : i32, !llvm.ptr + %4 = llvm.mlir.constant(20 : i32) : i32 + %5 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr + llvm.store %4, %5 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = 
private unnamed_addr constant [2 x i64] [i64 8, i64 8] @@ -448,19 +464,21 @@ llvm.func @_QPopenmp_target_use_dev_both() { // ----- -llvm.func @_QPopenmp_target_data_update() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr - %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%2 : !llvm.ptr) { - %3 = llvm.mlir.constant(99 : i32) : i32 - llvm.store %3, %1 : i32, !llvm.ptr - omp.terminator - } +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_data_update() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%2 : !llvm.ptr) { + %3 = llvm.mlir.constant(99 : i32) : i32 + llvm.store %3, %1 : i32, !llvm.ptr + omp.terminator + } - omp.target_update map_entries(%2 : !llvm.ptr) + omp.target_update map_entries(%2 : !llvm.ptr) - llvm.return + llvm.return + } } // CHECK-LABEL: define void @_QPopenmp_target_data_update @@ -488,26 +506,28 @@ llvm.func @_QPopenmp_target_data_update() { // ----- -omp.declare_mapper @_QQFmy_testmy_mapper : !llvm.struct<"_QFmy_testTmy_type", (i32)> { -^bb0(%arg0: !llvm.ptr): - %0 = llvm.mlir.constant(0 : i32) : i32 - %1 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)> - %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var%data"} - %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) members(%2 : [0] : !llvm.ptr) -> !llvm.ptr {name = 
"var", partial_map = true} - omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr) -} +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + omp.declare_mapper @_QQFmy_testmy_mapper : !llvm.struct<"_QFmy_testTmy_type", (i32)> { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)> + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var%data"} + %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) members(%2 : [0] : !llvm.ptr) -> !llvm.ptr {name = "var", partial_map = true} + omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr) + } -llvm.func @_QPopenmp_target_data_mapper() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x !llvm.struct<"_QFmy_testTmy_type", (i32)> {bindc_name = "a"} : (i64) -> !llvm.ptr - %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) mapper(@_QQFmy_testmy_mapper) -> !llvm.ptr {name = "a"} - omp.target_data map_entries(%2 : !llvm.ptr) { - %3 = llvm.mlir.constant(10 : i32) : i32 - %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)> - llvm.store %3, %4 : i32, !llvm.ptr - omp.terminator + llvm.func @_QPopenmp_target_data_mapper() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<"_QFmy_testTmy_type", (i32)> {bindc_name = "a"} : (i64) -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) mapper(@_QQFmy_testmy_mapper) -> !llvm.ptr {name = "a"} + omp.target_data map_entries(%2 : !llvm.ptr) { + %3 = llvm.mlir.constant(10 : i32) : i32 + %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", 
(i32)> + llvm.store %3, %4 : i32, !llvm.ptr + omp.terminator + } + llvm.return } - llvm.return } // CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4] diff --git a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir index dba8c553aaca5..f5c620a8942d7 100644 --- a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir @@ -1,13 +1,15 @@ // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s 2>&1 | FileCheck %s -llvm.func @_QPopenmp_target_data_enter() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr - %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_data_enter() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_enter_data map_entries(%2 : !llvm.ptr) nowait + omp.target_enter_data map_entries(%2 : !llvm.ptr) nowait - llvm.return + llvm.return + } } // CHECK: define void @_QPopenmp_target_data_enter() { @@ -32,14 +34,16 @@ llvm.func @_QPopenmp_target_data_enter() { // ----- -llvm.func @_QPopenmp_target_data_update() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr - %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} 
{ + llvm.func @_QPopenmp_target_data_update() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_update map_entries(%2 : !llvm.ptr) nowait + omp.target_update map_entries(%2 : !llvm.ptr) nowait - llvm.return + llvm.return + } } // CHECK: define void @_QPopenmp_target_data_update() { @@ -64,14 +68,16 @@ llvm.func @_QPopenmp_target_data_update() { // ----- -llvm.func @_QPopenmp_target_data_exit() { - %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr - %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @_QPopenmp_target_data_exit() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_exit_data map_entries(%2 : !llvm.ptr) nowait + omp.target_exit_data map_entries(%2 : !llvm.ptr) nowait - llvm.return + llvm.return + } } // CHECK: define void @_QPopenmp_target_data_exit() { diff --git a/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir b/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir index 717a77e61b9a1..53c9b4f559645 100644 --- a/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir +++ b/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir @@ -3,7 +3,7 @@ // This tests checks that a target op inside a data op // We are only interested in ensuring that the 
-mlir-to-llmvir pass doesn't crash. // CHECK: {{.*}} = add i32 {{.*}}, 1 -module attributes { } { +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { llvm.mlir.global weak_odr hidden local_unnamed_addr constant @__oclc_ABI_version(400 : i32) {addr_space = 4 : i32} : i32 llvm.func @_QQmain() attributes {fir.bindc_name = "main", omp.declare_target = #omp.declaretarget} { %0 = llvm.mlir.constant(99 : index) : i64 diff --git a/mlir/test/Target/LLVMIR/openmp-nested-task-target-parallel.mlir b/mlir/test/Target/LLVMIR/openmp-nested-task-target-parallel.mlir new file mode 100644 index 0000000000000..1589778e0627f --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-nested-task-target-parallel.mlir @@ -0,0 +1,62 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s +// This tests the fix for https://github.com/llvm/llvm-project/issues/138102 +// We are only interested in ensuring that the -mlir-to-llvmir pass doesn't crash + +// CHECK-LABEL: define internal void @_QQmain..omp_par + +omp.private {type = private} @_QFEi_private_i32 : i32 +omp.private {type = firstprivate} @_QFEc_firstprivate_i32 : i32 copy { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): +%0 = llvm.load %arg0 : !llvm.ptr -> i32 +llvm.store %0, %arg1 : i32, !llvm.ptr +omp.yield(%arg1 : !llvm.ptr) +} +llvm.func @_QQmain() { +%0 = llvm.mlir.constant(1 : i64) : i64 +%1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr +%2 = llvm.mlir.constant(1 : i64) : i64 +%3 = llvm.alloca %2 x i32 {bindc_name = "c"} : (i64) -> !llvm.ptr +%4 = llvm.mlir.constant(10 : index) : i64 +%5 = llvm.mlir.constant(0 : index) : i64 +%6 = llvm.mlir.constant(10000 : index) : i64 +%7 = llvm.mlir.constant(1 : index) : i64 +%8 = llvm.mlir.constant(1 : i64) : i64 +%9 = llvm.mlir.addressof @_QFECchunksz : !llvm.ptr +%10 = llvm.mlir.constant(1 : i64) : i64 +%11 = llvm.trunc %7 : i64 to i32 +llvm.br ^bb1(%11, %4 : i32, i64) +^bb1(%12: i32, %13: i64): // 2 preds: ^bb0, ^bb2 +%14 = llvm.icmp "sgt" %13, %5 : i64 
+llvm.store %12, %3 : i32, !llvm.ptr +omp.task private(@_QFEc_firstprivate_i32 %3 -> %arg0 : !llvm.ptr) { + %19 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} + %20 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "c"} + %21 = omp.map.info var_ptr(%9 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "chunksz"} + omp.target map_entries(%19 -> %arg1, %20 -> %arg2, %21 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %22 = llvm.mlir.constant(9999 : i32) : i32 + %23 = llvm.mlir.constant(1 : i32) : i32 + omp.parallel { + %24 = llvm.load %arg2 : !llvm.ptr -> i32 + %25 = llvm.add %24, %22 : i32 + omp.wsloop private(@_QFEi_private_i32 %arg1 -> %arg4 : !llvm.ptr) { + omp.loop_nest (%arg5) : i32 = (%24) to (%25) inclusive step (%23) { + llvm.store %arg5, %arg4 : i32, !llvm.ptr + omp.yield + } + } + omp.terminator + } + omp.terminator + } + omp.terminator +} +llvm.return +} +llvm.mlir.global internal constant @_QFECchunksz() {addr_space = 0 : i32} : i32 { +%0 = llvm.mlir.constant(10000 : i32) : i32 +llvm.return %0 : i32 +} +llvm.mlir.global internal constant @_QFECn() {addr_space = 0 : i32} : i32 { +%0 = llvm.mlir.constant(100000 : i32) : i32 +llvm.return %0 : i32 +} diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp index 738648b8ccdcf..684d491b532f4 100644 --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -211,8 +211,8 @@ static void applyEraseUnnecessaryInputs(func::FuncOp funcOp) { static void applyWinogradConv2D(func::FuncOp funcOp) { RewritePatternSet patterns(funcOp.getContext()); - populateWinogradConv2DPatterns(patterns, /*m=*/4, /*r=*/3); - populateWinogradConv2DPatterns(patterns, /*m=*/2, 
/*r=*/5); + populateWinogradConv2DPatterns(patterns, WinogradConv2DFmr::F_4_3); + populateWinogradConv2DPatterns(patterns, WinogradConv2DFmr::F_2_5); (void)applyPatternsGreedily(funcOp, std::move(patterns)); } diff --git a/mlir/test/lib/Dialect/Test/TestEnumDefs.td b/mlir/test/lib/Dialect/Test/TestEnumDefs.td index 5b785a600aad2..10e424a0f2523 100644 --- a/mlir/test/lib/Dialect/Test/TestEnumDefs.td +++ b/mlir/test/lib/Dialect/Test/TestEnumDefs.td @@ -17,9 +17,13 @@ include "mlir/IR/EnumAttr.td" def I32Case5: I32EnumAttrCase<"case5", 5>; def I32Case10: I32EnumAttrCase<"case10", 10>; +def I32CaseSignedMaxPlusOne + : I32EnumAttrCase<"caseSignedMaxPlusOne", 2147483648>; +def I32CaseUnsignedMax : I32EnumAttrCase<"caseUnsignedMax", 4294967295>; -def SomeI32Enum: I32EnumAttr< - "SomeI32Enum", "", [I32Case5, I32Case10]>; +def SomeI32Enum : I32EnumAttr<"SomeI32Enum", "", + [I32Case5, I32Case10, I32CaseSignedMaxPlusOne, + I32CaseUnsignedMax]>; def I64Case5: I64EnumAttrCase<"case5", 5>; def I64Case10: I64EnumAttrCase<"case10", 10>; diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 9126736d1d175..6b22b171822ae 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1014,7 +1014,7 @@ struct TestPassthroughInvalidOp : public ConversionPattern { .getResult()); } rewriter.replaceOpWithNewOp(op, TypeRange(), flattened, - std::nullopt); + ArrayRef()); return success(); } }; @@ -1030,7 +1030,7 @@ struct TestDropAndReplaceInvalidOp : public ConversionPattern { matchAndRewrite(Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { rewriter.replaceOpWithNewOp(op, TypeRange(), ValueRange(), - std::nullopt); + ArrayRef()); return success(); } }; diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp index 9971f0cde4ed2..ee3eb9522db7e 
100644 --- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp +++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp @@ -21,6 +21,9 @@ #include "mlir/IR/Dominance.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Interfaces/TilingInterface.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "test-tiling-interface" #define GET_OP_CLASSES #include "TestTilingInterfaceTransformOps.h.inc" @@ -168,29 +171,30 @@ transform::TestFuseAndYieldOp::apply(TransformRewriter &rewriter, /// Apply fusing of consumer transformation to all payload ops and store both /// the original consumer operation as well as the fused consumer operation. -template static LogicalResult applyFuseConsumer( - RewriterBase &rewriter, Operation *transformOp, Range &&payloadOps, - MutableArrayRef loops, uint32_t numConsumerToFuse, - TransformResults &transformResults) { + RewriterBase &rewriter, Operation *transformOp, + ArrayRef slices, MutableArrayRef loops, + uint32_t numConsumerToFuse, TransformResults &transformResults) { SmallVector originalConsumerOps; SmallVector fusedConsumerOps; - for (Operation *target : payloadOps) { - rewriter.setInsertionPoint(target); + rewriter.setInsertionPoint(slices.front()); - while (numConsumerToFuse--) { - FailureOr fuseConsumerResults = - scf::tileAndFuseConsumerOfSlice(rewriter, target, loops); + while (numConsumerToFuse--) { + FailureOr fuseConsumerResults = + scf::tileAndFuseConsumerOfSlices(rewriter, slices, loops); - if (failed(fuseConsumerResults)) - return failure(); + if (failed(fuseConsumerResults)) + return slices.front()->emitOpError("failed to fuse consumer of slice"); - // Report back the relevant handles to the transform op. - originalConsumerOps.push_back( - fuseConsumerResults->origConsumerOperand->getOwner()); - fusedConsumerOps.push_back( - fuseConsumerResults->tiledAndFusedConsumerOperand->getOwner()); + // Report back the relevant handles to the transform op. 
+ for (OpOperand *origConsumerOperand : + fuseConsumerResults->origConsumerOperands) { + originalConsumerOps.push_back(origConsumerOperand->getOwner()); + } + for (OpOperand *tiledAndFusedConsumerOperand : + fuseConsumerResults->tiledAndFusedConsumerOperands) { + fusedConsumerOps.push_back(tiledAndFusedConsumerOperand->getOwner()); } } @@ -203,6 +207,12 @@ DiagnosedSilenceableFailure transform::TestFuseConsumerOp::apply(TransformRewriter &rewriter, TransformResults &transformResults, TransformState &state) { + SmallVector slices; + for (auto op : getTargets()) { + auto sliceOp = *state.getPayloadOps(op).begin(); + slices.push_back(sliceOp); + } + SmallVector loops; for (auto op : llvm::reverse(getLoops())) { auto loopLikeOp = @@ -212,16 +222,16 @@ transform::TestFuseConsumerOp::apply(TransformRewriter &rewriter, } loops.push_back(loopLikeOp); } - LogicalResult result = applyFuseConsumer( - rewriter, getOperation(), state.getPayloadOps(getTarget()), loops, - getNumConsumerToFuse(), transformResults); + LogicalResult result = + applyFuseConsumer(rewriter, getOperation(), slices, loops, + getNumConsumerToFuse(), transformResults); return failed(result) ? 
DiagnosedSilenceableFailure::definiteFailure() : DiagnosedSilenceableFailure::success(); } void transform::TestFuseConsumerOp::getEffects( SmallVectorImpl &effects) { - consumesHandle(getTargetMutable(), effects); + consumesHandle(getTargetsMutable(), effects); consumesHandle(getLoopsMutable(), effects); producesHandle(getOperation()->getOpResults(), effects); modifiesPayload(effects); diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td index 98f7145c99cb1..3c09082e192ea 100644 --- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td +++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td @@ -50,7 +50,8 @@ def TestFuseAndYieldOp : Op, + [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, ReportTrackingListenerFailuresOpTrait]> { let description = [{ @@ -59,14 +60,14 @@ def TestFuseConsumerOp : Op:$targets, Variadic:$loops, DefaultValuedAttr:$num_consumer_to_fuse); let results = (outs TransformHandleTypeInterface:$consumer, TransformHandleTypeInterface:$fused_consumer); let assemblyFormat = [{ - $target `in` `(` $loops `)` + $targets `in` `(` $loops `)` (`num_consumer_to_fuse` `=` $num_consumer_to_fuse^)? 
attr-dict `:` functional-type(operands, results) }]; diff --git a/mlir/test/python/dialects/transform_debug_ext.py b/mlir/test/python/dialects/transform_debug_ext.py new file mode 100644 index 0000000000000..2dfdaed343865 --- /dev/null +++ b/mlir/test/python/dialects/transform_debug_ext.py @@ -0,0 +1,45 @@ +# RUN: %PYTHON %s | FileCheck %s + +from mlir.ir import * +from mlir.dialects import transform +from mlir.dialects.transform import debug + + +def run(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.AnyOpType.get(), + ) + with InsertionPoint(sequence.body): + f(sequence.bodyTarget) + transform.YieldOp() + print(module) + return f + + +@run +def testDebugEmitParamAsRemark(target): + i0 = IntegerAttr.get(IntegerType.get_signless(32), 0) + i0_param = transform.ParamConstantOp(transform.AnyParamType.get(), i0) + debug.emit_param_as_remark(i0_param) + debug.emit_param_as_remark(i0_param, anchor=target, message="some text") + # CHECK-LABEL: TEST: testDebugEmitParamAsRemark + # CHECK: ^{{.*}}(%[[ARG0:.+]]: !transform.any_op): + # CHECK: %[[PARAM:.*]] = transform.param.constant + # CHECK: transform.debug.emit_param_as_remark %[[PARAM]] + # CHECK: transform.debug.emit_param_as_remark %[[PARAM]] + # CHECK-SAME: "some text" + # CHECK-SAME: at %[[ARG0]] + + +@run +def testDebugEmitRemarkAtOp(target): + debug.emit_remark_at(target, "some text") + # CHECK-LABEL: TEST: testDebugEmitRemarkAtOp + # CHECK: ^{{.*}}(%[[ARG0:.+]]: !transform.any_op): + # CHECK: transform.debug.emit_remark_at %[[ARG0]], "some text" diff --git a/mlir/tools/mlir-tblgen/EnumsGen.cpp b/mlir/tools/mlir-tblgen/EnumsGen.cpp index 9941a203bc5cb..06dc588f90203 100644 --- a/mlir/tools/mlir-tblgen/EnumsGen.cpp +++ b/mlir/tools/mlir-tblgen/EnumsGen.cpp @@ -648,8 +648,10 @@ static void emitSpecializedAttrDef(const Record 
&enumDef, raw_ostream &os) { os << formatv("{0} {1}::getValue() const {{\n", enumName, attrClassName); - os << formatv(" return static_cast<{0}>(::mlir::IntegerAttr::getInt());\n", - enumName); + os << formatv( + " return " + "static_cast<{0}>(::mlir::IntegerAttr::getValue().getZExtValue());\n", + enumName); os << "}\n"; } diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td index 8c88fe6e21e6a..0476fa1f7c27a 100644 --- a/offload/liboffload/API/Program.td +++ b/offload/liboffload/API/Program.td @@ -13,7 +13,9 @@ def : Function { let name = "olCreateProgram"; let desc = "Create a program for the device from the binary image pointed to by `ProgData`."; - let details = []; + let details = [ + "The provided `ProgData` will be copied and need not outlive the returned handle", + ]; let params = [ Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>, Param<"const void*", "ProgData", "pointer to the program binary data", PARAM_IN>, diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index da2101529ffec..c2a35a245e2a7 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -480,6 +480,14 @@ Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData, } Error olDestroyProgram_impl(ol_program_handle_t Program) { + auto &Device = Program->Image->getDevice(); + if (auto Err = Device.unloadBinary(Program->Image)) + return Err; + + auto &LoadedImages = Device.LoadedImages; + LoadedImages.erase( + std::find(LoadedImages.begin(), LoadedImages.end(), Program->Image)); + return olDestroy(Program); } diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 73e1e66928fac..bc1a768feafdd 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2023,6 +2023,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return 
Plugin::success(); } + Error unloadBinaryImpl(DeviceImageTy *Image) override { + AMDGPUDeviceImageTy &AMDImage = static_cast(*Image); + + // Unload the executable of the image. + return AMDImage.unloadExecutable(); + } + /// Deinitialize the device and release its resources. Error deinitImpl() override { // Deinitialize the stream and event pools. @@ -2035,19 +2042,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = AMDGPUSignalManager.deinit()) return Err; - // Close modules if necessary. - if (!LoadedImages.empty()) { - // Each image has its own module. - for (DeviceImageTy *Image : LoadedImages) { - AMDGPUDeviceImageTy &AMDImage = - static_cast(*Image); - - // Unload the executable of the image. - if (auto Err = AMDImage.unloadExecutable()) - return Err; - } - } - // Invalidate agent reference. Agent = {0}; diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 91df800304378..fbc798faec24b 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -752,6 +752,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Expected loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0; + /// Unload a previously loaded Image from the device + Error unloadBinary(DeviceImageTy *Image); + virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0; + /// Setup the device environment if needed. Notice this setup may not be run /// on some plugins. By default, it will be executed, but plugins can change /// this behavior by overriding the shouldSetupDeviceEnvironment function. @@ -1036,6 +1040,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { BoolEnvar OMPX_TrackAllocationTraces = BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false); + /// Array of images loaded into the device. Images are automatically + /// deallocated by the allocator. 
+ llvm::SmallVector LoadedImages; + private: /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output @@ -1086,10 +1094,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_InitialNumStreams; UInt32Envar OMPX_InitialNumEvents; - /// Array of images loaded into the device. Images are automatically - /// deallocated by the allocator. - llvm::SmallVector LoadedImages; - /// The identifier of the device within the plugin. Notice this is not a /// global device id and is not the device id visible to the OpenMP user. const int32_t DeviceId; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 3e9a62f57095f..ac7031b6e881c 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -821,26 +821,49 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { return Plugin::success(); } -Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { - for (DeviceImageTy *Image : LoadedImages) - if (auto Err = callGlobalDestructors(Plugin, *Image)) - return Err; +Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) { + if (auto Err = callGlobalDestructors(Plugin, *Image)) + return Err; if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - for (auto *Image : LoadedImages) { - DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0}; - GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", - sizeof(DeviceMemoryPoolTrackingTy), - &ImageDeviceMemoryPoolTracking); - if (auto Err = - GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) { - consumeError(std::move(Err)); - continue; - } - DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking); + DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 
0, ~0U, 0}; + GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", + sizeof(DeviceMemoryPoolTrackingTy), + &ImageDeviceMemoryPoolTracking); + if (auto Err = + GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) { + consumeError(std::move(Err)); } + DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking); + } + + GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); + auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image); + if (!ProfOrErr) + return ProfOrErr.takeError(); + + if (!ProfOrErr->empty()) { + // Dump out profdata + if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) == + uint32_t(DeviceDebugKind::PGODump)) + ProfOrErr->dump(); + + // Write data to profiling file + if (auto Err = ProfOrErr->write()) + return Err; + } + return unloadBinaryImpl(Image); +} + +Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { + for (auto &I : LoadedImages) + if (auto Err = unloadBinary(I)) + return Err; + LoadedImages.clear(); + + if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { // TODO: Write this by default into a file. printf("\n\n|-----------------------\n" "| Device memory tracker:\n" @@ -856,25 +879,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { DeviceMemoryPoolTracking.AllocationMax); } - for (auto *Image : LoadedImages) { - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image); - if (!ProfOrErr) - return ProfOrErr.takeError(); - - if (ProfOrErr->empty()) - continue; - - // Dump out profdata - if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) == - uint32_t(DeviceDebugKind::PGODump)) - ProfOrErr->dump(); - - // Write data to profiling file - if (auto Err = ProfOrErr->write()) - return Err; - } - // Delete the memory manager before deinitializing the device. Otherwise, // we may delete device allocations after the device is deinitialized. 
if (MemoryManager) diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 9943f533ef5a8..0e662b038c363 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -358,6 +358,19 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + Error unloadBinaryImpl(DeviceImageTy *Image) override { + assert(Context && "Invalid CUDA context"); + + // Each image has its own module. + CUDADeviceImageTy &CUDAImage = static_cast(*Image); + + // Unload the module of the image. + if (auto Err = CUDAImage.unloadModule()) + return Err; + + return Plugin::success(); + } + /// Deinitialize the device and release its resources. Error deinitImpl() override { if (Context) { @@ -372,20 +385,6 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = CUDAEventManager.deinit()) return Err; - // Close modules if necessary. - if (!LoadedImages.empty()) { - assert(Context && "Invalid CUDA context"); - - // Each image has its own module. - for (DeviceImageTy *Image : LoadedImages) { - CUDADeviceImageTy &CUDAImage = static_cast(*Image); - - // Unload the module of the image. 
- if (auto Err = CUDAImage.unloadModule()) - return Err; - } - } - if (Context) { CUresult Res = cuDevicePrimaryCtxRelease(Device); if (auto Err = diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index ced9208acaedc..a35910aece986 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -147,6 +147,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy { /// Initialize the device, which is a no-op Error initImpl(GenericPluginTy &Plugin) override { return Plugin::success(); } + /// Unload the binary image + /// + /// TODO: This currently does nothing, and should be implemented as part of + /// broader memory handling logic for this plugin + Error unloadBinaryImpl(DeviceImageTy *) override { return Plugin::success(); } + /// Deinitialize the device, which is a no-op Error deinitImpl() override { return Plugin::success(); } diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp index 801cd06c95502..051f88c5a0996 100644 --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -70,10 +70,10 @@ static void bectl(kmp_info_t *th, bget_compact_t compact, /* Buffer allocation size quantum: all buffers allocated are a multiple of this size. This MUST be a power of two. */ -/* On IA-32 architecture with Linux* OS, malloc() does not - ensure 16 byte alignment */ +/* On some architectures, malloc() does not ensure 16 byte alignment, + Solaris/sparc and x86 among them. 
*/ -#if KMP_ARCH_X86 || !KMP_HAVE_QUAD +#if KMP_ARCH_X86 || KMP_ARCH_SPARC || !KMP_HAVE_QUAD #define SizeQuant 8 #define AlignType double @@ -1861,7 +1861,7 @@ typedef struct kmp_mem_desc { // Memory block descriptor void *ptr_align; // Pointer to aligned memory, returned kmp_allocator_t *allocator; // allocator } kmp_mem_desc_t; -static int alignment = sizeof(void *); // align to pointer size by default +constexpr size_t alignment = SizeQuant; // external interfaces are wrappers over internal implementation void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index fdbf9ff45e354..3ca32ba583fe2 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -570,7 +570,7 @@ void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, this_thr->th.th_teams_microtask = NULL; this_thr->th.th_teams_level = 0; - *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L; + memset(&this_thr->th.th_teams_size, 0, sizeof(kmp_teams_size_t)); va_end(ap); #if KMP_STATS_ENABLED if (previous_state == stats_state_e::SERIAL_REGION) { diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 0ad14f862bcb9..11fa233c4bd27 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -3242,6 +3242,8 @@ static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) { kmp_uint32 gtid = __kmp_entry_gtid(); kmp_indirect_lock_t *l = __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock"); + if (l == nullptr) + return; // avoid segv if lock already destroyed KMP_I_LOCK_FUNC(l, destroy)(l->lock); kmp_indirect_locktag_t tag = l->type; diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 3d85a29423540..d7bc4922d54f7 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -1528,7 +1528,7 @@ kmp_task_t 
*__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, // Calculate shared structure offset including padding after kmp_task_t struct // to align pointers in shared struct shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t; - shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *)); + shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64)); // Allocate a kmp_taskdata_t block and a kmp_task_t block. KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid, diff --git a/openmp/runtime/test/ompt/misc/lock_double_destroy.cpp b/openmp/runtime/test/ompt/misc/lock_double_destroy.cpp new file mode 100644 index 0000000000000..bbdf348e97e7c --- /dev/null +++ b/openmp/runtime/test/ompt/misc/lock_double_destroy.cpp @@ -0,0 +1,40 @@ +// RUN: %libomp-cxx-compile-and-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include "omp_testsuite.h" + +// tests that the destructor doesn't segv even though +// ompt_finalize_tool() destroys the lock +struct myLock { + omp_lock_t lock; + myLock() { omp_init_lock(&lock); } + ~myLock() { omp_destroy_lock(&lock); } +}; + +myLock lock; + +int main() { + go_parallel_nthreads(2); + + printf("Before ompt_finalize_tool\n"); + ompt_finalize_tool(); + printf("After ompt_finalize_tool\n"); + + return get_exit_value(); +} + +// CHECK: 0: NULL_POINTER=[[NULL:.*$]] +// CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_initial=1 + +// CHECK: {{^}}[[THREAD_ID]]: ompt_event_init_lock + +// CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin +// CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_end + +// CHECK: {{^}}Before ompt_finalize_tool + +// CHECK: {{^}}[[THREAD_ID]]: ompt_event_thread_end: thread_id=[[THREAD_ID]] +// CHECK: 0: ompt_event_runtime_shutdown + +// CHECK: {{^}}After ompt_finalize_tool diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 
02cc04fa4f7b6..6f65cfca32943 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -700,6 +700,7 @@ cc_library( ":diagnostic_defs_gen", ":sema_attr_gen", ":support", + "//llvm:BinaryFormat", "//llvm:Core", "//llvm:FrontendDebug", "//llvm:FrontendDriver", diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl index 7cb4b7e9ffe75..ba9db05c651a7 100644 --- a/utils/bazel/llvm-project-overlay/llvm/config.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl @@ -32,6 +32,7 @@ posix_defines = [ "BACKTRACE_HEADER=", r'LTDL_SHLIB_EXT=\".so\"', r'LLVM_PLUGIN_EXT=\".so\"', + "LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS=1", "LLVM_ENABLE_PLUGINS=1", "LLVM_ENABLE_THREADS=1", "HAVE_DEREGISTER_FRAME=1", diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h index 8a9c74d67b124..31d0dc57a7180 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h @@ -114,6 +114,9 @@ /* Define if building LLVM with BUILD_SHARED_LIBS */ /* #undef LLVM_BUILD_SHARED_LIBS */ +/* Define if exporting LLVM public interface for shared library */ +/* LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS defined in Bazel */ + /* Define if building LLVM with LLVM_FORCE_USE_OLD_TOOLCHAIN_LIBS */ /* #undef LLVM_FORCE_USE_OLD_TOOLCHAIN ${LLVM_FORCE_USE_OLD_TOOLCHAIN} */ diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index a0b72b9709695..41720f132a9dd 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10168,6 +10168,7 @@ td_library( ]), includes = ["include"], deps = [ + ":LinalgOpsTdFiles", ":SCFTdFiles", ":TransformDialectTdFiles", ], diff --git 
a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake index a0ad517a6ecf4..0d8db0b6b52f1 100644 --- a/utils/bazel/llvm_configs/llvm-config.h.cmake +++ b/utils/bazel/llvm_configs/llvm-config.h.cmake @@ -110,6 +110,9 @@ /* Define if building LLVM with BUILD_SHARED_LIBS */ #cmakedefine LLVM_BUILD_SHARED_LIBS +/* Define if exporting LLVM public interface for shared library */ +#cmakedefine LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS + /* Define if building LLVM with LLVM_FORCE_USE_OLD_TOOLCHAIN_LIBS */ #cmakedefine LLVM_FORCE_USE_OLD_TOOLCHAIN ${LLVM_FORCE_USE_OLD_TOOLCHAIN}