From 626cd48200010b5bbd68b94e76fa9bdecb2a096d Mon Sep 17 00:00:00 2001
From: Hans Johnson <hans-johnson@uiowa.edu>
Date: Fri, 1 May 2026 13:18:35 -0500
Subject: [PATCH 1/6] ENH: Bump Eigen3 import tag to for/itk-20260501-879885e1

Phase 2 prep for Eigen 5.x update.  Points UpdateFromUpstream.sh at the
refreshed InsightSoftwareConsortium/eigen tag derived from gitlab
libeigen/eigen master tip 879885e (2026-05-01), which adds the
"Modernize internal utilities for C++14" patch (libeigen/eigen!2490)
on top of the 5.0.1 release tip plus three small follow-on fixes
(RealQZ pushDownZero counting, small-determinant LU fastpath, and
IncompleteLUT static row matching).

Adds new top-level support headers introduced since the previous import:
AccelerateSupport, KLUSupport, ThreadPool, and Version.  The Version
header is required because Eigen 5 moved EIGEN_{MAJOR,MINOR,PATCH}_VERSION
out of src/Core/util/Macros.h and into the new top-level Eigen/Version header.
---
 Modules/ThirdParty/Eigen3/UpdateFromUpstream.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/Modules/ThirdParty/Eigen3/UpdateFromUpstream.sh b/Modules/ThirdParty/Eigen3/UpdateFromUpstream.sh
index 9a4d2cfaa1a..4ec0799d392 100755
--- a/Modules/ThirdParty/Eigen3/UpdateFromUpstream.sh
+++ b/Modules/ThirdParty/Eigen3/UpdateFromUpstream.sh
@@ -6,10 +6,16 @@ shopt -s dotglob
 
 readonly name="Eigen3"
 readonly ownership="Eigen Upstream <kwrobot@kitware.com>"
+# ITK has applied a small number of post-import patches under itkeigen/
+# (e.g., SelfadjointMatrixVector.h pzero init), so the tree no longer
+# bytewise matches the previous import commit's tree. Use log-based
+# matching instead of exact-tree matching for the previous-import probe.
+exact_tree_match=false
 readonly subtree="Modules/ThirdParty/Eigen3/src/itkeigen"
 readonly repo="https://github.com/InsightSoftwareConsortium/eigen"
-readonly tag="for/itk-20260305-4c99fca"
+readonly tag="for/itk-20260501-879885e1"
 readonly paths="
+Eigen/AccelerateSupport
 Eigen/Cholesky
 Eigen/CholmodSupport
 Eigen/Core
@@ -20,6 +26,7 @@ Eigen/Geometry
 Eigen/Householder
 Eigen/IterativeLinearSolvers
 Eigen/Jacobi
+Eigen/KLUSupport
 Eigen/LU
 Eigen/MetisSupport
 Eigen/OrderingMethods
@@ -38,7 +45,9 @@ Eigen/StdDeque
 Eigen/StdList
 Eigen/StdVector
 Eigen/SuperLUSupport
+Eigen/ThreadPool
 Eigen/UmfPackSupport
+Eigen/Version
 Eigen/src
 
 COPYING.BSD

From bc87d16685adbadd975a619beef26a030a53b922 Mon Sep 17 00:00:00 2001
From: Eigen Upstream <kwrobot@kitware.com>
Date: Fri, 1 May 2026 13:12:39 -0500
Subject: [PATCH 2/6] Eigen3 2026-05-01 (505023a2)

Code extracted from:

    https://github.com/InsightSoftwareConsortium/eigen

at commit 505023a2a0fc8eeff5f92d07683ab5ff0c03ba0f (for/itk-20260501-879885e1).
---
 .gitattributes                                |    6 +
 CMakeLists.txt                                |  234 +-
 COPYING.MPL2                                  |    2 +-
 COPYING.README                                |   18 +-
 Eigen/AccelerateSupport                       |   52 +
 Eigen/Cholesky                                |    2 -
 Eigen/CholmodSupport                          |    2 +-
 Eigen/Core                                    |  145 +-
 Eigen/Dense                                   |   12 +
 Eigen/Eigen                                   |   12 +
 Eigen/Eigenvalues                             |   10 +-
 Eigen/Geometry                                |    4 +-
 Eigen/Householder                             |    2 +-
 Eigen/KLUSupport                              |   43 +
 Eigen/LU                                      |    7 +-
 Eigen/PaStiXSupport                           |    2 +-
 Eigen/QR                                      |    5 +-
 Eigen/QtAlignedMalloc                         |    6 +-
 Eigen/SPQRSupport                             |    2 +-
 Eigen/SVD                                     |    7 +-
 Eigen/SparseCore                              |    4 -
 Eigen/SparseQR                                |    2 +-
 Eigen/SuperLUSupport                          |    1 +
 Eigen/ThreadPool                              |   80 +
 Eigen/UmfPackSupport                          |    2 +-
 Eigen/Version                                 |   21 +
 .../src/AccelerateSupport/AccelerateSupport.h |    2 +-
 Eigen/src/Cholesky/LDLT.h                     |   45 +-
 Eigen/src/Cholesky/LLT.h                      |   31 +-
 Eigen/src/CholmodSupport/CholmodSupport.h     |    4 +-
 Eigen/src/Core/ArithmeticSequence.h           |   10 +-
 Eigen/src/Core/Array.h                        |   23 +-
 Eigen/src/Core/ArrayBase.h                    |   11 +-
 Eigen/src/Core/ArrayWrapper.h                 |   13 +-
 Eigen/src/Core/Assign.h                       |   18 +-
 Eigen/src/Core/AssignEvaluator.h              |  125 +-
 Eigen/src/Core/Assign_AOCL.h                  |  301 +
 Eigen/src/Core/Assign_MKL.h                   |    8 +-
 Eigen/src/Core/Block.h                        |   45 +-
 Eigen/src/Core/CommaInitializer.h             |    3 +-
 Eigen/src/Core/ConcatOp.h                     |  343 +
 Eigen/src/Core/ConditionEstimator.h           |   63 +-
 Eigen/src/Core/CoreEvaluators.h               |  573 +-
 Eigen/src/Core/CoreIterators.h                |    2 +-
 Eigen/src/Core/CwiseBinaryOp.h                |   20 +-
 Eigen/src/Core/CwiseNullaryOp.h               |   28 +-
 Eigen/src/Core/CwiseTernaryOp.h               |   12 +-
 Eigen/src/Core/CwiseUnaryOp.h                 |   16 +-
 Eigen/src/Core/CwiseUnaryView.h               |   15 +-
 Eigen/src/Core/DenseBase.h                    |  131 +-
 Eigen/src/Core/DenseCoeffsBase.h              |   61 +-
 Eigen/src/Core/DenseStorage.h                 |  290 +-
 Eigen/src/Core/DeviceWrapper.h                |    4 +-
 Eigen/src/Core/Diagonal.h                     |   40 +-
 Eigen/src/Core/DiagonalMatrix.h               |   85 +-
 Eigen/src/Core/Dot.h                          |   15 +-
 Eigen/src/Core/EigenBase.h                    |   18 +-
 Eigen/src/Core/Fill.h                         |   32 +-
 Eigen/src/Core/FindCoeff.h                    |   16 +-
 Eigen/src/Core/ForceAlignedAccess.h           |   21 +-
 Eigen/src/Core/Fuzzy.h                        |   12 +-
 Eigen/src/Core/GeneralProduct.h               |   29 +-
 Eigen/src/Core/GenericPacketMath.h            |  181 +-
 Eigen/src/Core/GlobalFunctions.h              |    8 +-
 Eigen/src/Core/IO.h                           |    2 +-
 Eigen/src/Core/IndexedView.h                  |   19 +-
 Eigen/src/Core/InnerProduct.h                 |   57 +-
 Eigen/src/Core/Inverse.h                      |    4 +-
 Eigen/src/Core/Map.h                          |    9 +-
 Eigen/src/Core/MapBase.h                      |   24 +-
 Eigen/src/Core/MathFunctions.h                |  224 +-
 Eigen/src/Core/MathFunctionsImpl.h            |   27 +-
 Eigen/src/Core/Matrix.h                       |   75 +-
 Eigen/src/Core/MatrixBase.h                   |  101 +-
 Eigen/src/Core/NestByValue.h                  |   16 +-
 Eigen/src/Core/NoAlias.h                      |    4 +-
 Eigen/src/Core/NumTraits.h                    |   18 +-
 Eigen/src/Core/PartialReduxEvaluator.h        |   51 +-
 Eigen/src/Core/PermutationMatrix.h            |   21 +-
 Eigen/src/Core/PlainObjectBase.h              |   79 +-
 Eigen/src/Core/Product.h                      |   10 +-
 Eigen/src/Core/ProductEvaluators.h            |  448 +-
 Eigen/src/Core/Random.h                       |    4 +-
 Eigen/src/Core/RandomImpl.h                   |   26 +-
 Eigen/src/Core/RealView.h                     |  292 +
 Eigen/src/Core/Redux.h                        |   20 +-
 Eigen/src/Core/Ref.h                          |   25 +-
 Eigen/src/Core/Replicate.h                    |   10 +-
 Eigen/src/Core/Reshaped.h                     |   87 +-
 Eigen/src/Core/ReturnByValue.h                |    4 +-
 Eigen/src/Core/Reverse.h                      |    4 +-
 Eigen/src/Core/Select.h                       |   98 +-
 Eigen/src/Core/SelfAdjointView.h              |   52 +-
 Eigen/src/Core/SelfCwiseBinaryOp.h            |   28 +-
 Eigen/src/Core/SkewSymmetricMatrix3.h         |   59 +-
 Eigen/src/Core/Solve.h                        |    4 +-
 Eigen/src/Core/SolveTriangular.h              |   26 +-
 Eigen/src/Core/SolverBase.h                   |    6 +-
 Eigen/src/Core/StableNorm.h                   |    3 +-
 Eigen/src/Core/StlIterators.h                 |   15 +-
 Eigen/src/Core/Stride.h                       |   19 +-
 Eigen/src/Core/StructuredBindings.h           |  155 +
 Eigen/src/Core/Swap.h                         |   11 +-
 Eigen/src/Core/Transpose.h                    |   29 +-
 Eigen/src/Core/TriangularMatrix.h             |   85 +-
 Eigen/src/Core/VectorBlock.h                  |    4 +-
 Eigen/src/Core/VectorwiseOp.h                 |   36 +-
 Eigen/src/Core/Visitor.h                      |   14 +-
 Eigen/src/Core/arch/AVX/Complex.h             |   52 +-
 Eigen/src/Core/arch/AVX/MathFunctions.h       |   54 +-
 Eigen/src/Core/arch/AVX/PacketMath.h          |   86 +-
 Eigen/src/Core/arch/AVX/TypeCasting.h         |   44 +-
 Eigen/src/Core/arch/AVX512/Complex.h          |   38 +-
 Eigen/src/Core/arch/AVX512/GemmKernel.h       |  280 +-
 Eigen/src/Core/arch/AVX512/MathFunctions.h    |   26 +-
 Eigen/src/Core/arch/AVX512/PacketMath.h       |  148 +-
 Eigen/src/Core/arch/AVX512/PacketMathFP16.h   |   16 +-
 Eigen/src/Core/arch/AVX512/Reductions.h       |    2 +-
 Eigen/src/Core/arch/AVX512/TrsmKernel.h       |  106 +-
 Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc    |   12 +-
 Eigen/src/Core/arch/AVX512/TypeCasting.h      |   12 +-
 Eigen/src/Core/arch/AltiVec/Complex.h         |   39 +-
 .../src/Core/arch/AltiVec/MatrixProductMMA.h  |    5 +-
 .../Core/arch/AltiVec/MatrixVectorProduct.inc |   14 +-
 Eigen/src/Core/arch/AltiVec/PacketMath.h      |  101 +-
 Eigen/src/Core/arch/AltiVec/TypeCasting.h     |    2 +-
 Eigen/src/Core/arch/Default/BFloat16.h        |   65 +-
 Eigen/src/Core/arch/Default/ConjHelper.h      |   18 +
 .../arch/Default/GenericPacketMathComplex.h   |  283 +
 .../Default/GenericPacketMathDoubleWord.h     |  208 +
 .../Default/GenericPacketMathFrexpLdexp.h     |  162 +
 .../arch/Default/GenericPacketMathFunctions.h | 2575 +----
 .../Default/GenericPacketMathFunctionsFwd.h   |  131 +-
 .../Default/GenericPacketMathPolynomials.h    |  151 +
 .../Core/arch/Default/GenericPacketMathPow.h  |  724 ++
 .../Core/arch/Default/GenericPacketMathTrig.h | 1067 ++
 Eigen/src/Core/arch/Default/Half.h            |  180 +-
 Eigen/src/Core/arch/GPU/Complex.h             |   58 +-
 Eigen/src/Core/arch/GPU/PacketMath.h          |  455 +-
 Eigen/src/Core/arch/GPU/Tuple.h               |   40 +-
 Eigen/src/Core/arch/GPU/TypeCasting.h         |    3 +-
 Eigen/src/Core/arch/HVX/PacketMath.h          |   47 +-
 Eigen/src/Core/arch/LSX/Complex.h             |   28 +-
 Eigen/src/Core/arch/LSX/PacketMath.h          |  102 +-
 Eigen/src/Core/arch/LSX/TypeCasting.h         |  318 +-
 Eigen/src/Core/arch/MSA/Complex.h             |   15 +-
 Eigen/src/Core/arch/MSA/PacketMath.h          |   92 +-
 Eigen/src/Core/arch/NEON/Complex.h            |  122 +-
 Eigen/src/Core/arch/NEON/MathFunctions.h      |    7 +-
 Eigen/src/Core/arch/NEON/PacketMath.h         |  472 +-
 Eigen/src/Core/arch/NEON/TypeCasting.h        |    2 +-
 .../Core/arch/RVV10/GeneralBlockPanelKernel.h |  236 +
 Eigen/src/Core/arch/RVV10/MathFunctions.h     |   30 +
 Eigen/src/Core/arch/RVV10/PacketMath.h        | 2442 ++++
 Eigen/src/Core/arch/RVV10/PacketMath2.h       | 1527 +++
 Eigen/src/Core/arch/RVV10/PacketMath4.h       | 1462 +++
 Eigen/src/Core/arch/RVV10/PacketMathBF16.h    |  838 ++
 Eigen/src/Core/arch/RVV10/PacketMathFP16.h    |  998 ++
 Eigen/src/Core/arch/RVV10/TypeCasting.h       |  284 +
 Eigen/src/Core/arch/SSE/Complex.h             |   41 +-
 Eigen/src/Core/arch/SSE/PacketMath.h          |  326 +-
 Eigen/src/Core/arch/SSE/TypeCasting.h         |   50 -
 Eigen/src/Core/arch/SVE/MathFunctions.h       |   26 +-
 Eigen/src/Core/arch/SVE/PacketMath.h          |   30 +-
 Eigen/src/Core/arch/SYCL/InteropHeaders.h     |   41 +-
 Eigen/src/Core/arch/SYCL/MathFunctions.h      |  314 +-
 Eigen/src/Core/arch/SYCL/PacketMath.h         |   25 -
 Eigen/src/Core/arch/ZVector/Complex.h         |   38 +-
 Eigen/src/Core/arch/ZVector/MathFunctions.h   |   16 +-
 Eigen/src/Core/arch/ZVector/PacketMath.h      |  244 +-
 Eigen/src/Core/arch/clang/Complex.h           |  702 ++
 Eigen/src/Core/arch/clang/MathFunctions.h     |   47 +
 Eigen/src/Core/arch/clang/PacketMath.h        | 1171 ++
 Eigen/src/Core/arch/clang/Reductions.h        |  355 +
 Eigen/src/Core/arch/clang/TypeCasting.h       |  186 +
 Eigen/src/Core/functors/AssignmentFunctors.h  |    8 +-
 Eigen/src/Core/functors/BinaryFunctors.h      |  130 +-
 Eigen/src/Core/functors/NullaryFunctors.h     |   34 +-
 Eigen/src/Core/functors/TernaryFunctors.h     |    7 +-
 Eigen/src/Core/functors/UnaryFunctors.h       |  215 +-
 .../Core/products/GeneralBlockPanelKernel.h   | 1729 +--
 Eigen/src/Core/products/GeneralMatrixMatrix.h |    7 +-
 .../products/GeneralMatrixMatrixTriangular.h  |    4 +-
 .../GeneralMatrixMatrixTriangular_BLAS.h      |    6 +-
 .../Core/products/GeneralMatrixMatrix_BLAS.h  |   19 +-
 Eigen/src/Core/products/GeneralMatrixVector.h |  427 +-
 .../Core/products/GeneralMatrixVector_BLAS.h  |    2 +
 Eigen/src/Core/products/Parallelizer.h        |    6 +-
 .../Core/products/SelfadjointMatrixMatrix.h   |    6 +-
 .../products/SelfadjointMatrixMatrix_BLAS.h   |    4 +
 .../Core/products/SelfadjointMatrixVector.h   |  326 +-
 .../products/SelfadjointMatrixVector_BLAS.h   |    2 +
 Eigen/src/Core/products/SelfadjointProduct.h  |  108 +-
 .../Core/products/SelfadjointRank2Update.h    |  225 +-
 .../Core/products/TriangularMatrixMatrix.h    |   24 -
 .../products/TriangularMatrixMatrix_BLAS.h    |    4 +
 .../Core/products/TriangularMatrixVector.h    |  154 +-
 .../products/TriangularMatrixVector_BLAS.h    |    3 +
 .../Core/products/TriangularSolverMatrix.h    |   82 +-
 .../products/TriangularSolverMatrix_BLAS.h    |    2 +
 Eigen/src/Core/util/AOCL_Support.h            |  174 +
 Eigen/src/Core/util/BlasUtil.h                |   66 +-
 Eigen/src/Core/util/ConfigureVectorization.h  |   94 +-
 Eigen/src/Core/util/Constants.h               |   50 +-
 Eigen/src/Core/util/DisableStupidWarnings.h   |    3 +-
 Eigen/src/Core/util/EmulateArray.h            |   25 +-
 Eigen/src/Core/util/ForwardDeclarations.h     |   11 +-
 Eigen/src/Core/util/GpuHipCudaDefines.inc     |    4 +-
 Eigen/src/Core/util/IndexedViewHelper.h       |   49 +-
 Eigen/src/Core/util/IntegralConstant.h        |   10 +-
 Eigen/src/Core/util/Macros.h                  |  195 +-
 Eigen/src/Core/util/MaxSizeVector.h           |   81 +-
 Eigen/src/Core/util/Memory.h                  |  148 +-
 Eigen/src/Core/util/Meta.h                    |  259 +-
 Eigen/src/Core/util/MoreMeta.h                |  115 +-
 Eigen/src/Core/util/Serializer.h              |    9 +-
 Eigen/src/Core/util/SymbolicIndex.h           |   12 +-
 Eigen/src/Core/util/XprHelper.h               |  296 +-
 Eigen/src/Eigenvalues/ComplexEigenSolver.h    |    7 +-
 Eigen/src/Eigenvalues/ComplexQZ.h             |  651 ++
 Eigen/src/Eigenvalues/ComplexSchur.h          |    7 +-
 Eigen/src/Eigenvalues/EigenSolver.h           |    2 +-
 .../src/Eigenvalues/HessenbergDecomposition.h |    2 +-
 Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h |    3 +-
 Eigen/src/Eigenvalues/RealQZ.h                |    4 +-
 Eigen/src/Eigenvalues/RealSchur.h             |   10 +-
 .../src/Eigenvalues/SelfAdjointEigenSolver.h  |  152 +-
 Eigen/src/Eigenvalues/Tridiagonalization.h    |  164 +-
 Eigen/src/Geometry/AngleAxis.h                |   76 +-
 Eigen/src/Geometry/EulerAngles.h              |   46 +-
 Eigen/src/Geometry/Homogeneous.h              |    8 +-
 Eigen/src/Geometry/Hyperplane.h               |    2 +-
 Eigen/src/Geometry/OrthoMethods.h             |    6 +-
 Eigen/src/Geometry/Quaternion.h               |   28 +-
 Eigen/src/Geometry/Rotation2D.h               |    4 +-
 Eigen/src/Geometry/Scaling.h                  |    2 +-
 Eigen/src/Geometry/Transform.h                |   13 +-
 Eigen/src/Geometry/Umeyama.h                  |    7 +
 Eigen/src/Householder/BlockHouseholder.h      |   74 +-
 Eigen/src/Householder/Householder.h           |   37 +-
 Eigen/src/Householder/HouseholderSequence.h   |   50 +-
 .../BasicPreconditioners.h                    |    8 +-
 Eigen/src/IterativeLinearSolvers/BiCGSTAB.h   |    5 +-
 .../ConjugateGradient.h                       |    1 +
 .../IncompleteCholesky.h                      |    2 +-
 .../IterativeLinearSolvers/IncompleteLUT.h    |  265 +-
 .../LeastSquareConjugateGradient.h            |    1 +
 Eigen/src/Jacobi/Jacobi.h                     |   65 +-
 Eigen/src/KLUSupport/KLUSupport.h             |   86 +-
 Eigen/src/LU/Determinant.h                    |    5 +-
 Eigen/src/LU/FullPivLU.h                      |  180 +-
 Eigen/src/LU/InverseImpl.h                    |    7 +-
 Eigen/src/LU/PartialPivLU.h                   |   56 +-
 Eigen/src/MetisSupport/MetisSupport.h         |    2 +-
 Eigen/src/OrderingMethods/Eigen_Colamd.h      |    2 +-
 Eigen/src/OrderingMethods/Ordering.h          |   10 +-
 Eigen/src/PardisoSupport/PardisoSupport.h     |    7 +-
 Eigen/src/QR/ColPivHouseholderQR.h            |  163 +-
 Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h    |    4 +-
 .../src/QR/CompleteOrthogonalDecomposition.h  |    8 +-
 Eigen/src/QR/FullPivHouseholderQR.h           |  178 +-
 Eigen/src/QR/HouseholderQR.h                  |   24 +-
 Eigen/src/SPQRSupport/SuiteSparseQRSupport.h  |  139 +-
 Eigen/src/SVD/BDCSVD.h                        | 1243 +--
 Eigen/src/SVD/BDCSVDImpl.h                    |  821 ++
 Eigen/src/SVD/JacobiSVD.h                     |  353 +-
 Eigen/src/SVD/SVDBase.h                       |    2 +-
 Eigen/src/SVD/UpperBidiagonalization.h        |   33 +-
 Eigen/src/SparseCholesky/SimplicialCholesky.h |   25 +-
 .../SparseCholesky/SimplicialCholesky_impl.h  |    7 +-
 Eigen/src/SparseCore/AmbiVector.h             |    6 +-
 Eigen/src/SparseCore/CompressedStorage.h      |    7 +-
 .../ConservativeSparseSparseProduct.h         |   15 +-
 Eigen/src/SparseCore/SparseAssign.h           |    2 +-
 Eigen/src/SparseCore/SparseBlock.h            |   30 +-
 Eigen/src/SparseCore/SparseCompressedBase.h   |   29 +-
 Eigen/src/SparseCore/SparseCwiseBinaryOp.h    |    4 +-
 Eigen/src/SparseCore/SparseDenseProduct.h     |  247 +-
 Eigen/src/SparseCore/SparseDiagonalProduct.h  |   12 +-
 Eigen/src/SparseCore/SparseDot.h              |    6 +-
 Eigen/src/SparseCore/SparseMap.h              |   15 +-
 Eigen/src/SparseCore/SparseMatrix.h           |   37 +-
 Eigen/src/SparseCore/SparseMatrixBase.h       |   67 +-
 Eigen/src/SparseCore/SparsePermutation.h      |    2 +-
 Eigen/src/SparseCore/SparseRef.h              |   21 +-
 Eigen/src/SparseCore/SparseSelfAdjointView.h  |   19 +-
 Eigen/src/SparseCore/SparseSolverBase.h       |   13 +-
 .../SparseSparseProductWithPruning.h          |   13 +-
 Eigen/src/SparseCore/SparseUtil.h             |    2 +-
 Eigen/src/SparseCore/SparseVector.h           |   42 +-
 Eigen/src/SparseCore/SparseView.h             |    2 +-
 Eigen/src/SparseCore/TriangularSolver.h       |   43 +-
 Eigen/src/SparseLU/SparseLU.h                 |   70 +-
 Eigen/src/SparseLU/SparseLU_Memory.h          |    2 +-
 .../src/SparseLU/SparseLU_SupernodalMatrix.h  |    6 +-
 Eigen/src/SparseLU/SparseLU_column_bmod.h     |    2 +-
 .../src/SparseLU/SparseLU_heap_relax_snode.h  |    2 +-
 Eigen/src/SparseLU/SparseLU_panel_bmod.h      |    6 +-
 Eigen/src/SparseLU/SparseLU_panel_dfs.h       |    5 +-
 Eigen/src/SparseQR/SparseQR.h                 |   41 +-
 Eigen/src/SuperLUSupport/SuperLUSupport.h     |   39 +-
 Eigen/src/ThreadPool/Barrier.h                |    2 +-
 Eigen/src/ThreadPool/InternalHeaderCheck.h    |    3 +-
 Eigen/src/ThreadPool/NonBlockingThreadPool.h  |    2 +-
 Eigen/src/ThreadPool/RunQueue.h               |   10 +-
 Eigen/src/ThreadPool/ThreadLocal.h            |    5 +-
 Eigen/src/UmfPackSupport/UmfPackSupport.h     |    6 +-
 Eigen/src/misc/RankRevealingBase.h            |  178 +
 Eigen/src/misc/RealSvd2x2.h                   |   53 -
 Eigen/src/misc/lapacke.h                      | 9912 +----------------
 Eigen/src/plugins/ArrayCwiseBinaryOps.inc     |   48 +-
 Eigen/src/plugins/ArrayCwiseUnaryOps.inc      |  125 +-
 Eigen/src/plugins/BlockMethods.inc            |  199 +-
 Eigen/src/plugins/CommonCwiseBinaryOps.inc    |   20 +-
 Eigen/src/plugins/CommonCwiseUnaryOps.inc     |   22 +-
 Eigen/src/plugins/MatrixCwiseBinaryOps.inc    |   68 +-
 Eigen/src/plugins/MatrixCwiseUnaryOps.inc     |   31 +-
 Eigen/src/plugins/ReshapedMethods.inc         |   18 +-
 README.md                                     |    2 +-
 cmake/Eigen3Config.cmake.in                   |    6 +-
 320 files changed, 25723 insertions(+), 21419 deletions(-)
 create mode 100644 Eigen/AccelerateSupport
 create mode 100644 Eigen/KLUSupport
 create mode 100644 Eigen/ThreadPool
 create mode 100644 Eigen/Version
 create mode 100644 Eigen/src/Core/Assign_AOCL.h
 create mode 100644 Eigen/src/Core/ConcatOp.h
 create mode 100644 Eigen/src/Core/RealView.h
 create mode 100644 Eigen/src/Core/StructuredBindings.h
 create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathComplex.h
 create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathDoubleWord.h
 create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathFrexpLdexp.h
 create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathPolynomials.h
 create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathPow.h
 create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
 create mode 100644 Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h
 create mode 100644 Eigen/src/Core/arch/RVV10/MathFunctions.h
 create mode 100644 Eigen/src/Core/arch/RVV10/PacketMath.h
 create mode 100644 Eigen/src/Core/arch/RVV10/PacketMath2.h
 create mode 100644 Eigen/src/Core/arch/RVV10/PacketMath4.h
 create mode 100644 Eigen/src/Core/arch/RVV10/PacketMathBF16.h
 create mode 100644 Eigen/src/Core/arch/RVV10/PacketMathFP16.h
 create mode 100644 Eigen/src/Core/arch/RVV10/TypeCasting.h
 create mode 100644 Eigen/src/Core/arch/clang/Complex.h
 create mode 100644 Eigen/src/Core/arch/clang/MathFunctions.h
 create mode 100644 Eigen/src/Core/arch/clang/PacketMath.h
 create mode 100644 Eigen/src/Core/arch/clang/Reductions.h
 create mode 100644 Eigen/src/Core/arch/clang/TypeCasting.h
 create mode 100644 Eigen/src/Core/util/AOCL_Support.h
 create mode 100644 Eigen/src/Eigenvalues/ComplexQZ.h
 create mode 100644 Eigen/src/SVD/BDCSVDImpl.h
 create mode 100644 Eigen/src/misc/RankRevealingBase.h
 delete mode 100644 Eigen/src/misc/RealSvd2x2.h

diff --git a/.gitattributes b/.gitattributes
index 3d370f28b2a..830efe4baa4 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,9 @@
+*.sh                         eol=lf
+debug/msvc/*.dat             eol=crlf
+debug/msvc/*.natvis          eol=crlf
+
+# ITK fork: relax content-checks for the largest Eigen header so
+# UpdateFromUpstream.sh / KWStyle hooks do not reject the import.
 * -whitespace
 Eigen/src/misc/lapacke.h hooks-max-size=1500000
 Eigen/src/misc/lapacke.h hooks.MaxObjectKiB=2048
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4bd238866f4..80494f4445d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,8 @@ project(Eigen3)
 # ITK doesn't compile anything here, just generates targets for the INTERFACE library.
 if(FALSE)
 
+cmake_minimum_required(VERSION 3.10.0)
+
 #==============================================================================
 # CMake Policy issues.
 #==============================================================================
@@ -40,10 +42,17 @@ if (POLICY CMP0177)
   cmake_policy(SET CMP0177 NEW)
 endif ()
 
+# Respect <PackageName>_ROOT variables.
+if (POLICY CMP0074)
+  cmake_policy(SET CMP0074 NEW)
+endif ()
+
 #==============================================================================
 # CMake Project.
 #==============================================================================
 
+project(Eigen3)
+
 # Remove this block after bumping CMake to v3.21.0
 # PROJECT_IS_TOP_LEVEL is defined then by default
 if(CMAKE_VERSION VERSION_LESS 3.21.0)
@@ -67,19 +76,12 @@ option(EIGEN_LEAVE_TEST_IN_ALL_TARGET "Leaves tests in the all target, needed by
 option(EIGEN_BUILD_BLAS "Toggles the building of the Eigen Blas library" ${PROJECT_IS_TOP_LEVEL})
 option(EIGEN_BUILD_LAPACK "Toggles the building of the included Eigen LAPACK library" ${PROJECT_IS_TOP_LEVEL})
 if (EIGEN_BUILD_BLAS OR EIGEN_BUILD_LAPACK)
-  # BLAS and LAPACK currently need a fortran compiler.
-  include(CMakeDetermineFortranCompiler)
-  if (NOT CMAKE_Fortran_COMPILER)
-    set(EIGEN_BUILD_BLAS OFF)
-    set(EIGEN_BUILD_LAPACK OFF)
-  else()
-    # Determine if we should build shared libraries for BLAS/LAPACK on this platform.
+  # Determine if we should build shared libraries for BLAS/LAPACK on this platform.
+  if (NOT EIGEN_BUILD_SHARED_LIBS)
     get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS)
   endif()
 endif()
 
-option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
-option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF)
 # Avoid building docs if included from another project.
 # Building documentation requires creating and running executables on the host
 # platform.  We shouldn't do this if cross-compiling.
@@ -96,7 +98,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
 endif()
 option(EIGEN_BUILD_CMAKE_PACKAGE "Enables the creation of EigenConfig.cmake and related files" ${PROJECT_IS_TOP_LEVEL})
 
-if (EIGEN_BUILD_TESTING OR EIGEN_BUILD_BLAS OR EIGEN_BUILD_LAPACK OR EIGEN_BUILT_BTL OR EIGEN_BUILD_BTL OR EIGEN_BUILD_SPBENCH OR EIGEN_BUILD_DOC OR EIGEN_BUILD_DEMOS)
+if (EIGEN_BUILD_TESTING OR EIGEN_BUILD_BLAS OR EIGEN_BUILD_LAPACK OR EIGEN_BUILD_DOC OR EIGEN_BUILD_DEMOS)
   set(EIGEN_IS_BUILDING_ ON)
 endif()
 
@@ -104,15 +106,28 @@ endif()
 # Version Info.
 #==============================================================================
 
-# Automatically parse the version number from header files.
-file(READ "${PROJECT_SOURCE_DIR}/Eigen/src/Core/util/Macros.h" _eigen_version_header)
-string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen_world_version_match "${_eigen_version_header}")
-set(EIGEN_WORLD_VERSION "${CMAKE_MATCH_1}")
-string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen_major_version_match "${_eigen_version_header}")
-set(EIGEN_MAJOR_VERSION "${CMAKE_MATCH_1}")
-string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_version_match "${_eigen_version_header}")
-set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
-set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
+# If version information is not provided, automatically parse the version number
+# from header files.
+file(READ "${PROJECT_SOURCE_DIR}/Eigen/Version" _eigen_version_header)
+if (NOT DEFINED EIGEN_WORLD_VERSION)
+  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen_world_version_match "${_eigen_version_header}")
+  set(EIGEN_WORLD_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
+endif()
+if (NOT DEFINED EIGEN_MAJOR_VERSION)
+  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen_major_version_match "${_eigen_version_header}")
+  set(EIGEN_MAJOR_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
+endif()
+if (NOT DEFINED EIGEN_MINOR_VERSION)
+  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_version_match "${_eigen_version_header}")
+  set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
+endif()
+if (NOT DEFINED EIGEN_PATCH_VERSION)
+  string(REGEX MATCH "define[ \t]+EIGEN_PATCH_VERSION[ \t]+([0-9]+)" _eigen_patch_version_match "${_eigen_version_header}")
+  set(EIGEN_PATCH_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
+endif()
+if (NOT DEFINED EIGEN_PRERELEASE_VERSION)
+  set(EIGEN_PRERELEASE_VERSION "dev")
+endif()
 
 # If we are in a git repo, extract a changeset.
 if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
@@ -123,16 +138,32 @@ endif()
 
 # extract the git rev number from the git output...
 if(EIGEN_GIT_OUTPUT)
-string(REGEX MATCH "^([0-9;a-f]+).*" EIGEN_GIT_CHANGESET_MATCH "${EIGEN_GIT_OUTPUT}")
-set(EIGEN_GIT_REVNUM "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "^([0-9;a-f]+).*" EIGEN_GIT_CHANGESET_MATCH "${EIGEN_GIT_OUTPUT}")
+  set(EIGEN_GIT_REVNUM "${CMAKE_MATCH_1}")
 endif()
-#...and show it next to the version number
-if(EIGEN_GIT_REVNUM)
-  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (git rev ${EIGEN_GIT_REVNUM})")
+
+if (NOT DEFINED EIGEN_BUILD_VERSION AND DEFINED EIGEN_GIT_REVNUM)
+  string(SUBSTRING "${EIGEN_GIT_REVNUM}" 0 8 EIGEN_BUILD_VERSION)
 else()
-  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}")
+  set(EIGEN_BUILD_VERSION "" CACHE STRING "")
+endif()
+
+# The EIGEN_VERSION_NUMBER must be of the form <major.minor.patch>.
+# The EIGEN_VERSION_STRING can contain the prerelease/build strings.
+set(EIGEN_VERSION_NUMBER "${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}.${EIGEN_PATCH_VERSION}" CACHE STRING "")
+set(EIGEN_VERSION_STRING "${EIGEN_VERSION_NUMBER}" CACHE STRING "")
+if (NOT "x${EIGEN_PRERELEASE_VERSION}" STREQUAL "x")
+  set(EIGEN_VERSION_STRING "${EIGEN_VERSION_STRING}-${EIGEN_PRERELEASE_VERSION}" CACHE STRING "")
+endif()
+if (NOT "x${EIGEN_BUILD_VERSION}" STREQUAL "x")
+  set(EIGEN_VERSION_STRING "${EIGEN_VERSION_STRING}+${EIGEN_BUILD_VERSION}" CACHE STRING "")
 endif()
 
+
+# Generate version file.
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/Version.in"
+               "${CMAKE_CURRENT_BINARY_DIR}/include/Eigen/Version")
+
 #==============================================================================
 # Install Path Configuration.
 #==============================================================================
@@ -180,11 +211,6 @@ endforeach()
 # Eigen Library.
 #==============================================================================
 
-set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
-set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
-set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
-set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
-
 # Alias Eigen_*_DIR to Eigen3_*_DIR:
 set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
 set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
@@ -197,6 +223,9 @@ target_include_directories (eigen INTERFACE
   $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
 )
 
+# Eigen requires at least C++14
+target_compile_features (eigen INTERFACE cxx_std_14)
+
 # Export as title case Eigen
 set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
 
@@ -216,6 +245,9 @@ if(EIGEN_BUILD_PKGCONFIG)
 endif()
 
 install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel)
+# Replace the "Version" header file with the generated one.
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/Eigen/Version
+    DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/ COMPONENT Devel)
 
 install(TARGETS eigen EXPORT Eigen3Targets)
 
@@ -229,25 +261,10 @@ if(EIGEN_BUILD_CMAKE_PACKAGE)
     NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
   )
 
-  # NOTE Remove the first code path once the minimum required CMake version is
-  # bumped to 3.14 or above.
-  if (CMAKE_VERSION VERSION_LESS 3.14)
-    # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
-    # not depend on architecture specific settings or libraries. More
-    # specifically, an Eigen3Config.cmake generated from a 64 bit target can be
-    # used for 32 bit targets as well (and vice versa).
-    set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
-    unset (CMAKE_SIZEOF_VOID_P)
-    write_basic_package_version_file (Eigen3ConfigVersion.cmake
-                                      VERSION ${EIGEN_VERSION_NUMBER}
-                                      COMPATIBILITY SameMajorVersion)
-    set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
-  else (CMAKE_VERSION VERSION_LESS 3.14)
-    write_basic_package_version_file (Eigen3ConfigVersion.cmake
-                                      VERSION ${EIGEN_VERSION_NUMBER}
-                                      COMPATIBILITY SameMajorVersion
-                                      ARCH_INDEPENDENT)
-  endif (CMAKE_VERSION VERSION_LESS 3.14)
+  set(CVF_VERSION "${EIGEN_VERSION_NUMBER}")
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigVersion.cmake.in"
+                 "Eigen3ConfigVersion.cmake"
+                 @ONLY)
 
   # The Eigen target will be located in the Eigen3 namespace. Other CMake
   # targets can refer to it using Eigen3::Eigen.
@@ -299,17 +316,29 @@ if (EIGEN_IS_BUILDING_)
   set(CMAKE_INCLUDE_CURRENT_DIR OFF)
 
   find_package(StandardMathLibrary)
+  find_package(AOCL QUIET)
   set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
+  if(AOCL_FOUND)
+    list(APPEND EIGEN_STANDARD_LIBRARIES_TO_LINK_TO ${AOCL_LIBRARIES})
+    if(AOCL_INCLUDE_DIRS)
+      include_directories(${AOCL_INCLUDE_DIRS})
+    endif()
+  endif()
+
   if(NOT STANDARD_MATH_LIBRARY_FOUND)
-    message(FATAL_ERROR
-      "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
+  message(FATAL_ERROR
+    "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
   else()
-    if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
-      set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
-    else()
-      set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
-    endif()
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
+  else()
+    set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
   endif()
+  # Clean up any leading/trailing whitespace in the variable to avoid CMP0004 errors
+  string(STRIP "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}" EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+ endif()
+
+
   if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
     message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}")
   else()
@@ -386,6 +415,8 @@ if (EIGEN_BUILD_TESTING)
     ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
     ei_add_cxx_compiler_flag("-Wlogical-op")
     ei_add_cxx_compiler_flag("-Wenum-conversion")
+    ei_add_cxx_compiler_flag("-Werror=deprecated-anon-enum-enum-conversion")
+    ei_add_cxx_compiler_flag("-Werror=deprecated-enum-enum-conversion")
     ei_add_cxx_compiler_flag("-Wc++11-extensions")
     ei_add_cxx_compiler_flag("-Wdouble-promotion")
     # ei_add_cxx_compiler_flag("-Wconversion")
@@ -393,6 +424,7 @@ if (EIGEN_BUILD_TESTING)
     ei_add_cxx_compiler_flag("-Wno-psabi")
     ei_add_cxx_compiler_flag("-Wno-variadic-macros")
     ei_add_cxx_compiler_flag("-Wno-long-long")
+    ei_add_cxx_compiler_flag("-Wno-pass-failed")          # disable clang's warning for unrolling when the loop count is dynamic.
     ei_add_cxx_compiler_flag("-fno-common")
     ei_add_cxx_compiler_flag("-fstrict-aliasing")
     ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
@@ -403,6 +435,17 @@ if (EIGEN_BUILD_TESTING)
       ei_add_cxx_compiler_flag("-fno-check-new")
     endif()
 
+    # GCC 12+ emits false-positive -Warray-bounds, -Wmaybe-uninitialized,
+    # -Wstringop-overread, and -Wnonnull warnings at -O2/-O3 in heavily
+    # templated code with mixed static/dynamic sizes.  These are well-known
+    # compiler bugs (see GCC PR 109394, 106247, 105329, 98610, among others).
+    if (CMAKE_COMPILER_IS_GNUCXX)
+      ei_add_cxx_compiler_flag("-Wno-array-bounds")
+      ei_add_cxx_compiler_flag("-Wno-maybe-uninitialized")
+      ei_add_cxx_compiler_flag("-Wno-stringop-overread")
+      ei_add_cxx_compiler_flag("-Wno-nonnull")
+    endif()
+
 
     if(ANDROID_NDK)
       ei_add_cxx_compiler_flag("-pie")
@@ -639,7 +682,7 @@ if (EIGEN_BUILD_TESTING)
   endif()
 
   set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
-  set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
+  set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
 
   option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
   if(EIGEN_TEST_SYCL)
@@ -729,15 +772,6 @@ if(EIGEN_BUILD_DOC)
   add_subdirectory(doc EXCLUDE_FROM_ALL)
 endif()
 
-# TODO: consider also replacing EIGEN_BUILD_BTL by a custom target "make btl"?
-if(EIGEN_BUILD_BTL)
-  add_subdirectory(bench/btl EXCLUDE_FROM_ALL)
-endif()
-
-if(NOT WIN32 AND EIGEN_BUILD_SPBENCH)
-  add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
-endif()
-
 if (EIGEN_BUILD_DEMOS)
   add_subdirectory(demos EXCLUDE_FROM_ALL)
 endif()
@@ -791,8 +825,9 @@ if(PROJECT_IS_TOP_LEVEL)
 endif()
 
 message(STATUS "")
-message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
+message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
 message(STATUS "")
+
 endif() # Regular CMakeLists of Eigen ends here
 
 ###############################################################################
@@ -814,15 +849,33 @@ include(GNUInstallDirs)
 set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}")
 set(CMAKEPACKAGE_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}")
 
-# automatically parse the version number
-file(READ "${PROJECT_SOURCE_DIR}/Eigen/src/Core/util/Macros.h" _eigen_version_header)
+# Automatically parse the version number.
+# Eigen 5+ moved the version macros from Eigen/src/Core/util/Macros.h to
+# Eigen/Version, and switched to semantic versioning. EIGEN_WORLD_VERSION
+# is now permanently "3" (legacy "Eigen3" name); EIGEN_MAJOR_VERSION /
+# EIGEN_MINOR_VERSION / EIGEN_PATCH_VERSION carry the real semver triple.
+# Build EIGEN_VERSION_NUMBER as MAJOR.MINOR.PATCH so that
+# find_package(Eigen3 X.Y.Z) and Eigen3_VERSION reflect the real version.
+if(EXISTS "${PROJECT_SOURCE_DIR}/Eigen/Version")
+  file(READ "${PROJECT_SOURCE_DIR}/Eigen/Version" _eigen_version_header)
+else()
+  file(READ "${PROJECT_SOURCE_DIR}/Eigen/src/Core/util/Macros.h" _eigen_version_header)
+endif()
 string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen_world_version_match "${_eigen_version_header}")
 set(EIGEN_WORLD_VERSION "${CMAKE_MATCH_1}")
 string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen_major_version_match "${_eigen_version_header}")
 set(EIGEN_MAJOR_VERSION "${CMAKE_MATCH_1}")
 string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_version_match "${_eigen_version_header}")
 set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
-set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
+string(REGEX MATCH "define[ \t]+EIGEN_PATCH_VERSION[ \t]+([0-9]+)" _eigen_patch_version_match "${_eigen_version_header}")
+set(EIGEN_PATCH_VERSION "${CMAKE_MATCH_1}")
+if(EIGEN_WORLD_VERSION STREQUAL "3" AND NOT "x${EIGEN_PATCH_VERSION}" STREQUAL "x")
+  # Eigen >= 5.0: WORLD frozen at 3, real version is MAJOR.MINOR.PATCH. Test for non-empty, since a PATCH of "0" is false in if().
+  set(EIGEN_VERSION_NUMBER ${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}.${EIGEN_PATCH_VERSION})
+else()
+  # Eigen <= 3.4.x: no EIGEN_PATCH_VERSION macro was matched; use the legacy WORLD.MAJOR.MINOR layout.
+  set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
+endif()
 
 include (CMakePackageConfigHelpers)
 
@@ -882,7 +935,7 @@ install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
 # Install files (used for both eigen_external and eigen_internal)
 install(
   DIRECTORY   "${CMAKE_CURRENT_SOURCE_DIR}/Eigen/"
-  DESTINATION "${ITK3P_INSTALL_INCLUDE_DIR}/itkeigen/Eigen"
+  DESTINATION "${INCLUDE_INSTALL_DIR}/itkeigen/Eigen"
   PATTERN "*.txt" EXCLUDE)
 
 ######################### eigen_internal #####################################
@@ -896,6 +949,8 @@ add_library (ITKInternalEigen3::Eigen ALIAS eigen_internal)
 # This would wrongly enforce EIGEN_MPL2_ONLY to other libraries using Eigen.
 # We wrap this definition in ITK_USE_EIGEN_MPL2_ONLY, and only enabling it internally in the dashboards and CI,
 # to avoid introducing GPL code from Eigen3 internally in ITK.
+option(ITK_USE_EIGEN_MPL2_ONLY "Set compile definition EIGEN_MPL2_ONLY for ITKInternalEigen3." OFF)
+mark_as_advanced(ITK_USE_EIGEN_MPL2_ONLY)
 
 if(ITK_USE_EIGEN_MPL2_ONLY)
   target_compile_definitions (eigen_internal INTERFACE "EIGEN_MPL2_ONLY")
@@ -905,10 +960,41 @@ endif()
 #   #include <itkeigen/Eigen/x>
 # INSTALL: headers require pre-prend itkeigen/Eigen/X.
 target_include_directories (eigen_internal SYSTEM INTERFACE
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
-  "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${ITK3P_INSTALL_INCLUDE_DIR}/itkeigen>;"
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
+  # $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/..>
+  $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
   )
 
 # Export as title case Eigen
-install (TARGETS eigen_internal EXPORT ${ITK3P_INSTALL_EXPORT_NAME})
+set_target_properties (eigen_internal PROPERTIES EXPORT_NAME Eigen)
+install (TARGETS eigen_internal EXPORT ITKInternalEigen3Targets)
 
+set(EIGEN3_TARGETS_FILE ITKInternalEigen3Targets.cmake)
+configure_package_config_file (
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/ITKInternalEigen3Config.cmake
+  INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+  )
+# Remove CMAKE_SIZEOF_VOID_P from ITKInternalEigen3ConfigVersion.cmake since
+# Eigen does not depend on architecture specific settings or libraries. More
+# specifically, a package config generated from a 64 bit target can be
+# used for 32 bit targets as well (and vice versa).
+set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+unset (CMAKE_SIZEOF_VOID_P)
+write_basic_package_version_file (ITKInternalEigen3ConfigVersion.cmake
+  VERSION ${EIGEN_VERSION_NUMBER}
+  COMPATIBILITY SameMajorVersion)
+set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
+# The Eigen target will be located in the ITKInternalEigen3 namespace. Other
+# CMake targets can refer to it using ITKInternalEigen3::Eigen.
+export (TARGETS eigen_internal NAMESPACE ITKInternalEigen3:: FILE ITKInternalEigen3Targets.cmake)
+install (EXPORT ITKInternalEigen3Targets NAMESPACE ITKInternalEigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
+# Files already installed in eigen_external
+# install(
+#   DIRECTORY   "${CMAKE_CURRENT_SOURCE_DIR}/Eigen/"
+#   DESTINATION "${INCLUDE_INSTALL_DIR}/itkeigen/Eigen"
+#   PATTERN "*.txt" EXCLUDE)
+install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/ITKInternalEigen3Config.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/ITKInternalEigen3ConfigVersion.cmake
+  DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} )
diff --git a/COPYING.MPL2 b/COPYING.MPL2
index ee6256cdb62..d0a1fa1482e 100644
--- a/COPYING.MPL2
+++ b/COPYING.MPL2
@@ -35,7 +35,7 @@ Mozilla Public License Version 2.0
     means any form of the work other than Source Code Form.
 
 1.7. "Larger Work"
-    means a work that combines Covered Software with other material, in 
+    means a work that combines Covered Software with other material, in
     a separate file or files, that is not Covered Software.
 
 1.8. "License"
diff --git a/COPYING.README b/COPYING.README
index 11af93ca790..93ec692667c 100644
--- a/COPYING.README
+++ b/COPYING.README
@@ -1,6 +1,14 @@
-Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links:
-  http://www.mozilla.org/MPL/2.0/
-  http://www.mozilla.org/MPL/2.0/FAQ.html
+Eigen is primarily licensed under the Mozilla Public License 2.0.
+See LICENSE, COPYING.MPL2, and these links:
+  https://www.mozilla.org/MPL/2.0/
+  https://www.mozilla.org/MPL/2.0/FAQ.html
 
-Some files contain third-party code under BSD or other MPL2-compatible licenses,
-whence the other COPYING.* files here.
\ No newline at end of file
+Some files contain third-party code under permissive or otherwise
+MPL2-compatible licenses, hence the other COPYING.* files here. These
+include Apache-2.0, BSD-style notices, the MINPACK license, and the MORSE
+CMake module BSD-style notice in cmake/MORSE-Copyright.txt.
+
+Note that some optional external dependencies (e.g. FFTW, MPFR C++)
+are distributed under different licenses, including the GPL. Refer to
+the individual source files and their respective COPYING files for
+details.
diff --git a/Eigen/AccelerateSupport b/Eigen/AccelerateSupport
new file mode 100644
index 00000000000..533be688ed2
--- /dev/null
+++ b/Eigen/AccelerateSupport
@@ -0,0 +1,52 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ACCELERATESUPPORT_MODULE_H
+#define EIGEN_ACCELERATESUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \ingroup Support_modules
+ * \defgroup AccelerateSupport_Module AccelerateSupport module
+ *
+ * This module provides an interface to the Apple Accelerate library.
+ * It provides the following seven main factorization classes:
+ * - class AccelerateLLT: a Cholesky (LL^T) factorization.
+ * - class AccelerateLDLT: the default LDL^T factorization.
+ * - class AccelerateLDLTUnpivoted: a Cholesky-like LDL^T factorization with only 1x1 pivots and no pivoting
+ * - class AccelerateLDLTSBK: an LDL^T factorization with Supernode Bunch-Kaufman and static pivoting
+ * - class AccelerateLDLTTPP: an LDL^T factorization with full threshold partial pivoting
+ * - class AccelerateQR: a QR factorization
+ * - class AccelerateCholeskyAtA: a QR factorization without storing Q (equivalent to A^TA = R^T R)
+ *
+ * \code
+ * #include <Eigen/AccelerateSupport>
+ * \endcode
+ *
+ * In order to use this module, the Accelerate headers must be accessible from
+ * the include paths, and your binary must be linked to the Accelerate framework.
+ * The Accelerate library is only available on Apple hardware.
+ *
+ * Note that many of the algorithms can be influenced by the UpLo template
+ * argument. All matrices are assumed to be symmetric. For example, the following
+ * creates an LDLT factorization where your matrix is symmetric (implicit) and
+ * uses the lower triangle:
+ *
+ * \code
+ * AccelerateLDLT<SparseMatrix<float>, Lower> ldlt;
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
+#include "src/AccelerateSupport/AccelerateSupport.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_ACCELERATESUPPORT_MODULE_H
diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index b05ed8278c6..e75357a657b 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -14,8 +14,6 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup Cholesky_Module Cholesky module
- *
- *
  *
  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
  * Those decompositions are also accessible via the following methods:
diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport
index adc5f8d63e7..31725138be8 100644
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -26,7 +26,7 @@
  * For the sake of completeness, this module also propose the two following classes:
  * - class CholmodSimplicialLLT
  * - class CholmodSimplicialLDLT
- * Note that these classes does not bring any particular advantage compared to the built-in
+ * Note that these classes do not bring any particular advantage compared to the built-in
  * SimplicialLLT and SimplicialLDLT factorization classes.
  *
  * \code
diff --git a/Eigen/Core b/Eigen/Core
index cf2b164b711..060c92bcd7f 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -11,6 +11,9 @@
 #ifndef EIGEN_CORE_MODULE_H
 #define EIGEN_CORE_MODULE_H
 
+// Eigen version information.
+#include "Version"
+
 // first thing Eigen does: stop the compiler from reporting useless warnings.
 #include "src/Core/util/DisableStupidWarnings.h"
 
@@ -33,12 +36,6 @@
 #include <new>
 #endif
 
-// Disable the ipa-cp-clone optimization flag with MinGW 6.x or older (enabled by default with -O3)
-// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if EIGEN_COMP_MINGW && EIGEN_GNUC_STRICT_LESS_THAN(6, 0, 0)
-#pragma GCC optimize("-fno-ipa-cp-clone")
-#endif
-
 // Prevent ICC from specializing std::complex operators that silently fail
 // on device. This allows us to use our own device-compatible specializations
 // instead.
@@ -50,10 +47,12 @@
 // this include file manages BLAS and MKL related macros
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
+#include "src/Core/util/AOCL_Support.h"
 
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-#define EIGEN_HAS_GPU_FP16
-#endif
+
+// EIGEN_HAS_GPU_FP16 is now always true when compiling with CUDA or HIP.
+// Use EIGEN_GPUCC (compile-time) or EIGEN_GPU_COMPILE_PHASE (device phase) instead.
+// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.
 
 #if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
 #define EIGEN_HAS_GPU_BF16
@@ -68,8 +67,7 @@
 #include <omp.h>
 #endif
 
-// MSVC for windows mobile does not have the errno.h file
-#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM
+#if !EIGEN_COMP_ARM
 #define EIGEN_HAS_ERRNO
 #endif
 
@@ -92,16 +90,30 @@
 #include <algorithm>
 
 #include <array>
+#include <memory>
 #include <vector>
 
 // for std::is_nothrow_move_assignable
 #include <type_traits>
 
+// for std::move, std::forward, std::declval
+#include <utility>
+
 // for std::this_thread::yield().
 #if !defined(EIGEN_USE_BLAS) && (defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL))
 #include <thread>
 #endif
 
+// for __cpp_lib feature test macros
+#if defined(__has_include) && __has_include(<version>)
+#include <version>
+#endif
+
+// for std::bit_cast()
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+#include <bit>
+#endif
+
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
@@ -109,10 +121,18 @@
 
 // required for __cpuid, needs to be included after cmath
 // also required for _BitScanReverse on Windows on ARM
-#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE
+#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64)
 #include <intrin.h>
 #endif
 
+// Required for querying cache sizes on Linux and macOS.
+#if EIGEN_OS_LINUX
+#include <unistd.h>
+#elif EIGEN_OS_MAC
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
 #if defined(EIGEN_USE_SYCL)
 #undef min
 #undef max
@@ -121,9 +141,7 @@
 #undef isfinite
 #include <CL/sycl.hpp>
 #include <map>
-#include <memory>
 #include <thread>
-#include <utility>
 #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
 #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
 #endif
@@ -132,19 +150,9 @@
 #endif
 #endif
 
-#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || \
-    defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API ||  \
-    defined EIGEN2_SUPPORT
-// This will generate an error message:
-#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
-#endif
-
 namespace Eigen {
 
-// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
-// ensure QNX/QCC support
 using std::size_t;
-// gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;
 
 }  // namespace Eigen
@@ -162,6 +170,8 @@ using std::ptrdiff_t;
 #ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
+#elif defined(EIGEN_LAPACKE_SYSTEM)
+#include <lapacke.h>
 #else
 #include "src/misc/lapacke.h"
 #endif
@@ -192,36 +202,58 @@ using std::ptrdiff_t;
 #include "src/Core/arch/Default/BFloat16.h"
 #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+#if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE)
+#include "src/Core/arch/clang/PacketMath.h"
+#include "src/Core/arch/clang/TypeCasting.h"
+#include "src/Core/arch/clang/Complex.h"
+#include "src/Core/arch/clang/Reductions.h"
+#include "src/Core/arch/clang/MathFunctions.h"
+#else
+#if defined EIGEN_VECTORIZE_AVX512
 #include "src/Core/arch/SSE/PacketMath.h"
 #include "src/Core/arch/SSE/Reductions.h"
-#include "src/Core/arch/SSE/Complex.h"
-#include "src/Core/arch/SSE/TypeCasting.h"
-#include "src/Core/arch/SSE/MathFunctions.h"
-#endif
-
-#if defined EIGEN_VECTORIZE_AVX
 #include "src/Core/arch/AVX/PacketMath.h"
 #include "src/Core/arch/AVX/Reductions.h"
-#include "src/Core/arch/AVX/Complex.h"
-#include "src/Core/arch/AVX/TypeCasting.h"
-#include "src/Core/arch/AVX/MathFunctions.h"
-#endif
-
-#if defined EIGEN_VECTORIZE_AVX512
 #include "src/Core/arch/AVX512/PacketMath.h"
 #include "src/Core/arch/AVX512/Reductions.h"
-#include "src/Core/arch/AVX512/Complex.h"
-#include "src/Core/arch/AVX512/TypeCasting.h"
-#include "src/Core/arch/AVX512/MathFunctions.h"
-#include "src/Core/arch/AVX512/TrsmKernel.h"
-#endif
-
 #if defined EIGEN_VECTORIZE_AVX512FP16
 #include "src/Core/arch/AVX512/PacketMathFP16.h"
+#endif
+#include "src/Core/arch/SSE/TypeCasting.h"
+#include "src/Core/arch/AVX/TypeCasting.h"
+#include "src/Core/arch/AVX512/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
 #include "src/Core/arch/AVX512/TypeCastingFP16.h"
+#endif
+#include "src/Core/arch/SSE/Complex.h"
+#include "src/Core/arch/AVX/Complex.h"
+#include "src/Core/arch/AVX512/Complex.h"
+#include "src/Core/arch/SSE/MathFunctions.h"
+#include "src/Core/arch/AVX/MathFunctions.h"
+#include "src/Core/arch/AVX512/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
 #include "src/Core/arch/AVX512/MathFunctionsFP16.h"
 #endif
+#include "src/Core/arch/AVX512/TrsmKernel.h"
+#elif defined EIGEN_VECTORIZE_AVX
+// Use AVX for floats and doubles, SSE for integers
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
+#include "src/Core/arch/SSE/TypeCasting.h"
+#include "src/Core/arch/SSE/Complex.h"
+#include "src/Core/arch/AVX/PacketMath.h"
+#include "src/Core/arch/AVX/Reductions.h"
+#include "src/Core/arch/AVX/TypeCasting.h"
+#include "src/Core/arch/AVX/Complex.h"
+#include "src/Core/arch/SSE/MathFunctions.h"
+#include "src/Core/arch/AVX/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_SSE
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
+#include "src/Core/arch/SSE/TypeCasting.h"
+#include "src/Core/arch/SSE/MathFunctions.h"
+#include "src/Core/arch/SSE/Complex.h"
+#endif
 
 #if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
 #include "src/Core/arch/AltiVec/PacketMath.h"
@@ -242,6 +274,18 @@ using std::ptrdiff_t;
 #include "src/Core/arch/SVE/PacketMath.h"
 #include "src/Core/arch/SVE/TypeCasting.h"
 #include "src/Core/arch/SVE/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_RVV10
+#include "src/Core/arch/RVV10/PacketMath.h"
+#include "src/Core/arch/RVV10/PacketMath4.h"
+#include "src/Core/arch/RVV10/PacketMath2.h"
+#include "src/Core/arch/RVV10/TypeCasting.h"
+#include "src/Core/arch/RVV10/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_RVV10FP16
+#include "src/Core/arch/RVV10/PacketMathFP16.h"
+#endif
+#if defined EIGEN_VECTORIZE_RVV10BF16
+#include "src/Core/arch/RVV10/PacketMathBF16.h"
+#endif
 #elif defined EIGEN_VECTORIZE_ZVECTOR
 #include "src/Core/arch/ZVector/PacketMath.h"
 #include "src/Core/arch/ZVector/MathFunctions.h"
@@ -269,6 +313,8 @@ using std::ptrdiff_t;
 #endif
 #endif
 
+#endif  // #ifndef EIGEN_VECTORIZE_GENERIC
+
 #include "src/Core/arch/Default/Settings.h"
 // This file provides generic implementations valid for scalar as well
 #include "src/Core/arch/Default/GenericPacketMathFunctions.h"
@@ -304,6 +350,7 @@ using std::ptrdiff_t;
 #include "src/Core/Product.h"
 #include "src/Core/CoreEvaluators.h"
 #include "src/Core/AssignEvaluator.h"
+#include "src/Core/RealView.h"
 #include "src/Core/Assign.h"
 
 #include "src/Core/ArrayBase.h"
@@ -311,13 +358,12 @@ using std::ptrdiff_t;
 #include "src/Core/DenseStorage.h"
 #include "src/Core/NestByValue.h"
 
-// #include "src/Core/ForceAlignedAccess.h"
-
 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
+#include "src/Core/StructuredBindings.h"
 #include "src/Core/Fill.h"
 #include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
@@ -378,23 +424,28 @@ using std::ptrdiff_t;
 #include "src/Core/CoreIterators.h"
 #include "src/Core/ConditionEstimator.h"
 
+#if !defined(EIGEN_VECTORIZE_GENERIC)
 #if defined(EIGEN_VECTORIZE_VSX)
 #include "src/Core/arch/AltiVec/MatrixProduct.h"
 #elif defined EIGEN_VECTORIZE_NEON
 #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
 #elif defined EIGEN_VECTORIZE_LSX
 #include "src/Core/arch/LSX/GeneralBlockPanelKernel.h"
+#elif defined EIGEN_VECTORIZE_RVV10
+#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h"
 #endif
 
 #if defined(EIGEN_VECTORIZE_AVX512)
 #include "src/Core/arch/AVX512/GemmKernel.h"
 #endif
+#endif
 
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
 #include "src/Core/PartialReduxEvaluator.h"
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
+#include "src/Core/ConcatOp.h"
 #include "src/Core/Reverse.h"
 #include "src/Core/ArrayWrapper.h"
 #include "src/Core/StlIterators.h"
@@ -414,6 +465,10 @@ using std::ptrdiff_t;
 #include "src/Core/Assign_MKL.h"
 #endif
 
+#ifdef EIGEN_USE_AOCL_VML
+#include "src/Core/Assign_AOCL.h"
+#endif
+
 #include "src/Core/GlobalFunctions.h"
 // IWYU pragma: end_exports
 
diff --git a/Eigen/Dense b/Eigen/Dense
index 5768910bd88..c90db7657a7 100644
--- a/Eigen/Dense
+++ b/Eigen/Dense
@@ -1,3 +1,13 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DENSE_MODULE_H
+#define EIGEN_DENSE_MODULE_H
+
 #include "Core"
 #include "LU"
 #include "Cholesky"
@@ -5,3 +15,5 @@
 #include "SVD"
 #include "Geometry"
 #include "Eigenvalues"
+
+#endif  // EIGEN_DENSE_MODULE_H
diff --git a/Eigen/Eigen b/Eigen/Eigen
index 654c8dc6380..bb8f02f04b0 100644
--- a/Eigen/Eigen
+++ b/Eigen/Eigen
@@ -1,2 +1,14 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EIGEN_MODULE_H
+#define EIGEN_EIGEN_MODULE_H
+
 #include "Dense"
 #include "Sparse"
+
+#endif  // EIGEN_EIGEN_MODULE_H
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 3b0bdee1715..f68eb85421b 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -11,16 +11,13 @@
 #include "Core"
 
 #include "Cholesky"
-#include "Jacobi"
-#include "Householder"
 #include "LU"
 #include "Geometry"
+#include "Sparse"  // Needed by ComplexQZ.
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup Eigenvalues_Module Eigenvalues module
- *
- *
  *
  * This module mainly provides various eigenvalue solvers.
  * This module also provides some MatrixBase methods, including:
@@ -32,8 +29,6 @@
  * \endcode
  */
 
-#include "src/misc/RealSvd2x2.h"
-
 // IWYU pragma: begin_exports
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
@@ -44,11 +39,14 @@
 #include "src/Eigenvalues/ComplexSchur.h"
 #include "src/Eigenvalues/ComplexEigenSolver.h"
 #include "src/Eigenvalues/RealQZ.h"
+#include "src/Eigenvalues/ComplexQZ.h"
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
+#elif defined(EIGEN_LAPACKE_SYSTEM)
+#include <lapacke.h>
 #else
 #include "src/misc/lapacke.h"
 #endif
diff --git a/Eigen/Geometry b/Eigen/Geometry
index efe3e1fa339..c3ddb3d8a33 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -12,7 +12,6 @@
 
 #include "SVD"
 #include "LU"
-#include <limits>
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
@@ -48,10 +47,13 @@
 #include "src/Geometry/AlignedBox.h"
 #include "src/Geometry/Umeyama.h"
 
+#ifndef EIGEN_VECTORIZE_GENERIC
+// TODO(rmlarsen): Make these work with generic vectorization if possible.
 // Use the SSE optimized version whenever possible.
 #if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
 #include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
+#endif
 // IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/Householder b/Eigen/Householder
index 5070e070e67..719edaffedb 100644
--- a/Eigen/Householder
+++ b/Eigen/Householder
@@ -22,8 +22,8 @@
 
 // IWYU pragma: begin_exports
 #include "src/Householder/Householder.h"
-#include "src/Householder/HouseholderSequence.h"
 #include "src/Householder/BlockHouseholder.h"
+#include "src/Householder/HouseholderSequence.h"
 // IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/KLUSupport b/Eigen/KLUSupport
new file mode 100644
index 00000000000..6a5c59710c2
--- /dev/null
+++ b/Eigen/KLUSupport
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_MODULE_H
+#define EIGEN_KLUSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+extern "C" {
+#include <btf.h>
+#include <klu.h>
+}
+
+/** \ingroup Support_modules
+ * \defgroup KLUSupport_Module KLUSupport module
+ *
+ * This module provides an interface to the KLU library which is part of the <a
+ * href="http://www.suitesparse.com">suitesparse</a> package. It provides the following factorization class:
+ * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
+ *
+ * \code
+ * #include <Eigen/KLUSupport>
+ * \endcode
+ *
+ * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must
+ * be linked to the klu library and its dependencies. The dependencies depend on how KLU has been compiled. For a
+ * cmake based project, you can use our FindKLU.cmake module to help you in this task.
+ *
+ */
+
+// IWYU pragma: begin_exports
+#include "src/KLUSupport/KLUSupport.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_KLUSUPPORT_MODULE_H
diff --git a/Eigen/LU b/Eigen/LU
index d80448039ef..ea54e353ec6 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -23,10 +23,10 @@
  * \endcode
  */
 
+// IWYU pragma: begin_exports
 #include "src/misc/Kernel.h"
 #include "src/misc/Image.h"
-
-// IWYU pragma: begin_exports
+#include "src/misc/RankRevealingBase.h"
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
@@ -36,9 +36,12 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
 
+#ifndef EIGEN_VECTORIZE_GENERIC
+// TODO(rmlarsen): Make these work with generic vectorization if possible.
 #if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
 #include "src/LU/arch/InverseSize4.h"
 #endif
+#endif
 // IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport
index dd1cfcb12de..59442316eff 100644
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@@ -36,7 +36,7 @@ extern "C" {
  * \endcode
  *
  * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be
- * linked to the PaSTiX library and its dependencies. This wrapper resuires PaStiX version 5.x compiled without MPI
+ * linked to the PaSTiX library and its dependencies. This wrapper requires PaStiX version 5.x compiled without MPI
  * support. The dependencies depend on how PaSTiX has been compiled. For a cmake based project, you can use our
  * FindPaSTiX.cmake module to help you in this task.
  *
diff --git a/Eigen/QR b/Eigen/QR
index c38b453b076..b29abce9ba0 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -11,14 +11,11 @@
 #include "Core"
 
 #include "Cholesky"
-#include "Jacobi"
 #include "Householder"
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup QR_Module QR module
- *
- *
  *
  * This module provides various QR decompositions
  * This module also provides some MatrixBase methods, including:
@@ -31,6 +28,8 @@
  * \endcode
  */
 
+#include "src/misc/RankRevealingBase.h"
+
 // IWYU pragma: begin_exports
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index 585f8e81ceb..6e15b26e67c 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -14,11 +14,11 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-void *qMalloc(std::size_t size) { return Eigen::internal::aligned_malloc(size); }
+inline void *qMalloc(std::size_t size) { return Eigen::internal::aligned_malloc(size); }
 
-void qFree(void *ptr) { Eigen::internal::aligned_free(ptr); }
+inline void qFree(void *ptr) { Eigen::internal::aligned_free(ptr); }
 
-void *qRealloc(void *ptr, std::size_t size) {
+inline void *qRealloc(void *ptr, std::size_t size) {
   void *newPtr = Eigen::internal::aligned_malloc(size);
   std::memcpy(newPtr, ptr, size);
   Eigen::internal::aligned_free(ptr);
diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index c01dbe0093f..bfc2e7bfa70 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -38,4 +38,4 @@
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif
+#endif  // EIGEN_SPQRSUPPORT_MODULE_H
diff --git a/Eigen/SVD b/Eigen/SVD
index 2a013f825d7..ef5e36e825f 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -9,14 +9,10 @@
 #define EIGEN_SVD_MODULE_H
 
 #include "QR"
-#include "Householder"
-#include "Jacobi"
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup SVD_Module SVD module
- *
- *
  *
  * This module provides SVD decomposition for matrices (both real and complex).
  * Two decomposition algorithms are provided:
@@ -33,7 +29,6 @@
  */
 
 // IWYU pragma: begin_exports
-#include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
@@ -41,6 +36,8 @@
 #ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
+#elif defined(EIGEN_LAPACKE_SYSTEM)
+#include <lapacke.h>
 #else
 #include "src/misc/lapacke.h"
 #endif
diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index 56a9401af34..6020e42855b 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -12,11 +12,7 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-#include <vector>
 #include <map>
-#include <cstdlib>
-#include <cstring>
-#include <algorithm>
 #include <numeric>
 
 /**
diff --git a/Eigen/SparseQR b/Eigen/SparseQR
index b4f1cad6bbb..1ad51923c87 100644
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -35,4 +35,4 @@
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif
+#endif  // EIGEN_SPARSEQR_MODULE_H
diff --git a/Eigen/SuperLUSupport b/Eigen/SuperLUSupport
index 79e2222f40d..27e14d29eb1 100644
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@@ -16,6 +16,7 @@
 #define EIGEN_EMPTY_WAS_ALREADY_DEFINED
 #endif
 
+// Required by SuperLU headers, which expect int_t to be defined as a global typedef.
 typedef int int_t;
 #include <slu_Cnames.h>
 #include <supermatrix.h>
diff --git a/Eigen/ThreadPool b/Eigen/ThreadPool
new file mode 100644
index 00000000000..ac08bef0388
--- /dev/null
+++ b/Eigen/ThreadPool
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_THREADPOOL_MODULE_H
+#define EIGEN_THREADPOOL_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup ThreadPool_Module ThreadPool Module
+ *
+ * This module provides two thread pool implementations:
+ *  - a simple reference implementation
+ *  - a faster non-blocking implementation
+ *
+ * \code
+ * #include <Eigen/ThreadPool>
+ * \endcode
+ */
+
+#include <cstddef>
+#include <cstring>
+#include <ctime>
+
+#include <vector>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+#include <functional>
+#include <memory>
+#include <utility>
+
+// There are non-parenthesized calls to "max" in the <unordered_map> header,
+// which trigger a check in test/main.h causing compilation to fail.
+// We work around the check here by removing the check for max in
+// the case where we have to emulate thread_local.
+#ifdef max
+#undef max
+#endif
+#include <unordered_map>
+
+#include "src/Core/util/Meta.h"
+#include "src/Core/util/MaxSizeVector.h"
+
+#ifndef EIGEN_MUTEX
+#define EIGEN_MUTEX std::mutex
+#endif
+#ifndef EIGEN_MUTEX_LOCK
+#define EIGEN_MUTEX_LOCK std::unique_lock<std::mutex>
+#endif
+#ifndef EIGEN_CONDVAR
+#define EIGEN_CONDVAR std::condition_variable
+#endif
+
+// IWYU pragma: begin_exports
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/ThreadCancel.h"
+#include "src/ThreadPool/EventCount.h"
+#include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadPoolInterface.h"
+#include "src/ThreadPool/ThreadEnvironment.h"
+#include "src/ThreadPool/Barrier.h"
+#include "src/ThreadPool/NonBlockingThreadPool.h"
+#include "src/ThreadPool/CoreThreadPoolDevice.h"
+#include "src/ThreadPool/ForkJoin.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_THREADPOOL_MODULE_H
diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport
index 126344cba3f..28e386fad85 100644
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport
@@ -35,7 +35,7 @@ extern "C" {
 
 // IWYU pragma: begin_exports
 #include "src/UmfPackSupport/UmfPackSupport.h"
-// IWYU pragma: endexports
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/Eigen/Version b/Eigen/Version
new file mode 100644
index 00000000000..c1083631856
--- /dev/null
+++ b/Eigen/Version
@@ -0,0 +1,21 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_VERSION_H
+#define EIGEN_VERSION_H
+
+// The "WORLD" version will forever remain "3" for the "Eigen3" library.
+#define EIGEN_WORLD_VERSION 3
+// As of Eigen3 5.0.0, we have moved to Semantic Versioning (semver.org).
+#define EIGEN_MAJOR_VERSION 5
+#define EIGEN_MINOR_VERSION 0
+#define EIGEN_PATCH_VERSION 1
+#define EIGEN_PRERELEASE_VERSION "dev"
+#define EIGEN_BUILD_VERSION "master"
+#define EIGEN_VERSION_STRING "5.0.1-dev+master"
+
+#endif  // EIGEN_VERSION_H
diff --git a/Eigen/src/AccelerateSupport/AccelerateSupport.h b/Eigen/src/AccelerateSupport/AccelerateSupport.h
index 13a26dfbb18..c944aeabd03 100644
--- a/Eigen/src/AccelerateSupport/AccelerateSupport.h
+++ b/Eigen/src/AccelerateSupport/AccelerateSupport.h
@@ -110,7 +110,7 @@ using AccelerateCholeskyAtA = AccelerateImpl<MatrixType, 0, SparseFactorizationC
 namespace internal {
 template <typename T>
 struct AccelFactorizationDeleter {
-  void operator()(T* sym) {
+  void operator()(T* sym) const {
     if (sym) {
       SparseCleanup(*sym);
       delete sym;
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index b1d801d34df..63aa5bd756c 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -84,7 +84,13 @@ class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
    * The default constructor is useful in cases in which the user intends to
    * perform decompositions via LDLT::compute(const MatrixType&).
    */
-  LDLT() : m_matrix(), m_transpositions(), m_sign(internal::ZeroSign), m_isInitialized(false) {}
+  LDLT()
+      : m_matrix(),
+        m_l1_norm(0),
+        m_transpositions(),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false),
+        m_info(InvalidInput) {}
 
   /** \brief Default Constructor with memory preallocation
    *
@@ -94,10 +100,12 @@ class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
    */
   explicit LDLT(Index size)
       : m_matrix(size, size),
+        m_l1_norm(0),
         m_transpositions(size),
         m_temporary(size),
         m_sign(internal::ZeroSign),
-        m_isInitialized(false) {}
+        m_isInitialized(false),
+        m_info(InvalidInput) {}
 
   /** \brief Constructor with decomposition
    *
@@ -108,10 +116,12 @@ class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
   template <typename InputType>
   explicit LDLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
+        m_l1_norm(0),
         m_transpositions(matrix.rows()),
         m_temporary(matrix.rows()),
         m_sign(internal::ZeroSign),
-        m_isInitialized(false) {
+        m_isInitialized(false),
+        m_info(InvalidInput) {
     compute(matrix.derived());
   }
 
@@ -125,10 +135,12 @@ class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
   template <typename InputType>
   explicit LDLT(EigenBase<InputType>& matrix)
       : m_matrix(matrix.derived()),
+        m_l1_norm(0),
         m_transpositions(matrix.rows()),
         m_temporary(matrix.rows()),
         m_sign(internal::ZeroSign),
-        m_isInitialized(false) {
+        m_isInitialized(false),
+        m_info(InvalidInput) {
     compute(matrix.derived());
   }
 
@@ -191,7 +203,7 @@ class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
    * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
    */
   template <typename Rhs>
-  inline const Solve<LDLT, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<LDLT, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   template <typename Derived>
@@ -213,7 +225,7 @@ class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
 
   /** \returns the internal LDLT decomposition matrix
    *
-   * TODO: document the storage layout
+   * TODO: document the storage layout.
    */
   inline const MatrixType& matrixLDLT() const {
     eigen_assert(m_isInitialized && "LDLT is not initialized.");
@@ -477,19 +489,8 @@ LDLT<MatrixType, UpLo_>& LDLT<MatrixType, UpLo_>::compute(const EigenBase<InputT
 
   m_matrix = a.derived();
 
-  // Compute matrix L1 norm = max abs column sum.
-  m_l1_norm = RealScalar(0);
-  // TODO move this code to SelfAdjointView
-  for (Index col = 0; col < size; ++col) {
-    RealScalar abs_col_sum;
-    if (UpLo_ == Lower)
-      abs_col_sum =
-          m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
-    else
-      abs_col_sum =
-          m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
-    if (abs_col_sum > m_l1_norm) m_l1_norm = abs_col_sum;
-  }
+  // Compute matrix L1 norm = max abs column sum over the implicit self-adjoint matrix.
+  m_l1_norm = m_matrix.template selfadjointView<UpLo_>().l1Norm();
 
   m_transpositions.resize(size);
   m_isInitialized = false;
@@ -630,8 +631,8 @@ MatrixType LDLT<MatrixType, UpLo_>::reconstructedMatrix() const {
  * \sa MatrixBase::ldlt()
  */
 template <typename MatrixType, unsigned int UpLo>
-inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
-SelfAdjointView<MatrixType, UpLo>::ldlt() const {
+inline LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo> SelfAdjointView<MatrixType, UpLo>::ldlt()
+    const {
   return LDLT<PlainObject, UpLo>(m_matrix);
 }
 
@@ -640,7 +641,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const {
  * \sa SelfAdjointView::ldlt()
  */
 template <typename Derived>
-inline const LDLT<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::ldlt() const {
+inline LDLT<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::ldlt() const {
   return LDLT<PlainObject>(derived());
 }
 
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 7fa4fa2a0f6..9bffeae5b5c 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -86,7 +86,7 @@ class LLT : public SolverBase<LLT<MatrixType_, UpLo_> > {
    * The default constructor is useful in cases in which the user intends to
    * perform decompositions via LLT::compute(const MatrixType&).
    */
-  LLT() : m_matrix(), m_isInitialized(false) {}
+  LLT() : m_matrix(), m_l1_norm(0), m_isInitialized(false), m_info(InvalidInput) {}
 
   /** \brief Default Constructor with memory preallocation
    *
@@ -94,10 +94,11 @@ class LLT : public SolverBase<LLT<MatrixType_, UpLo_> > {
    * according to the specified problem \a size.
    * \sa LLT()
    */
-  explicit LLT(Index size) : m_matrix(size, size), m_isInitialized(false) {}
+  explicit LLT(Index size) : m_matrix(size, size), m_l1_norm(0), m_isInitialized(false), m_info(InvalidInput) {}
 
   template <typename InputType>
-  explicit LLT(const EigenBase<InputType>& matrix) : m_matrix(matrix.rows(), matrix.cols()), m_isInitialized(false) {
+  explicit LLT(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.rows(), matrix.cols()), m_l1_norm(0), m_isInitialized(false), m_info(InvalidInput) {
     compute(matrix.derived());
   }
 
@@ -109,7 +110,8 @@ class LLT : public SolverBase<LLT<MatrixType_, UpLo_> > {
    * \sa LLT(const EigenBase&)
    */
   template <typename InputType>
-  explicit LLT(EigenBase<InputType>& matrix) : m_matrix(matrix.derived()), m_isInitialized(false) {
+  explicit LLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()), m_l1_norm(0), m_isInitialized(false), m_info(InvalidInput) {
     compute(matrix.derived());
   }
 
@@ -137,7 +139,7 @@ class LLT : public SolverBase<LLT<MatrixType_, UpLo_> > {
    * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
    */
   template <typename Rhs>
-  inline const Solve<LLT, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<LLT, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   template <typename Derived>
@@ -402,19 +404,8 @@ LLT<MatrixType, UpLo_>& LLT<MatrixType, UpLo_>::compute(const EigenBase<InputTyp
   m_matrix.resize(size, size);
   if (!internal::is_same_dense(m_matrix, a.derived())) m_matrix = a.derived();
 
-  // Compute matrix L1 norm = max abs column sum.
-  m_l1_norm = RealScalar(0);
-  // TODO move this code to SelfAdjointView
-  for (Index col = 0; col < size; ++col) {
-    RealScalar abs_col_sum;
-    if (UpLo_ == Lower)
-      abs_col_sum =
-          m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
-    else
-      abs_col_sum =
-          m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
-    if (abs_col_sum > m_l1_norm) m_l1_norm = abs_col_sum;
-  }
+  // Compute matrix L1 norm = max abs column sum over the implicit self-adjoint matrix.
+  m_l1_norm = m_matrix.template selfadjointView<UpLo_>().l1Norm();
 
   m_isInitialized = true;
   bool ok = Traits::inplace_decomposition(m_matrix);
@@ -495,7 +486,7 @@ MatrixType LLT<MatrixType, UpLo_>::reconstructedMatrix() const {
  * \sa SelfAdjointView::llt()
  */
 template <typename Derived>
-inline const LLT<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::llt() const {
+inline LLT<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::llt() const {
   return LLT<PlainObject>(derived());
 }
 
@@ -504,7 +495,7 @@ inline const LLT<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>:
  * \sa SelfAdjointView::llt()
  */
 template <typename MatrixType, unsigned int UpLo>
-inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo> SelfAdjointView<MatrixType, UpLo>::llt()
+inline LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo> SelfAdjointView<MatrixType, UpLo>::llt()
     const {
   return LLT<PlainObject, UpLo>(m_matrix);
 }
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index 7e3c881aec9..dc3d6a3471d 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -360,7 +360,7 @@ class CholmodBase : public SparseSolverBase<Derived> {
       this->m_info = NumericalIssue;
       return;
     }
-    // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+    // TODO: optimize this copy by swapping when possible (be careful with alignment, etc.)
     // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve
     dest = Matrix<Scalar, Dest::RowsAtCompileTime, Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),
                                                                                  b.rows(), b.cols());
@@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived> {
       this->m_info = NumericalIssue;
       return;
     }
-    // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+    // TODO: optimize this copy by swapping when possible (be careful with alignment, etc.)
     // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's
     // sparse solver)
     dest.derived() = viewAsEigen<typename DestDerived::Scalar, typename DestDerived::StorageIndex>(*x_cs);
diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h
index ae6373dda2d..65e7961d66f 100644
--- a/Eigen/src/Core/ArithmeticSequence.h
+++ b/Eigen/src/Core/ArithmeticSequence.h
@@ -178,11 +178,10 @@ auto seq(FirstType f, LastType l, IncrType incr)
 
 namespace placeholders {
 
-/** \cpp11
- * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+/** \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
  *
  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
- *
+ * \anchor Eigen_placeholders_lastN
  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
 template <typename SizeType, typename IncrType>
 auto lastN(SizeType size, IncrType incr)
@@ -190,8 +189,7 @@ auto lastN(SizeType size, IncrType incr)
   return seqN(Eigen::placeholders::last - (size - fix<1>()) * incr, size, incr);
 }
 
-/** \cpp11
- * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
+/** \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
  *
  *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
  *
@@ -220,7 +218,7 @@ auto lastN(SizeType size) -> decltype(seqN(Eigen::placeholders::last + fix<1>()
   using Eigen::seqN;
   using Eigen::placeholders::all;
   using Eigen::placeholders::last;
-  using Eigen::placeholders::lastN;  // c++11 only
+  using Eigen::placeholders::lastN;
   using Eigen::placeholders::lastp1;
   \endcode
   */
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 57f3186b09b..28ff760e796 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -123,12 +123,12 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
    * \sa resize(Index,Index)
    */
 #ifdef EIGEN_INITIALIZE_COEFFS
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array() : Base() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+  EIGEN_DEVICE_FUNC constexpr Array() : Base() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 #else
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array() = default;
+  EIGEN_DEVICE_FUNC constexpr Array() = default;
 #endif
   /** \brief Move constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(Array&&) = default;
+  EIGEN_DEVICE_FUNC constexpr Array(Array&&) = default;
   EIGEN_DEVICE_FUNC Array& operator=(Array&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
     Base::operator=(std::move(other));
     return *this;
@@ -141,7 +141,7 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
    * This constructor is for 1D array or vectors with more than 4 coefficients.
    *
    * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
-   * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+   * constructor must match the fixed number of rows (resp. columns) of \c *this.
    *
    *
    * Example: \include Array_variadic_ctor_cxx11.cpp
@@ -156,7 +156,6 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
       : Base(a0, a1, a2, a3, args...) {}
 
   /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row.
-   * \cpp11
    *
    * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
    *
@@ -178,9 +177,7 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
    *
    * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(
-      const std::initializer_list<std::initializer_list<Scalar>>& list)
-      : Base(list) {}
+  EIGEN_DEVICE_FUNC constexpr Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename T>
@@ -239,7 +236,7 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
   }
 
   /** Copy constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(const Array&) = default;
+  EIGEN_DEVICE_FUNC constexpr Array(const Array&) = default;
 
  private:
   struct PrivateType {};
@@ -247,7 +244,7 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
  public:
   /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Array(
       const EigenBase<OtherDerived>& other,
       std::enable_if_t<internal::is_convertible<typename OtherDerived::Scalar, Scalar>::value, PrivateType> =
           PrivateType())
@@ -282,7 +279,7 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
  * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
  * a fixed-size 1D array of 4 complex floats.
  *
- * With \cpp11, template alias are also defined for common sizes.
+ * Template aliases are also defined for common sizes.
  * They follow the same pattern as above except that the scalar type suffix is replaced by a
  * template parameter, i.e.:
  *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
@@ -324,21 +321,17 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 
 #define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix)              \
   /** \ingroup arraytypedefs */                                  \
-  /** \brief \cpp11 */                                           \
   template <typename Type>                                       \
   using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>; \
   /** \ingroup arraytypedefs */                                  \
-  /** \brief \cpp11 */                                           \
   template <typename Type>                                       \
   using Array##SizeSuffix = Array<Type, Size, 1>;
 
 #define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)        \
   /** \ingroup arraytypedefs */                      \
-  /** \brief \cpp11 */                               \
   template <typename Type>                           \
   using Array##Size##X = Array<Type, Size, Dynamic>; \
   /** \ingroup arraytypedefs */                      \
-  /** \brief \cpp11 */                               \
   template <typename Type>                           \
   using Array##X##Size = Array<Type, Dynamic, Size>;
 
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index 8465f54feda..dacc2393334 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -168,19 +168,16 @@ class ArrayBase : public DenseBase<Derived> {
   }
 
  public:
-  EIGEN_DEVICE_FUNC ArrayBase<Derived>& array() { return *this; }
-  EIGEN_DEVICE_FUNC const ArrayBase<Derived>& array() const { return *this; }
+  EIGEN_DEVICE_FUNC constexpr ArrayBase<Derived>& array() { return *this; }
+  EIGEN_DEVICE_FUNC constexpr const ArrayBase<Derived>& array() const { return *this; }
 
   /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
    * \sa MatrixBase::array() */
-  EIGEN_DEVICE_FUNC MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
-  EIGEN_DEVICE_FUNC const MatrixWrapper<const Derived> matrix() const {
+  EIGEN_DEVICE_FUNC constexpr MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
+  EIGEN_DEVICE_FUNC constexpr const MatrixWrapper<const Derived> matrix() const {
     return MatrixWrapper<const Derived>(derived());
   }
 
-  //     template<typename Dest>
-  //     inline void evalTo(Dest& dst) const { dst = matrix(); }
-
  protected:
   EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
   EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index c9a194e991f..fb05ab55d88 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -21,7 +21,7 @@ namespace Eigen {
  * \brief Expression of a mathematical vector or matrix as an array object
  *
  * This class is the return type of MatrixBase::array(), and most of the time
- * this is the only way it is use.
+ * this is the only way it is used.
  *
  * \sa MatrixBase::array(), class MatrixWrapper
  */
@@ -54,7 +54,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > {
 
   using Base::coeffRef;
 
-  EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+  EIGEN_DEVICE_FUNC constexpr explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix)
+      : m_expression(matrix) {}
 
   EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
   EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
@@ -75,7 +76,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > {
     dst = m_expression;
   }
 
-  EIGEN_DEVICE_FUNC const internal::remove_all_t<NestedExpressionType>& nestedExpression() const {
+  EIGEN_DEVICE_FUNC constexpr const internal::remove_all_t<NestedExpressionType>& nestedExpression() const {
     return m_expression;
   }
 
@@ -96,7 +97,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > {
  * \brief Expression of an array as a mathematical vector or matrix
  *
  * This class is the return type of ArrayBase::matrix(), and most of the time
- * this is the only way it is use.
+ * this is the only way it is used.
  *
  * \sa MatrixBase::matrix(), class ArrayWrapper
  */
@@ -129,7 +130,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > {
 
   using Base::coeffRef;
 
-  EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
   EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
   EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
@@ -145,7 +146,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > {
 
   EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { return m_expression.coeffRef(index); }
 
-  EIGEN_DEVICE_FUNC const internal::remove_all_t<NestedExpressionType>& nestedExpression() const {
+  EIGEN_DEVICE_FUNC constexpr const internal::remove_all_t<NestedExpressionType>& nestedExpression() const {
     return m_expression;
   }
 
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 4b30f7bb626..3d30d868c23 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -19,7 +19,8 @@ namespace Eigen {
 
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::lazyAssign(const DenseBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::lazyAssign(
+    const DenseBase<OtherDerived>& other) {
   enum { SameType = internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value };
 
   EIGEN_STATIC_ASSERT_LVALUE(Derived)
@@ -36,40 +37,43 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::lazyAssign(co
 
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(
+    const DenseBase<OtherDerived>& other) {
   internal::call_assignment(derived(), other.derived());
   return derived();
 }
 
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other) {
   internal::call_assignment(derived(), other.derived());
   return derived();
 }
 
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other) {
   internal::call_assignment(derived(), other.derived());
   return derived();
 }
 
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(
+    const DenseBase<OtherDerived>& other) {
   internal::call_assignment(derived(), other.derived());
   return derived();
 }
 
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(
+    const EigenBase<OtherDerived>& other) {
   internal::call_assignment(derived(), other.derived());
   return derived();
 }
 
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(
     const ReturnByValue<OtherDerived>& other) {
   other.derived().evalTo(derived());
   return derived();
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 36f0a9d74de..3c30d2227a2 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -63,7 +63,7 @@ struct copy_using_evaluator_traits {
   static constexpr int RestrictedLinearSize = min_size_prefer_fixed(MaxSizeAtCompileTime, MaxPacketSize);
   static constexpr int OuterStride = outer_stride_at_compile_time<Dst>::ret;
 
-  // TODO distinguish between linear traversal and inner-traversals
+  // TODO: distinguish between linear traversal and inner-traversal packet types.
   using LinearPacketType = typename find_best_packet<DstScalar, RestrictedLinearSize>::type;
   using InnerPacketType = typename find_best_packet<DstScalar, RestrictedInnerSize>::type;
 
@@ -83,20 +83,36 @@ struct copy_using_evaluator_traits {
                                             (OuterStride != Dynamic) && (OuterStride % InnerPacketSize == 0) &&
                                             (EIGEN_UNALIGNED_VECTORIZE || JointAlignment >= InnerRequiredAlignment);
   static constexpr bool MayLinearize = StorageOrdersAgree && (DstFlags & SrcFlags & LinearAccessBit);
+  static constexpr int CoeffReadCost = int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost);
+  static constexpr bool SmallAssignmentScalarPathIsCheap =
+      (SizeAtCompileTime != Dynamic) && (SizeAtCompileTime * CoeffReadCost <= EIGEN_UNROLLING_LIMIT);
+  /* Packet traversal has enough setup/tail overhead that it is not worth it
+     for very small fixed-size assignments when the scalar path can be fully
+     unrolled. More expensive RHS expressions can still amortize packet setup. */
+  static constexpr int SmallAssignmentPacketThreshold = 3;
+  static constexpr int LinearPacketThreshold = SmallAssignmentScalarPathIsCheap ? SmallAssignmentPacketThreshold : 1;
+  static constexpr int LinearSizeThreshold = LinearPacketThreshold * LinearPacketSize;
   static constexpr bool MayLinearVectorize =
       MightVectorize && MayLinearize && DstHasDirectAccess &&
       (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment) || MaxSizeAtCompileTime == Dynamic) &&
-      (MaxSizeAtCompileTime == Dynamic || MaxSizeAtCompileTime >= LinearPacketSize);
-  /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-     so it's only good for large enough sizes. */
-  static constexpr int InnerSizeThreshold = (EIGEN_UNALIGNED_VECTORIZE ? 1 : 3) * InnerPacketSize;
+      (MaxSizeAtCompileTime == Dynamic || MaxSizeAtCompileTime >= LinearSizeThreshold);
+  /* Slice vectorization can be slow, so use MaxInnerSize rather than InnerSize:
+     a dynamic block in a fixed-size matrix can still have large slices. With
+     EIGEN_UNALIGNED_VECTORIZE and unrolling, one packet is still worthwhile for
+     non-vector slices. Cheap fixed-size vector blocks can otherwise fall back to
+     slice vectorization after the linear path is rejected, so use the same
+     conservative cutoff there. */
+  static constexpr bool UseConservativeVectorInnerThreshold = IsVectorAtCompileTime && SmallAssignmentScalarPathIsCheap;
+  static constexpr int VectorInnerPacketThreshold =
+      (UseConservativeVectorInnerThreshold || !EIGEN_UNALIGNED_VECTORIZE) ? SmallAssignmentPacketThreshold : 1;
+  static constexpr int VectorInnerSizeThreshold = VectorInnerPacketThreshold * InnerPacketSize;
+  static constexpr int NonVectorInnerSizeThreshold =
+      (EIGEN_UNALIGNED_VECTORIZE ? 1 : SmallAssignmentPacketThreshold) * InnerPacketSize;
+  static constexpr int InnerSizeThreshold =
+      IsVectorAtCompileTime ? VectorInnerSizeThreshold : NonVectorInnerSizeThreshold;
   static constexpr bool MaySliceVectorize =
       MightVectorize && DstHasDirectAccess &&
       (MaxInnerSizeAtCompileTime == Dynamic || MaxInnerSizeAtCompileTime >= InnerSizeThreshold);
-  /* slice vectorization can be slow, so we only want it if the slices are big, which is
-     indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-     in a fixed-size matrix
-     However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
 
  public:
   static constexpr int Traversal = SizeAtCompileTime == 0 ? AllAtOnceTraversal
@@ -115,7 +131,6 @@ struct copy_using_evaluator_traits {
  private:
   static constexpr int ActualPacketSize = Vectorized ? unpacket_traits<PacketType>::size : 1;
   static constexpr int UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize;
-  static constexpr int CoeffReadCost = int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost);
   static constexpr bool MayUnrollCompletely =
       (SizeAtCompileTime != Dynamic) && (SizeAtCompileTime * CoeffReadCost <= UnrollingLimit);
   static constexpr bool MayUnrollInner =
@@ -474,8 +489,8 @@ struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, NoUnrolling
   static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
   static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
   static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
-  static constexpr bool Alignable =
-      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+  static constexpr bool Alignable = (DstAlignment >= RequestedAlignment) ||
+                                    (static_cast<std::size_t>(RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
   static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
   static constexpr bool DstIsAligned = DstAlignment >= Alignment;
   static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
@@ -587,8 +602,8 @@ struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, NoUnrolling>
   static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
   static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
   static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
-  static constexpr bool Alignable =
-      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+  static constexpr bool Alignable = (DstAlignment >= RequestedAlignment) ||
+                                    (static_cast<std::size_t>(RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
   static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
   static constexpr bool DstIsAligned = DstAlignment >= Alignment;
   static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
@@ -654,15 +669,15 @@ struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, InnerUnrolli
 template <typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
 class generic_dense_assignment_kernel {
  protected:
-  typedef typename DstEvaluatorTypeT::XprType DstXprType;
-  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
+  using DstXprType = typename DstEvaluatorTypeT::XprType;
+  using SrcXprType = typename SrcEvaluatorTypeT::XprType;
 
  public:
-  typedef DstEvaluatorTypeT DstEvaluatorType;
-  typedef SrcEvaluatorTypeT SrcEvaluatorType;
-  typedef typename DstEvaluatorType::Scalar Scalar;
-  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
-  typedef typename AssignmentTraits::PacketType PacketType;
+  using DstEvaluatorType = DstEvaluatorTypeT;
+  using SrcEvaluatorType = SrcEvaluatorTypeT;
+  using Scalar = typename DstEvaluatorType::Scalar;
+  using AssignmentTraits = copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor>;
+  using PacketType = typename AssignmentTraits::PacketType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr generic_dense_assignment_kernel(DstEvaluatorType& dst,
                                                                                   const SrcEvaluatorType& src,
@@ -681,8 +696,8 @@ class generic_dense_assignment_kernel {
   EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_dstExpr.cols(); }
   EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_dstExpr.outerStride(); }
 
-  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() noexcept { return m_dst; }
-  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const noexcept { return m_src; }
+  EIGEN_DEVICE_FUNC constexpr DstEvaluatorType& dstEvaluator() noexcept { return m_dst; }
+  EIGEN_DEVICE_FUNC constexpr const SrcEvaluatorType& srcEvaluator() const noexcept { return m_src; }
 
   /// Assign src(row,col) to dst(row,col) through the assignment functor.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index row, Index col) {
@@ -690,7 +705,7 @@ class generic_dense_assignment_kernel {
   }
 
   /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index index) {
     m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
   }
 
@@ -741,7 +756,7 @@ class generic_dense_assignment_kernel {
   }
 
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
-    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    using Traits = typename DstEvaluatorType::ExpressionTraits;
     return int(Traits::RowsAtCompileTime) == 1          ? 0
            : int(Traits::ColsAtCompileTime) == 1        ? inner
            : int(DstEvaluatorType::Flags) & RowMajorBit ? outer
@@ -749,7 +764,7 @@ class generic_dense_assignment_kernel {
   }
 
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index colIndexByOuterInner(Index outer, Index inner) {
-    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    using Traits = typename DstEvaluatorType::ExpressionTraits;
     return int(Traits::ColsAtCompileTime) == 1          ? 0
            : int(Traits::RowsAtCompileTime) == 1        ? inner
            : int(DstEvaluatorType::Flags) & RowMajorBit ? inner
@@ -762,7 +777,7 @@ class generic_dense_assignment_kernel {
   DstEvaluatorType& m_dst;
   const SrcEvaluatorType& m_src;
   const Functor& m_functor;
-  // TODO find a way to avoid the needs of the original expression
+  // TODO: find a way to avoid the need for the original expression
   DstXprType& m_dstExpr;
 };
 
@@ -774,13 +789,13 @@ template <typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Funct
 class restricted_packet_dense_assignment_kernel
     : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> {
  protected:
-  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
+  using Base = generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>;
 
  public:
-  typedef typename Base::Scalar Scalar;
-  typedef typename Base::DstXprType DstXprType;
-  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
-  typedef typename AssignmentTraits::PacketType PacketType;
+  using Scalar = typename Base::Scalar;
+  using DstXprType = typename Base::DstXprType;
+  using AssignmentTraits = copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4>;
+  using PacketType = typename AssignmentTraits::PacketType;
 
   EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT& dst, const SrcEvaluatorTypeT& src,
                                                               const Functor& func, DstXprType& dstExpr)
@@ -804,15 +819,27 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize_if_allowed(DstXprTyp
                                                                        const internal::assign_op<T1, T2>& /*func*/) {
   Index dstRows = src.rows();
   Index dstCols = src.cols();
-  if (((dst.rows() != dstRows) || (dst.cols() != dstCols))) dst.resize(dstRows, dstCols);
-  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+  if (((dst.rows() != dstRows) || (dst.cols() != dstCols))) {
+#ifdef EIGEN_NO_AUTOMATIC_RESIZING
+    eigen_assert(
+        (dst.size() == 0 || (DstXprType::IsVectorAtCompileTime ? (dst.size() == src.size())
+                                                               : (dst.rows() == dstRows && dst.cols() == dstCols))) &&
+        "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+    if (dst.size() == 0) {
+      dst.resize(dstRows, dstCols);
+    }
+#else
+    dst.resize(dstRows, dstCols);
+    eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+#endif
+  }
 }
 
 template <typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
                                                                                 const Functor& func) {
-  typedef evaluator<DstXprType> DstEvaluatorType;
-  typedef evaluator<SrcXprType> SrcEvaluatorType;
+  using DstEvaluatorType = evaluator<DstXprType>;
+  using SrcEvaluatorType = evaluator<SrcXprType>;
 
   SrcEvaluatorType srcEvaluator(src);
 
@@ -822,14 +849,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(
 
   DstEvaluatorType dstEvaluator(dst);
 
-  typedef generic_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Functor> Kernel;
+  using Kernel = generic_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Functor>;
   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
 
   dense_assignment_loop<Kernel>::run(kernel);
 }
 
 template <typename DstXprType, typename SrcXprType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
   call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>());
 }
 
@@ -849,11 +876,11 @@ struct EigenBase2EigenBase {};
 
 template <typename, typename>
 struct AssignmentKind {
-  typedef EigenBase2EigenBase Kind;
+  using Kind = EigenBase2EigenBase;
 };
 template <>
 struct AssignmentKind<DenseShape, DenseShape> {
-  typedef Dense2Dense Kind;
+  using Kind = Dense2Dense;
 };
 
 // This is the main assignment class
@@ -908,11 +935,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Ds
                       int(Dst::SizeAtCompileTime) != 1
   };
 
-  typedef std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst> ActualDstTypeCleaned;
-  typedef std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&> ActualDstType;
+  using ActualDstTypeCleaned = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst>;
+  using ActualDstType = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&>;
   ActualDstType actualDst(dst);
 
-  // TODO check whether this is the right place to perform these checks:
+  // TODO: check whether this is the right place to perform these checks:
   EIGEN_STATIC_ASSERT_LVALUE(Dst)
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned, Src)
   EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename ActualDstTypeCleaned::Scalar, typename Src::Scalar);
@@ -923,9 +950,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Ds
 template <typename Dst, typename Src, typename Func>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src,
                                                                                       const Func& func) {
-  typedef evaluator<Dst> DstEvaluatorType;
-  typedef evaluator<Src> SrcEvaluatorType;
-  typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Func> Kernel;
+  using DstEvaluatorType = evaluator<Dst>;
+  using SrcEvaluatorType = evaluator<Src>;
+  using Kernel = restricted_packet_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Func>;
 
   EIGEN_STATIC_ASSERT_LVALUE(Dst)
   EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename Dst::Scalar, typename Src::Scalar);
@@ -947,7 +974,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Ds
 template <typename Dst, typename Src, typename Func>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src,
                                                                                            const Func& func) {
-  // TODO check whether this is the right place to perform these checks:
+  // TODO: check whether this is the right place to perform these checks:
   EIGEN_STATIC_ASSERT_LVALUE(Dst)
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src)
   EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename Dst::Scalar, typename Src::Scalar);
@@ -1007,7 +1034,7 @@ struct Assignment<DstXprType, CwiseNullaryOp<scalar_zero_op<typename DstXprType:
 };
 
 // Generic assignment through evalTo.
-// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
+// TODO: evaluate whether this generic evalTo-based assignment path is still needed.
 // Note that the last template argument "Weak" is needed to make it possible to perform
 // both partial specialization+SFINAE without ambiguous specialization
 template <typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
diff --git a/Eigen/src/Core/Assign_AOCL.h b/Eigen/src/Core/Assign_AOCL.h
new file mode 100644
index 00000000000..da3ef7cea3a
--- /dev/null
+++ b/Eigen/src/Core/Assign_AOCL.h
@@ -0,0 +1,301 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen
+ *
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Description:
+ * ------------
+ * This file implements a high-performance dispatch layer that automatically
+ * routes Eigen's element-wise mathematical operations to AMD Optimizing CPU
+ * Libraries (AOCL) Vector Math Library (VML) functions when beneficial for
+ * performance.
+ *
+ * The dispatch system uses C++ template specialization to intercept Eigen's
+ * assignment operations and redirect them to AOCL's VRDA functions, which
+ * provide optimized implementations for AMD Zen architectures.
+ *
+ * Key Features:
+ * -------------
+ * 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without
+ *    requiring code changes in user applications
+ *
+ * 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen
+ *    family processors with automatic SIMD instruction selection (AVX2, AVX-512)
+ *
+ * 3. Threshold-Based Activation: Only activates for vectors of at least
+ *    EIGEN_AOCL_VML_THRESHOLD elements (default: 128) to avoid overhead on
+ *    small vectors
+ *
+ * 4. Precision-Specific Handling:
+ *    - Double precision: AOCL VRDA vectorized functions
+ *    - Single precision: Scalar-fallback macros are defined but not instantiated
+ *
+ * 5. Memory Layout Compatibility: Ensures direct memory access and compatible
+ *    storage orders between source and destination for optimal performance
+ *
+ * Supported Operations:
+ * ---------------------
+ * UNARY OPERATIONS (vector → vector):
+ * - exp(), exp2(), sin(), cos(), sqrt(), cbrt(), abs(), log(), log10(), log2()
+ *
+ * BINARY OPERATIONS (vector op vector → vector):
+ * - +, pow(), max(), min()
+ *
+ * Template Specialization Mechanism:
+ * -----------------------------------
+ * The system works by specializing Eigen's Assignment template for:
+ * 1. CwiseUnaryOp with scalar_*_op functors (unary operations)
+ * 2. CwiseBinaryOp with scalar_*_op functors (binary operations)
+ * 3. Dense2Dense assignment context with AOCL-compatible traits
+ *
+ * Dispatch conditions (all must be true):
+ * - Source and destination have DirectAccessBit (raw pointer access via data())
+ * - Compatible storage orders (both row-major or both column-major)
+ * - Compile-time inner size ≥ EIGEN_AOCL_VML_THRESHOLD, or Dynamic
+ * - Supported data type (currently double precision for VRDA)
+ *
+ * Integration Example:
+ * --------------------
+ * // Standard Eigen code - no changes required
+ * VectorXd x = VectorXd::Random(10000);
+ * VectorXd y = VectorXd::Random(10000);
+ * VectorXd result;
+ *
+ * // These operations are automatically dispatched to AOCL:
+ * result = x.array().exp();              // → amd_vrda_exp()
+ * result = x.array().sin();              // → amd_vrda_sin()
+ * result = x.array() + y.array();        // → amd_vrda_add()
+ * result = x.array().pow(y.array());     // → amd_vrda_pow()
+ *
+ * Configuration:
+ * --------------
+ * Required preprocessor definitions:
+ * - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration
+ * - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch
+ *
+ * Compilation Requirements:
+ * -------------------------
+ * Include paths:
+ * - AOCL headers: -I${AOCL_ROOT}/include
+ * - Eigen headers: -I/path/to/eigen
+ *
+ * Link libraries:
+ * - AOCL MathLib: -lamdlibm
+ * - Standard math: -lm
+ *
+ * Compiler flags:
+ * - Optimization: -O3 (required for inlining)
+ * - Architecture: -march=znver5 or -march=native
+ * - Vectorization: -mfma -mavx512f (if supported)
+ *
+ * Platform Support:
+ * ------------------
+ * - Primary: Linux x86_64 with AMD Zen family processors
+ * - Compilers: GCC 8+, Clang 10+, AOCC (recommended)
+ * - AOCL Version: 4.0+ (with VRDA support)
+ *
+ * Error Handling:
+ * ---------------
+ * - Graceful fallback to scalar operations for unsupported configurations
+ * - Compile-time detection of AOCL availability
+ * - Runtime shape and size (≤ INT_MAX) validation with eigen_assert()
+ *
+ * Developer:
+ * ----------
+ * Name: Sharad Saurabh Bhaskar
+ * Email: shbhaska@amd.com
+ * Organization: Advanced Micro Devices, Inc.
+ */
+
+
+#ifndef EIGEN_ASSIGN_AOCL_H
+#define EIGEN_ASSIGN_AOCL_H
+
+namespace Eigen {
+namespace internal {
+
+// Unary traits: dispatch only when one flat data()/size() pass is valid for both Dst and Src.
+template <typename Dst, typename Src> class aocl_assign_traits {
+private:
+  enum {
+    DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
+    SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit),
+    StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
+    // DirectAccessBit alone admits strided blocks/maps; require linear, unit-stride storage.
+    Linearizable = !!(int(Dst::Flags) & int(Src::Flags) & LinearAccessBit) &&
+                   int(Dst::InnerStrideAtCompileTime) == 1 && int(Src::InnerStrideAtCompileTime) == 1,
+    InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
+                : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime) : int(Dst::RowsAtCompileTime),
+    LargeEnough = (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
+  };
+
+public:
+  enum {
+    EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess && StorageOrdersAgree && Linearizable && LargeEnough,
+    Traversal = LinearTraversal
+  };
+};
+
+// Binary traits (e.g., add, pow). DirectAccessBit permits strides, so also require linear unit-stride operands.
+template <typename Dst, typename Lhs, typename Rhs> class aocl_assign_binary_traits {
+private:
+  enum {
+    DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
+    LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit),
+    RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit),
+    StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) &&
+                         (int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)),
+    Linearizable = !!(int(Dst::Flags) & int(Lhs::Flags) & int(Rhs::Flags) & LinearAccessBit) &&
+                   int(Dst::InnerStrideAtCompileTime) == 1 && int(Lhs::InnerStrideAtCompileTime) == 1 &&
+                   int(Rhs::InnerStrideAtCompileTime) == 1,
+    InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
+                : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime) : int(Dst::RowsAtCompileTime),
+    LargeEnough = (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
+  };
+
+public:
+  enum {
+    EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess && RhsHasDirectAccess &&
+                    StorageOrdersAgree && Linearizable && LargeEnough
+  };
+};
+
+// Unary dispatch for float: element-wise std::EIGENOP loop over raw data (scalar fallback).
+#define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP)                               \
+  template <typename DstXprType, typename SrcXprNested>                        \
+  struct Assignment<                                                           \
+      DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>,    \
+      assign_op<float, float>, Dense2Dense,                                    \
+      std::enable_if_t<                                                        \
+          aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> {      \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>           \
+        SrcXprType;                                                            \
+    static void run(DstXprType &dst, const SrcXprType &src,                    \
+                    const assign_op<float, float> &) {                         \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());      \
+      Eigen::Index n = dst.size(); /* total coefficient count */               \
+      if (n <= 0)                                                              \
+        return; /* empty destination: nothing to do */                         \
+      const float *input =                                                     \
+          reinterpret_cast<const float *>(src.nestedExpression().data());      \
+      float *output = reinterpret_cast<float *>(dst.data());                   \
+      for (Eigen::Index i = 0; i < n; ++i) {                                   \
+        output[i] = std::EIGENOP(input[i]);                                    \
+      }                                                                        \
+    }                                                                          \
+  };
+
+// Unary dispatch for double: one AOCLOP(count, input, output) call over the raw data pointers.
+#define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP)                      \
+  template <typename DstXprType, typename SrcXprNested>                        \
+  struct Assignment<                                                           \
+      DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>,   \
+      assign_op<double, double>, Dense2Dense,                                  \
+      std::enable_if_t<                                                        \
+          aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> {      \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>          \
+        SrcXprType;                                                            \
+    static void run(DstXprType &dst, const SrcXprType &src,                    \
+                    const assign_op<double, double> &) {                       \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());      \
+      Eigen::Index n = dst.size(); /* total coefficient count */               \
+      eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
+      if (n <= 0)                                                              \
+        return; /* empty destination: nothing to do */                         \
+      const double *input =                                                    \
+          reinterpret_cast<const double *>(src.nestedExpression().data());     \
+      double *output = reinterpret_cast<double *>(dst.data());                 \
+      int aocl_n = internal::convert_index<int>(n); /* int count for AOCL */   \
+      AOCLOP(aocl_n, const_cast<double *>(input), output);                     \
+    }                                                                          \
+  };
+
+// Float (scalar) unary specializations are currently disabled:
+// EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp)
+
+// Double unary specializations: Eigen functor name -> AOCL routine.
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10)
+EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2)
+
+// Binary dispatch for float: element-wise STDFUNC(lhs, rhs) loop over raw data (scalar fallback).
+#define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC)                     \
+  template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
+  struct Assignment<                                                           \
+      DstXprType,                                                              \
+      CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested,         \
+                    RhsXprNested>,                                             \
+      assign_op<float, float>, Dense2Dense,                                    \
+      std::enable_if_t<aocl_assign_binary_traits<                              \
+          DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> {           \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested,   \
+                          RhsXprNested>                                        \
+        SrcXprType;                                                            \
+    static void run(DstXprType &dst, const SrcXprType &src,                    \
+                    const assign_op<float, float> &) {                         \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());      \
+      Eigen::Index n = dst.size(); /* total coefficient count */               \
+      if (n <= 0)                                                              \
+        return; /* empty destination: nothing to do */                         \
+      const float *lhs = reinterpret_cast<const float *>(src.lhs().data());    \
+      const float *rhs = reinterpret_cast<const float *>(src.rhs().data());    \
+      float *output = reinterpret_cast<float *>(dst.data());                   \
+      for (Eigen::Index i = 0; i < n; ++i) {                                   \
+        output[i] = STDFUNC(lhs[i], rhs[i]);                                   \
+      }                                                                        \
+    }                                                                          \
+  };
+
+// Binary dispatch for double: one AOCLOP(count, lhs, rhs, output) call over the raw data pointers.
+#define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP)                     \
+  template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
+  struct Assignment<                                                           \
+      DstXprType,                                                              \
+      CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested,       \
+                    RhsXprNested>,                                             \
+      assign_op<double, double>, Dense2Dense,                                  \
+      std::enable_if_t<aocl_assign_binary_traits<                              \
+          DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> {           \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
+                          RhsXprNested>                                        \
+        SrcXprType;                                                            \
+    static void run(DstXprType &dst, const SrcXprType &src,                    \
+                    const assign_op<double, double> &) {                       \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());      \
+      Eigen::Index n = dst.size(); /* total coefficient count */               \
+      eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
+      if (n <= 0)                                                              \
+        return; /* empty destination: nothing to do */                         \
+      const double *lhs = reinterpret_cast<const double *>(src.lhs().data());  \
+      const double *rhs = reinterpret_cast<const double *>(src.rhs().data());  \
+      double *output = reinterpret_cast<double *>(dst.data());                 \
+      int aocl_n = internal::convert_index<int>(n); /* int count for AOCL */   \
+      AOCLOP(aocl_n, const_cast<double *>(lhs), const_cast<double *>(rhs), output); \
+    }                                                                          \
+  };
+
+// Float (scalar) binary specializations are currently disabled:
+// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus<float>())  // scalar_sum_op
+// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow)
+
+// Double binary specializations: Eigen functor name -> AOCL routine.
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add) // Using scalar_sum_op for addition
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow)
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax)
+EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_ASSIGN_AOCL_H
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index ad112200e0f..7636445cb05 100644
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -56,11 +56,11 @@ class vml_assign_traits {
                                                    : int(Dst::MaxRowsAtCompileTime),
     MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
 
-    MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess &&
+    MightEnableVml = bool(StorageOrdersAgree) && bool(DstHasDirectAccess) && bool(SrcHasDirectAccess) &&
                      Src::InnerStrideAtCompileTime == 1 && Dst::InnerStrideAtCompileTime == 1,
-    MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
-    VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
-    LargeEnough = VmlSize == Dynamic || VmlSize >= EIGEN_MKL_VML_THRESHOLD
+    MightLinearize = bool(MightEnableVml) && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
+    VmlSize = bool(MightLinearize) ? MaxSizeAtCompileTime : InnerMaxSize,
+    LargeEnough = (VmlSize == Dynamic) || VmlSize >= EIGEN_MKL_VML_THRESHOLD
   };
 
  public:
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 39abff71873..7dcf909305e 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -121,14 +121,14 @@ class Block
 
   /** Column or Row constructor
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block(XprType& xpr, Index i) : Impl(xpr, i) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Block(XprType& xpr, Index i) : Impl(xpr, i) {
     eigen_assert((i >= 0) && (((BlockRows == 1) && (BlockCols == XprType::ColsAtCompileTime) && i < xpr.rows()) ||
                               ((BlockRows == XprType::RowsAtCompileTime) && (BlockCols == 1) && i < xpr.cols())));
   }
 
   /** Fixed-size constructor
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block(XprType& xpr, Index startRow, Index startCol)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Block(XprType& xpr, Index startRow, Index startCol)
       : Impl(xpr, startRow, startCol) {
     EIGEN_STATIC_ASSERT(RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic,
                         THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
@@ -138,8 +138,8 @@ class Block
 
   /** Dynamic-size constructor
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block(XprType& xpr, Index startRow, Index startCol, Index blockRows,
-                                              Index blockCols)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Block(XprType& xpr, Index startRow, Index startCol, Index blockRows,
+                                                        Index blockCols)
       : Impl(xpr, startRow, startCol, blockRows, blockCols) {
     eigen_assert((RowsAtCompileTime == Dynamic || RowsAtCompileTime == blockRows) &&
                  (ColsAtCompileTime == Dynamic || ColsAtCompileTime == blockCols));
@@ -175,11 +175,11 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
  public:
   typedef Impl Base;
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr, i) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr, i) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol)
       : Impl(xpr, startRow, startCol) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows,
-                                                  Index blockCols)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol,
+                                                            Index blockRows, Index blockCols)
       : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };
 
@@ -196,11 +196,9 @@ class BlockImpl_dense : public internal::dense_xpr_base<Block<XprType, BlockRows
   EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
 
-  // class InnerIterator; // FIXME apparently never used
-
   /** Column or Row constructor
    */
-  EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index i)
+  EIGEN_DEVICE_FUNC constexpr BlockImpl_dense(XprType& xpr, Index i)
       : m_xpr(xpr),
         // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
         // and it is a column if and only if BlockRows==XprType::RowsAtCompileTime and BlockCols==1,
@@ -213,17 +211,17 @@ class BlockImpl_dense : public internal::dense_xpr_base<Block<XprType, BlockRows
 
   /** Fixed-size constructor
    */
-  EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+  EIGEN_DEVICE_FUNC constexpr BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
       : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(BlockRows), m_blockCols(BlockCols) {}
 
   /** Dynamic-size constructor
    */
-  EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows,
-                                           Index blockCols)
+  EIGEN_DEVICE_FUNC constexpr BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows,
+                                              Index blockCols)
       : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(blockRows), m_blockCols(blockCols) {}
 
-  EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
-  EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_blockRows.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_blockCols.value(); }
 
   EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) {
     EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -289,9 +287,9 @@ class BlockImpl_dense : public internal::dense_xpr_base<Block<XprType, BlockRows
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
+  EIGEN_DEVICE_FUNC constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
+  EIGEN_DEVICE_FUNC constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
 
  protected:
   XprTypeNested m_xpr;
@@ -380,22 +378,21 @@ class BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel, true>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
 
   /** \sa MapBase::innerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index innerStride() const noexcept {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept {
     return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.innerStride() : m_xpr.outerStride();
   }
 
   /** \sa MapBase::outerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept {
     return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.outerStride() : m_xpr.innerStride();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
+  EIGEN_DEVICE_FUNC constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
+  EIGEN_DEVICE_FUNC constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
 
 #ifndef __SUNPRO_CC
-  // FIXME sunstudio is not friendly with the above friend...
-  // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
+  // Historical workaround for SunStudio's handling of the access specifier here.
  protected:
 #endif
 
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index c4141179013..4541f47a2bb 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -31,7 +31,7 @@ template <typename XprType>
 struct CommaInitializer {
   typedef typename XprType::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC inline CommaInitializer(XprType& xpr, const Scalar& s)
+  EIGEN_DEVICE_FUNC constexpr CommaInitializer(XprType& xpr, const Scalar& s)
       : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1) {
     eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0 && "Cannot comma-initialize a 0x0 matrix (operator<<)");
     m_xpr.coeffRef(0, 0) = s;
@@ -48,7 +48,6 @@ struct CommaInitializer {
 
   /* Copy/Move constructor which transfers ownership. This is crucial in
    * absence of return value optimization to avoid assertions during destruction. */
-  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
   EIGEN_DEVICE_FUNC inline CommaInitializer(const CommaInitializer& o)
       : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
     // Mark original object as finished. In absence of R-value references we need to const_cast:
diff --git a/Eigen/src/Core/ConcatOp.h b/Eigen/src/Core/ConcatOp.h
new file mode 100644
index 00000000000..c01f984cf0a
--- /dev/null
+++ b/Eigen/src/Core/ConcatOp.h
@@ -0,0 +1,343 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Pavel Guzenfeld
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONCAT_OP_H
+#define EIGEN_CONCAT_OP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <int Direction, typename LhsType, typename RhsType>
+struct traits<Concat<Direction, LhsType, RhsType>> : traits<LhsType> {
+  using Scalar = typename LhsType::Scalar;
+  using StorageKind = typename traits<LhsType>::StorageKind;
+  using XprKind = typename traits<LhsType>::XprKind;
+  using LhsTypeNested = typename ref_selector<LhsType>::type;
+  using RhsTypeNested = typename ref_selector<RhsType>::type;
+  using LhsTypeNested_ = std::remove_reference_t<LhsTypeNested>;
+  using RhsTypeNested_ = std::remove_reference_t<RhsTypeNested>;
+  enum {
+    // Vertical concatenation stacks rows: row counts are summed, column counts must agree.
+    // Horizontal concatenation stacks columns: column counts are summed, row counts must agree.
+    LhsRows = int(LhsType::RowsAtCompileTime),
+    RhsRows = int(RhsType::RowsAtCompileTime),
+    LhsCols = int(LhsType::ColsAtCompileTime),
+    RhsCols = int(RhsType::ColsAtCompileTime),
+
+    RowsAtCompileTime = Direction != Vertical ? size_prefer_fixed(LhsRows, RhsRows)
+                        : (LhsRows != Dynamic && RhsRows != Dynamic) ? LhsRows + RhsRows
+                                                                     : int(Dynamic),
+    ColsAtCompileTime = Direction != Horizontal ? size_prefer_fixed(LhsCols, RhsCols)
+                        : (LhsCols != Dynamic && RhsCols != Dynamic) ? LhsCols + RhsCols
+                                                                     : int(Dynamic),
+
+    LhsMaxRows = int(LhsType::MaxRowsAtCompileTime),
+    RhsMaxRows = int(RhsType::MaxRowsAtCompileTime),
+    LhsMaxCols = int(LhsType::MaxColsAtCompileTime),
+    RhsMaxCols = int(RhsType::MaxColsAtCompileTime),
+
+    MaxRowsAtCompileTime =
+        Direction != Vertical ? max_size_prefer_dynamic(LhsMaxRows, RhsMaxRows)
+        : (LhsMaxRows != Dynamic && RhsMaxRows != Dynamic) ? LhsMaxRows + RhsMaxRows
+                                                           : int(Dynamic),
+    MaxColsAtCompileTime =
+        Direction != Horizontal ? max_size_prefer_dynamic(LhsMaxCols, RhsMaxCols)
+        : (LhsMaxCols != Dynamic && RhsMaxCols != Dynamic) ? LhsMaxCols + RhsMaxCols
+                                                           : int(Dynamic),
+    // A single-row or single-column shape fixes the storage order; otherwise the lhs flags decide.
+    IsRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1)   ? 1
+                 : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1) ? 0
+                 : (int(LhsType::Flags) & RowMajorBit) != 0                 ? 1
+                                                                            : 0,
+    Flags = IsRowMajor ? RowMajorBit : 0
+  };
+};
+
+}  // namespace internal
+
+/**
+ * \class Concat
+ * \ingroup Core_Module
+ *
+ * \brief Expression of the concatenation of two dense expressions
+ *
+ * \tparam Direction either \c Vertical or \c Horizontal
+ * \tparam LhsType the type of the left-hand side expression
+ * \tparam RhsType the type of the right-hand side expression
+ *
+ * This class represents an expression of the concatenation of two dense expressions,
+ * either vertically (stacking rows) or horizontally (stacking columns).
+ *
+ * It is the return type of hcat() and vcat() and typically this is the only way it is used.
+ *
+ * \sa hcat(), vcat()
+ */
+template <int Direction, typename LhsType, typename RhsType>
+class Concat : public internal::dense_xpr_base<Concat<Direction, LhsType, RhsType>>::type {
+  using LhsTypeNested = typename internal::traits<Concat>::LhsTypeNested;
+  using RhsTypeNested = typename internal::traits<Concat>::RhsTypeNested;
+  using LhsTypeNested_ = typename internal::traits<Concat>::LhsTypeNested_;
+  using RhsTypeNested_ = typename internal::traits<Concat>::RhsTypeNested_;
+
+ public:
+  using Base = typename internal::dense_xpr_base<Concat>::type;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Concat)
+  using LhsNestedExpression = internal::remove_all_t<LhsType>;
+  using RhsNestedExpression = internal::remove_all_t<RhsType>;
+  // Stores both operands; rejects mismatched scalar types, expression kinds and sizes.
+  template <typename OriginalLhsType, typename OriginalRhsType>
+  EIGEN_DEVICE_FUNC constexpr Concat(const OriginalLhsType& lhs, const OriginalRhsType& rhs)
+      : m_lhs(lhs), m_rhs(rhs) {
+    EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<LhsType>, OriginalLhsType>::value),
+                        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+    EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<RhsType>, OriginalRhsType>::value),
+                        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+    EIGEN_STATIC_ASSERT(
+        (internal::is_same<typename LhsType::Scalar, typename RhsType::Scalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(LhsType, RhsType)
+    EIGEN_STATIC_ASSERT(Direction != Horizontal || int(LhsType::RowsAtCompileTime) == Dynamic ||
+                            int(RhsType::RowsAtCompileTime) == Dynamic ||
+                            int(LhsType::RowsAtCompileTime) == int(RhsType::RowsAtCompileTime),
+                        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
+    EIGEN_STATIC_ASSERT(Direction != Vertical || int(LhsType::ColsAtCompileTime) == Dynamic ||
+                            int(RhsType::ColsAtCompileTime) == Dynamic ||
+                            int(LhsType::ColsAtCompileTime) == int(RhsType::ColsAtCompileTime),
+                        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
+    if (Direction != Vertical) {
+      eigen_assert(lhs.rows() == rhs.rows() && "hcat: number of rows must match");
+    } else {
+      eigen_assert(lhs.cols() == rhs.cols() && "vcat: number of columns must match");
+    }
+  }
+  // Extents: summed along the concatenation direction, taken from the lhs along the other one.
+  EIGEN_DEVICE_FUNC constexpr Index rows() const {
+    return m_lhs.rows() + (Direction == Vertical ? m_rhs.rows() : Index(0));
+  }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const {
+    return m_lhs.cols() + (Direction == Horizontal ? m_rhs.cols() : Index(0));
+  }
+  // Read-only access to the two nested operands.
+  EIGEN_DEVICE_FUNC constexpr const LhsTypeNested_& lhs() const { return m_lhs; }
+  EIGEN_DEVICE_FUNC constexpr const RhsTypeNested_& rhs() const { return m_rhs; }
+
+ protected:
+  LhsTypeNested m_lhs;
+  RhsTypeNested m_rhs;
+};
+
+// Evaluator for Concat
+namespace internal {
+
+template <int Direction, typename LhsType, typename RhsType>
+struct evaluator<Concat<Direction, LhsType, RhsType>> : evaluator_base<Concat<Direction, LhsType, RhsType>> {
+  using XprType = Concat<Direction, LhsType, RhsType>;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
+  // Every access is routed to the lhs evaluator before the boundary and to the rhs evaluator after it.
+  using LhsNested = typename nested_eval<LhsType, 1>::type;
+  using RhsNested = typename nested_eval<RhsType, 1>::type;
+  using LhsNestedCleaned = remove_all_t<LhsNested>;
+  using RhsNestedCleaned = remove_all_t<RhsNested>;
+
+  enum {
+    CoeffReadCost = plain_enum_max(evaluator<LhsNestedCleaned>::CoeffReadCost,
+                                   evaluator<RhsNestedCleaned>::CoeffReadCost) +
+                    NumTraits<typename XprType::Scalar>::AddCost,  // cost of the branch
+    LhsFlags = evaluator<LhsNestedCleaned>::Flags,
+    RhsFlags = evaluator<RhsNestedCleaned>::Flags,
+    BothHavePacketAccess = (int(LhsFlags) & PacketAccessBit) && (int(RhsFlags) & PacketAccessBit),
+    BothHaveLinearAccess = (int(LhsFlags) & LinearAccessBit) && (int(RhsFlags) & LinearAccessBit),
+    IsRowMajor = int(traits<XprType>::Flags) & RowMajorBit,
+    IsVectorAtCompileTime = XprType::IsVectorAtCompileTime,
+    Flags = (traits<XprType>::Flags & RowMajorBit) | (BothHavePacketAccess ? PacketAccessBit : 0) |
+            (IsVectorAtCompileTime && BothHaveLinearAccess ? LinearAccessBit : 0),
+    Alignment = 0  // conservative: no alignment guarantees across boundary
+  };
+  // Builds the operand evaluators and records the lhs extents that locate the boundary.
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+      : m_lhs(xpr.lhs()),
+        m_rhs(xpr.rhs()),
+        m_lhsImpl(m_lhs),
+        m_rhsImpl(m_rhs),
+        m_lhsRows(xpr.lhs().rows()),
+        m_lhsCols(xpr.lhs().cols()) {}
+  // Coefficient at (row, col), selected by the coordinate along the concatenation direction.
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    if (Direction == Vertical) {
+      if (row < m_lhsRows.value())
+        return m_lhsImpl.coeff(row, col);
+      else
+        return m_rhsImpl.coeff(row - m_lhsRows.value(), col);
+    } else {
+      if (col < m_lhsCols.value())
+        return m_lhsImpl.coeff(row, col);
+      else
+        return m_rhsImpl.coeff(row, col - m_lhsCols.value());
+    }
+  }
+  // Linear access (LinearAccessBit is only set for compile-time vectors): the rhs follows all lhs coefficients.
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    const Index boundary = m_lhsRows.value() * m_lhsCols.value();  // lhs size, valid even for empty operands
+    if (index < boundary)
+      return m_lhsImpl.coeff(index);
+    else
+      return m_rhsImpl.coeff(index - boundary);
+  }
+  // Packet starting at (row, col); packets straddling the boundary are gathered coefficient-wise.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    constexpr int packetSize = unpacket_traits<PacketType>::size;
+    if (Direction == Vertical) {
+      const Index boundary = m_lhsRows.value();
+      if (row >= boundary) return m_rhsImpl.template packet<LoadMode, PacketType>(row - boundary, col);
+      // Column-major: inner=rows, packet extends along rows and may straddle the row boundary.
+      // Row-major: inner=cols, packet extends along cols — never crosses the row boundary.
+      if (!IsRowMajor && row + packetSize > boundary) return packetBoundary<LoadMode, PacketType>(row, col);
+      return m_lhsImpl.template packet<LoadMode, PacketType>(row, col);
+    } else {
+      const Index boundary = m_lhsCols.value();
+      if (col >= boundary) return m_rhsImpl.template packet<LoadMode, PacketType>(row, col - boundary);
+      // Row-major: inner=cols, packet extends along cols and may straddle the col boundary.
+      // Column-major: inner=rows, packet extends along rows — never crosses the col boundary.
+      if (IsRowMajor && col + packetSize > boundary) return packetBoundary<LoadMode, PacketType>(row, col);
+      return m_lhsImpl.template packet<LoadMode, PacketType>(row, col);
+    }
+  }
+  // Linear packet access, split at the number of lhs coefficients.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    constexpr int packetSize = unpacket_traits<PacketType>::size;
+    const Index boundary = m_lhsRows.value() * m_lhsCols.value();  // lhs size, valid even for empty operands
+    if (index >= boundary) return m_rhsImpl.template packet<LoadMode, PacketType>(index - boundary);
+    if (index + packetSize > boundary) return packetBoundaryLinear<LoadMode, PacketType>(index);
+    return m_lhsImpl.template packet<LoadMode, PacketType>(index);
+  }
+  // Partial packet covering lanes [begin, begin + count) of the packet anchored at (row, col).
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    if (Direction == Vertical) {
+      const Index boundary = m_lhsRows.value();
+      if (row >= boundary)
+        return m_rhsImpl.template packetSegment<LoadMode, PacketType>(row - boundary, col, begin, count);
+      if (!IsRowMajor && row + begin + count > boundary)
+        return packetSegmentBoundary<LoadMode, PacketType>(row, col, begin, count);
+      return m_lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+    } else {
+      const Index boundary = m_lhsCols.value();
+      if (col >= boundary)
+        return m_rhsImpl.template packetSegment<LoadMode, PacketType>(row, col - boundary, begin, count);
+      if (IsRowMajor && col + begin + count > boundary)
+        return packetSegmentBoundary<LoadMode, PacketType>(row, col, begin, count);
+      return m_lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+    }
+  }
+  // Linear partial packet, split at the number of lhs coefficients.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index boundary = m_lhsRows.value() * m_lhsCols.value();  // lhs size, valid even for empty operands
+    if (index >= boundary)
+      return m_rhsImpl.template packetSegment<LoadMode, PacketType>(index - boundary, begin, count);
+    if (index + begin + count > boundary) return packetSegmentBoundaryLinear<LoadMode, PacketType>(index, begin, count);
+    return m_lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+  }
+
+ protected:
+  using Scalar = typename XprType::Scalar;
+  // Gathers a full packet that crosses the boundary, one coeff(row, col) call per lane.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetBoundary(Index row, Index col) const {
+    constexpr int packetSize = unpacket_traits<PacketType>::size;
+    EIGEN_ALIGN_MAX Scalar tmp[packetSize];
+    for (int i = 0; i < packetSize; ++i)
+      tmp[i] = coeff(row + (Direction == Vertical ? i : 0), col + (Direction == Horizontal ? i : 0));
+    return pload<PacketType>(tmp);
+  }
+  // Linear counterpart of packetBoundary(): one coeff(index) call per lane.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetBoundaryLinear(Index index) const {
+    constexpr int packetSize = unpacket_traits<PacketType>::size;
+    EIGEN_ALIGN_MAX Scalar tmp[packetSize];
+    for (int i = 0; i < packetSize; ++i) tmp[i] = coeff(index + i);
+    return pload<PacketType>(tmp);
+  }
+  // Gathers only lanes [begin, begin + count); the remaining entries of tmp are never written.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentBoundary(Index row, Index col, Index begin,
+                                                                         Index count) const {
+    constexpr int packetSize = unpacket_traits<PacketType>::size;
+    EIGEN_ALIGN_MAX Scalar tmp[packetSize];
+    for (Index i = begin; i < begin + count; ++i)
+      tmp[i] = coeff(row + (Direction == Vertical ? i : 0), col + (Direction == Horizontal ? i : 0));
+    return ploadSegment<PacketType>(tmp, begin, count);
+  }
+  // Linear counterpart of packetSegmentBoundary().
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentBoundaryLinear(Index index, Index begin,
+                                                                               Index count) const {
+    constexpr int packetSize = unpacket_traits<PacketType>::size;
+    EIGEN_ALIGN_MAX Scalar tmp[packetSize];
+    for (Index i = begin; i < begin + count; ++i) tmp[i] = coeff(index + i);
+    return ploadSegment<PacketType>(tmp, begin, count);
+  }
+
+  LhsNested m_lhs;
+  RhsNested m_rhs;
+  evaluator<LhsNestedCleaned> m_lhsImpl;
+  evaluator<RhsNestedCleaned> m_rhsImpl;
+  const variable_if_dynamic<Index, LhsType::RowsAtCompileTime> m_lhsRows;
+  const variable_if_dynamic<Index, LhsType::ColsAtCompileTime> m_lhsCols;
+};
+
+}  // namespace internal
+
+/**
+ * \relates Concat
+ * \returns an expression of \a lhs and \a rhs concatenated horizontally (side by side).
+ *
+ * Both arguments must have the same scalar type and the same number of rows (checked by assertions).
+ * To concatenate more than two expressions, chain calls: \c hcat(hcat(a, b), c).
+ *
+ * Example: \code
+ * Matrix2d A, B;
+ * auto C = hcat(A, B);  // C is 2x4
+ * \endcode
+ *
+ * \sa vcat(), Concat
+ */
+template <typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC inline const Concat<Horizontal, Lhs, Rhs> hcat(const DenseBase<Lhs>& lhs, const DenseBase<Rhs>& rhs) {
+  return Concat<Horizontal, Lhs, Rhs>(lhs.derived(), rhs.derived());
+}
+
+/**
+ * \relates Concat
+ * \returns an expression of \a lhs and \a rhs concatenated vertically (stacked on top of each other).
+ *
+ * Both arguments must have the same scalar type and the same number of columns (checked by assertions).
+ * To concatenate more than two expressions, chain calls: \c vcat(vcat(a, b), c).
+ *
+ * Example: \code
+ * Matrix2d A, B;
+ * auto C = vcat(A, B);  // C is 4x2
+ * \endcode
+ *
+ * \sa hcat(), Concat
+ */
+template <typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC inline const Concat<Vertical, Lhs, Rhs> vcat(const DenseBase<Lhs>& lhs, const DenseBase<Rhs>& rhs) {
+  return Concat<Vertical, Lhs, Rhs>(lhs.derived(), rhs.derived());
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CONCAT_OP_H
diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h
index dd1770b1abc..df27be3bc6b 100644
--- a/Eigen/src/Core/ConditionEstimator.h
+++ b/Eigen/src/Core/ConditionEstimator.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
+// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@gmail.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -40,18 +40,17 @@ struct rcond_compute_sign<Vector, Vector, false> {
  * \a matrix that implements .solve() and .adjoint().solve() methods.
  *
  * This function implements Algorithms 4.1 and 5.1 from
- *   http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
- * which also forms the basis for the condition number estimators in
- * LAPACK. Since at most 10 calls to the solve method of dec are
- * performed, the total cost is O(dims^2), as opposed to O(dims^3)
- * needed to compute the inverse matrix explicitly.
+ *   Higham, "Experience with a Matrix Norm Estimator",
+ *   SIAM J. Sci. Stat. Comput., 11(4):804-809, 1990.
+ * with Higham's alternating-sign safety-net estimate from
+ *   Higham and Tisseur, "A Block Algorithm for Matrix 1-Norm Estimation,
+ *   with an Application to 1-Norm Pseudospectra", SIAM J. Matrix Anal. Appl.,
+ *   21(4):1185-1201, 2000.
  *
- * The most common usage is in estimating the condition number
- * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
- * computed directly in O(n^2) operations.
+ * The Hager/Higham gradient ascent runs at most 4 iterations of 2 solves each (plus the
+ * solves outside the loop), giving a total cost of O(n^2).
  *
- * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
- * LLT.
+ * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, LLT.
  *
  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
  */
@@ -66,7 +65,7 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
 
   eigen_assert(dec.rows() == dec.cols());
   const Index n = dec.rows();
-  if (n == 0) return 0;
+  if (n == 0) return RealScalar(0);
 
     // Disable Index to float conversion warning
 #ifdef __INTEL_COMPILER
@@ -80,14 +79,12 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
 
   // lower_bound is a lower bound on
   //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
-  // and is the objective maximized by the ("super-") gradient ascent
-  // algorithm below.
+  // and is the objective maximized by the supergradient ascent algorithm below.
   RealScalar lower_bound = v.template lpNorm<1>();
   if (n == 1) return lower_bound;
 
-  // Gradient ascent algorithm follows: We know that the optimum is achieved at
-  // one of the simplices v = e_i, so in each iteration we follow a
-  // super-gradient to move towards the optimal one.
+  // Gradient ascent: the optimum is achieved at a unit vector e_j. Each
+  // iteration follows the supergradient to find which unit vector to probe next.
   RealScalar old_lower_bound = lower_bound;
   Vector sign_vector(n);
   Vector old_sign_vector;
@@ -96,21 +93,21 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
   for (int k = 0; k < 4; ++k) {
     sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
     if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
-      // Break if the solution stagnated.
+      // Break if the sign vector stagnated.
       break;
     }
-    // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
+    // Supergradient: z = A^{-T} * sign(v), pick argmax |z_i|.
     v = dec.adjoint().solve(sign_vector);
     v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
     if (v_max_abs_index == old_v_max_abs_index) {
-      // Break if the solution stagnated.
+      // Optimality: supergradient points to the same unit vector.
       break;
     }
-    // Move to the new simplex e_j, where j = v_max_abs_index.
-    v = dec.solve(Vector::Unit(n, v_max_abs_index));  // v = inv(matrix) * e_j.
+    // Probe the best unit vector: v = A^{-1} * e_j.
+    v = dec.solve(Vector::Unit(n, v_max_abs_index));
     lower_bound = v.template lpNorm<1>();
     if (lower_bound <= old_lower_bound) {
-      // Break if the gradient step did not increase the lower_bound.
+      // No improvement from the gradient step.
       break;
     }
     if (!is_complex) {
@@ -119,25 +116,19 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
     old_v_max_abs_index = v_max_abs_index;
     old_lower_bound = lower_bound;
   }
-  // The following calculates an independent estimate of ||matrix||_1 by
-  // multiplying matrix by a vector with entries of slowly increasing
-  // magnitude and alternating sign:
-  //   v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
-  // This improvement to Hager's algorithm above is due to Higham. It was
-  // added to make the algorithm more robust in certain corner cases where
-  // large elements in the matrix might otherwise escape detection due to
-  // exact cancellation (especially when op and op_adjoint correspond to a
-  // sequence of backsubstitutions and permutations), which could cause
-  // Hager's algorithm to vastly underestimate ||matrix||_1.
+  // Higham's alternating-sign estimate: an independent safety-net that catches
+  // cases where the gradient ascent converges to a local maximum due to exact
+  // cancellation patterns (especially with permutations and backsubstitutions).
+  //   v_i = (-1)^i * (1 + i/(n-1)), then estimate = 2*||A^{-1}*v||_1 / (3*n).
   Scalar alternating_sign(RealScalar(1));
   for (Index i = 0; i < n; ++i) {
-    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
+    // The static_cast is needed when Scalar is complex and RealScalar uses expression templates.
     v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
     alternating_sign = -alternating_sign;
   }
   v = dec.solve(v);
-  const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
-  return numext::maxi(lower_bound, alternate_lower_bound);
+  const RealScalar alt_est = (RealScalar(2) * v.template lpNorm<1>()) / (RealScalar(3) * RealScalar(n));
+  return numext::maxi(lower_bound, alt_est);
 }
 
 /** \brief Reciprocal condition number estimator.
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 63f1895d2ab..ef1642c54ed 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -23,7 +23,7 @@ namespace internal {
 // Default assumes index based accessors
 template <typename StorageKind>
 struct storage_kind_to_evaluator_kind {
-  typedef IndexBased Kind;
+  using Kind = IndexBased;
 };
 
 // This class returns the evaluator shape from the expression storage kind.
@@ -33,19 +33,19 @@ struct storage_kind_to_shape;
 
 template <>
 struct storage_kind_to_shape<Dense> {
-  typedef DenseShape Shape;
+  using Shape = DenseShape;
 };
 template <>
 struct storage_kind_to_shape<SolverStorage> {
-  typedef SolverShape Shape;
+  using Shape = SolverShape;
 };
 template <>
 struct storage_kind_to_shape<PermutationStorage> {
-  typedef PermutationShape Shape;
+  using Shape = PermutationShape;
 };
 template <>
 struct storage_kind_to_shape<TranspositionsStorage> {
-  typedef TranspositionsShape Shape;
+  using Shape = TranspositionsShape;
 };
 
 // Evaluators have to be specialized with respect to various criteria such as:
@@ -86,8 +86,8 @@ struct unary_evaluator;
 template <typename T>
 struct evaluator_traits_base {
   // by default, get evaluator kind and shape from storage
-  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
-  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
+  using Kind = typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind;
+  using Shape = typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape;
 };
 
 // Default evaluator traits
@@ -95,40 +95,36 @@ template <typename T>
 struct evaluator_traits : public evaluator_traits_base<T> {};
 
 template <typename T, typename Shape = typename evaluator_traits<T>::Shape>
-struct evaluator_assume_aliasing {
-  static const bool value = false;
-};
+struct evaluator_assume_aliasing : std::false_type {};
 
 // By default, we assume a unary expression:
 template <typename T>
 struct evaluator : public unary_evaluator<T> {
-  typedef unary_evaluator<T> Base;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : Base(xpr) {}
+  using Base = unary_evaluator<T>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : Base(xpr) {}
 };
 
 // TODO: Think about const-correctness
 template <typename T>
 struct evaluator<const T> : evaluator<T> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
 };
 
 // ---------- base class for all evaluators ----------
 
 template <typename ExpressionType>
 struct evaluator_base {
-  // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle
+  // TODO: find a way to avoid propagating all these traits. They are currently only needed to handle
   // outer,inner indices.
-  typedef traits<ExpressionType> ExpressionTraits;
+  using ExpressionTraits = traits<ExpressionType>;
 
   enum { Alignment = 0 };
-  // noncopyable:
-  // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
-  // and make complex evaluator much larger than then should do.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator_base() = default;
+  // Spell out deleted copy operations instead of inheriting from an empty helper:
+  // an extra base can kill EBO and make complex evaluators larger than they should be.
+  EIGEN_DEVICE_FUNC constexpr evaluator_base() = default;
 
- private:
-  EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
-  EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&);
+  evaluator_base(const evaluator_base&) = delete;
+  evaluator_base& operator=(const evaluator_base&) = delete;
 };
 
 // -------------------- Matrix and Array --------------------
@@ -142,23 +138,22 @@ struct evaluator_base {
 template <typename Scalar, int OuterStride>
 class plainobjectbase_evaluator_data {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
-      : data(ptr) {
+  EIGEN_DEVICE_FUNC constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) {
 #ifndef EIGEN_INTERNAL_DEBUGGING
     EIGEN_UNUSED_VARIABLE(outerStride);
 #endif
     eigen_internal_assert(outerStride == OuterStride);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept { return OuterStride; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return OuterStride; }
   const Scalar* data;
 };
 
 template <typename Scalar>
 class plainobjectbase_evaluator_data<Scalar, Dynamic> {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+  EIGEN_DEVICE_FUNC constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
       : data(ptr), m_outerStride(outerStride) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const { return m_outerStride; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return m_outerStride; }
   const Scalar* data;
 
  protected:
@@ -167,9 +162,9 @@ class plainobjectbase_evaluator_data<Scalar, Dynamic> {
 
 template <typename Derived>
 struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
-  typedef PlainObjectBase<Derived> PlainObjectType;
-  typedef typename PlainObjectType::Scalar Scalar;
-  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
+  using PlainObjectType = PlainObjectBase<Derived>;
+  using Scalar = typename PlainObjectType::Scalar;
+  using CoeffReturnType = typename PlainObjectType::CoeffReturnType;
 
   enum {
     IsRowMajor = PlainObjectType::IsRowMajor,
@@ -188,11 +183,11 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
                                                      : RowsAtCompileTime
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() : m_d(0, OuterStrideAtCompileTime) {
+  EIGEN_DEVICE_FUNC constexpr evaluator() : m_d(0, OuterStrideAtCompileTime) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const PlainObjectType& m)
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const PlainObjectType& m)
       : m_d(m.data(), IsVectorAtCompileTime ? 0 : m.outerStride()) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -257,7 +252,7 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
   plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d;
 
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index constexpr getIndex(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr Index getIndex(Index row, Index col) const {
     return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride();
   }
 };
@@ -265,30 +260,28 @@ struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
 struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
     : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
-  typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+  using XprType = Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
+  EIGEN_DEVICE_FUNC constexpr evaluator() = default;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
-      : evaluator<PlainObjectBase<XprType>>(m) {}
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {}
 };
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
 struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
     : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
-  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+  using XprType = Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
+  EIGEN_DEVICE_FUNC constexpr evaluator() = default;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
-      : evaluator<PlainObjectBase<XprType>>(m) {}
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const XprType& m) : evaluator<PlainObjectBase<XprType>>(m) {}
 };
 
 // -------------------- Transpose --------------------
 
 template <typename ArgType>
 struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpose<ArgType>> {
-  typedef Transpose<ArgType> XprType;
+  using XprType = Transpose<ArgType>;
 
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
@@ -296,20 +289,25 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpos
     Alignment = evaluator<ArgType>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& t)
+      : m_argImpl(t.nestedExpression()) {}
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using Scalar = typename XprType::Scalar;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_argImpl.coeff(col, row);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_argImpl.coeff(index);
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(col, row); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    return m_argImpl.coeffRef(col, row);
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename XprType::Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename XprType::Scalar& coeffRef(Index index) {
     return m_argImpl.coeffRef(index);
   }
 
@@ -368,11 +366,12 @@ template <typename Scalar, typename NullaryOp, bool has_nullary = has_nullary_op
           bool has_binary = has_binary_operator<NullaryOp>::value>
 struct nullary_wrapper {
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i,
+                                                                    IndexType j) const {
     return op(i, j);
   }
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
     return op(i);
   }
 
@@ -389,7 +388,8 @@ struct nullary_wrapper {
 template <typename Scalar, typename NullaryOp>
 struct nullary_wrapper<Scalar, NullaryOp, true, false, false> {
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType = 0, IndexType = 0) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType = 0,
+                                                                    IndexType = 0) const {
     return op();
   }
   template <typename T, typename IndexType>
@@ -401,7 +401,8 @@ struct nullary_wrapper<Scalar, NullaryOp, true, false, false> {
 template <typename Scalar, typename NullaryOp>
 struct nullary_wrapper<Scalar, NullaryOp, false, false, true> {
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j = 0) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i,
+                                                                    IndexType j = 0) const {
     return op(i, j);
   }
   template <typename T, typename IndexType>
@@ -416,7 +417,8 @@ struct nullary_wrapper<Scalar, NullaryOp, false, false, true> {
 template <typename Scalar, typename NullaryOp>
 struct nullary_wrapper<Scalar, NullaryOp, false, true, false> {
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i,
+                                                                    IndexType j) const {
     eigen_assert(i == 0 || j == 0);
     return op(i + j);
   }
@@ -427,7 +429,7 @@ struct nullary_wrapper<Scalar, NullaryOp, false, true, false> {
   }
 
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
     return op(i);
   }
   template <typename T, typename IndexType>
@@ -439,69 +441,11 @@ struct nullary_wrapper<Scalar, NullaryOp, false, true, false> {
 template <typename Scalar, typename NullaryOp>
 struct nullary_wrapper<Scalar, NullaryOp, false, false, false> {};
 
-#if 0 && EIGEN_COMP_MSVC > 0
-// Disable this ugly workaround. This is now handled in traits<Ref>::match,
-// but this piece of code might still become handly if some other weird compilation
-// errors pop up again.
-
-// MSVC exhibits a weird compilation error when
-// compiling:
-//    Eigen::MatrixXf A = MatrixXf::Random(3,3);
-//    Ref<const MatrixXf> R = 2.f*A;
-// and that has_*ary_operator<scalar_constant_op<float>> have not been instantiated yet.
-// The "problem" is that evaluator<2.f*A> is instantiated by traits<Ref>::match<2.f*A>
-// and at that time has_*ary_operator<T> returns true regardless of T.
-// Then nullary_wrapper is badly instantiated as nullary_wrapper<.,.,true,true,true>.
-// The trick is thus to defer the proper instantiation of nullary_wrapper when coeff(),
-// and packet() are really instantiated as implemented below:
-
-// This is a simple wrapper around Index to enforce the re-instantiation of
-// has_*ary_operator when needed.
-template<typename T> struct nullary_wrapper_workaround_msvc {
-  nullary_wrapper_workaround_msvc(const T&);
-  operator T()const;
-};
-
-template<typename Scalar,typename NullaryOp>
-struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
-{
-  template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
-    return nullary_wrapper<Scalar,NullaryOp,
-    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i,j);
-  }
-  template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
-    return nullary_wrapper<Scalar,NullaryOp,
-    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i);
-  }
-
-  template <typename T, typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
-    return nullary_wrapper<Scalar,NullaryOp,
-    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i,j);
-  }
-  template <typename T, typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
-    return nullary_wrapper<Scalar,NullaryOp,
-    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
-    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i);
-  }
-};
-#endif  // MSVC workaround
-
 template <typename NullaryOp, typename PlainObjectType>
 struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
     : evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> {
-  typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
-  typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
+  using XprType = CwiseNullaryOp<NullaryOp, PlainObjectType>;
+  using PlainObjectTypeCleaned = remove_all_t<PlainObjectType>;
 
   enum {
     CoeffReadCost = functor_traits<NullaryOp>::Cost,
@@ -513,19 +457,19 @@ struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
     Alignment = AlignedMax
   };
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n) : m_functor(n.functor()), m_wrapper() {
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const XprType& n) : m_functor(n.functor()), m_wrapper() {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(IndexType row, IndexType col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(IndexType row, IndexType col) const {
     return m_wrapper(m_functor, row, col);
   }
 
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(IndexType index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(IndexType index) const {
     return m_wrapper(m_functor, index);
   }
 
@@ -560,7 +504,7 @@ struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
 
 template <typename UnaryOp, typename ArgType>
 struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType>> {
-  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
+  using XprType = CwiseUnaryOp<UnaryOp, ArgType>;
 
   enum {
     CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
@@ -570,18 +514,18 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_b
     Alignment = evaluator<ArgType>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& op) : m_d(op) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& op) : m_d(op) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_d.func()(m_d.argImpl.coeff(row, col));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_d.func()(m_d.argImpl.coeff(index));
   }
 
@@ -608,9 +552,9 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_b
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Data(const XprType& xpr)
         : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& func() const { return op; }
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const UnaryOp& func() const { return op; }
     UnaryOp op;
     evaluator<ArgType> argImpl;
   };
@@ -639,7 +583,7 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
     Alignment = evaluator<ArgType>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& xpr)
       : m_argImpl(xpr.nestedExpression()), m_rows(xpr.rows()), m_cols(xpr.cols()) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<CastOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -671,15 +615,15 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
     Index actualCol = IsRowMajor ? col + offset : col;
     return m_argImpl.coeff(actualRow, actualCol);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index index, Index offset) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE SrcType srcCoeff(Index index, Index offset) const {
     Index actualIndex = index + offset;
     return m_argImpl.coeff(actualIndex);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE DstType coeff(Index row, Index col) const {
     return cast<SrcType, DstType>(srcCoeff(row, col, 0));
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE DstType coeff(Index index) const {
     return cast<SrcType, DstType>(srcCoeff(index, 0));
   }
 
@@ -707,7 +651,7 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
     Index packetOffset = offset * PacketSize;
     Index actualRow = IsRowMajor ? row : row + packetOffset;
     Index actualCol = IsRowMajor ? col + packetOffset : col;
-    eigen_assert(check_array_bounds(actualRow, actualCol, 0, count) && "Array index out of bounds");
+    eigen_assert(check_array_bounds(actualRow, actualCol, begin, count) && "Array index out of bounds");
     return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count);
   }
   template <int LoadMode, typename PacketType = SrcPacketType>
@@ -715,8 +659,8 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
                                                                     Index offset) const {
     constexpr int PacketSize = unpacket_traits<PacketType>::size;
     Index packetOffset = offset * PacketSize;
-    Index actualIndex = index + packetOffset + begin;
-    eigen_assert(check_array_bounds(actualIndex, 0, count) && "Array index out of bounds");
+    Index actualIndex = index + packetOffset;
+    eigen_assert(check_array_bounds(actualIndex, begin, count) && "Array index out of bounds");
     return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count);
   }
 
@@ -958,16 +902,16 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
 struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>>
     : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> {
-  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
-  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> Base;
+  using XprType = CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>;
+  using Base = ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>>;
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
 struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
     : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> {
-  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  using XprType = CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>;
 
   enum {
     CoeffReadCost = int(evaluator<Arg1>::CoeffReadCost) + int(evaluator<Arg2>::CoeffReadCost) +
@@ -990,18 +934,18 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
                                evaluator<Arg3>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) {
+  EIGEN_DEVICE_FUNC constexpr explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index));
   }
 
@@ -1036,9 +980,9 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Data(const XprType& xpr)
         : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TernaryOp& func() const { return op; }
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const TernaryOp& func() const { return op; }
     TernaryOp op;
     evaluator<Arg1> arg1Impl;
     evaluator<Arg2> arg2Impl;
@@ -1048,24 +992,35 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
   Data m_d;
 };
 
-// specialization for expressions like (a < b).select(c, d) to enable full vectorization
 template <typename Arg1, typename Arg2, typename Scalar, typename CmpLhsType, typename CmpRhsType, ComparisonName cmp>
-struct evaluator<CwiseTernaryOp<scalar_boolean_select_op<Scalar, Scalar, bool>, Arg1, Arg2,
-                                CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, false>, CmpLhsType, CmpRhsType>>>
-    : public ternary_evaluator<
-          CwiseTernaryOp<scalar_boolean_select_op<Scalar, Scalar, Scalar>, Arg1, Arg2,
-                         CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, true>, CmpLhsType, CmpRhsType>>> {
+struct scalar_boolean_select_spec {
   using DummyTernaryOp = scalar_boolean_select_op<Scalar, Scalar, bool>;
   using DummyArg3 = CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, false>, CmpLhsType, CmpRhsType>;
   using DummyXprType = CwiseTernaryOp<DummyTernaryOp, Arg1, Arg2, DummyArg3>;
 
-  using TernaryOp = scalar_boolean_select_op<Scalar, Scalar, Scalar>;
-  using Arg3 = CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, true>, CmpLhsType, CmpRhsType>;
+  // only use the typed comparison if it is vectorized
+  static constexpr bool UseTyped = functor_traits<scalar_cmp_op<Scalar, Scalar, cmp, true>>::PacketAccess;
+  using CondScalar = std::conditional_t<UseTyped, Scalar, bool>;
+
+  using TernaryOp = scalar_boolean_select_op<Scalar, Scalar, CondScalar>;
+  using Arg3 = CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, UseTyped>, CmpLhsType, CmpRhsType>;
   using XprType = CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>;
 
   using Base = ternary_evaluator<XprType>;
+};
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const DummyXprType& xpr)
+// specialization for expressions like (a < b).select(c, d) to enable full vectorization
+template <typename Arg1, typename Arg2, typename Scalar, typename CmpLhsType, typename CmpRhsType, ComparisonName cmp>
+struct evaluator<CwiseTernaryOp<scalar_boolean_select_op<Scalar, Scalar, bool>, Arg1, Arg2,
+                                CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, false>, CmpLhsType, CmpRhsType>>>
+    : public scalar_boolean_select_spec<Arg1, Arg2, Scalar, CmpLhsType, CmpRhsType, cmp>::Base {
+  using Helper = scalar_boolean_select_spec<Arg1, Arg2, Scalar, CmpLhsType, CmpRhsType, cmp>;
+  using Base = typename Helper::Base;
+  using DummyXprType = typename Helper::DummyXprType;
+  using Arg3 = typename Helper::Arg3;
+  using XprType = typename Helper::XprType;
+
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const DummyXprType& xpr)
       : Base(XprType(xpr.arg1(), xpr.arg2(), Arg3(xpr.arg3().lhs(), xpr.arg3().rhs()))) {}
 };
 
@@ -1074,16 +1029,16 @@ struct evaluator<CwiseTernaryOp<scalar_boolean_select_op<Scalar, Scalar, bool>,
 // this is a binary expression
 template <typename BinaryOp, typename Lhs, typename Rhs>
 struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> {
-  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
-  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> Base;
+  using XprType = CwiseBinaryOp<BinaryOp, Lhs, Rhs>;
+  using Base = binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>>;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 template <typename BinaryOp, typename Lhs, typename Rhs>
 struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
     : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> {
-  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  using XprType = CwiseBinaryOp<BinaryOp, Lhs, Rhs>;
 
   enum {
     CoeffReadCost =
@@ -1102,18 +1057,18 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
     Alignment = plain_enum_min(evaluator<Lhs>::Alignment, evaluator<Rhs>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit binary_evaluator(const XprType& xpr) : m_d(xpr) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit binary_evaluator(const XprType& xpr) : m_d(xpr) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index));
   }
 
@@ -1144,9 +1099,9 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Data(const XprType& xpr)
         : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& func() const { return op; }
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const BinaryOp& func() const { return op; }
     BinaryOp op;
     evaluator<Lhs> lhsImpl;
     evaluator<Rhs> rhsImpl;
@@ -1160,46 +1115,46 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
 template <typename UnaryOp, typename ArgType, typename StrideType>
 struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType, StrideType>, IndexBased>
     : evaluator_base<CwiseUnaryView<UnaryOp, ArgType, StrideType>> {
-  typedef CwiseUnaryView<UnaryOp, ArgType, StrideType> XprType;
+  using XprType = CwiseUnaryView<UnaryOp, ArgType, StrideType>;
 
   enum {
     CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
 
     Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
 
-    Alignment = 0  // FIXME it is not very clear why alignment is necessarily lost...
+    Alignment = 0  // FIXME: clarify why alignment is lost for CwiseUnaryView.
   };
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op) {
+  EIGEN_DEVICE_FUNC constexpr explicit unary_evaluator(const XprType& op) : m_d(op) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using Scalar = typename XprType::Scalar;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_d.func()(m_d.argImpl.coeff(row, col));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_d.func()(m_d.argImpl.coeff(index));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
     return m_d.func()(m_d.argImpl.coeffRef(row, col));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
     return m_d.func()(m_d.argImpl.coeffRef(index));
   }
 
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Data(const XprType& xpr)
         : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& func() const { return op; }
+    EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const UnaryOp& func() const { return op; }
     UnaryOp op;
     evaluator<ArgType> argImpl;
   };
@@ -1209,25 +1164,25 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType, StrideType>, IndexBased>
 
 // -------------------- Map --------------------
 
-// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// FIXME: consider using Derived::PlainObject for PlainObjectType,
 // but that might complicate template specialization
 template <typename Derived, typename PlainObjectType>
 struct mapbase_evaluator;
 
 template <typename Derived, typename PlainObjectType>
 struct mapbase_evaluator : evaluator_base<Derived> {
-  typedef Derived XprType;
-  typedef typename XprType::PointerType PointerType;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using XprType = Derived;
+  using PointerType = typename XprType::PointerType;
+  using Scalar = typename XprType::Scalar;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
   enum {
-    IsRowMajor = XprType::RowsAtCompileTime,
+    IsRowMajor = XprType::IsRowMajor,
     ColsAtCompileTime = XprType::ColsAtCompileTime,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit mapbase_evaluator(const XprType& map)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit mapbase_evaluator(const XprType& map)
       : m_data(const_cast<PointerType>(map.data())),
         m_innerStride(map.innerStride()),
         m_outerStride(map.outerStride()) {
@@ -1237,19 +1192,21 @@ struct mapbase_evaluator : evaluator_base<Derived> {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_data[col * colStride() + row * rowStride()];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_data[index * m_innerStride.value()];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
     return m_data[col * colStride() + row * rowStride()];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_data[index * m_innerStride.value()]; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return m_data[index * m_innerStride.value()];
+  }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
@@ -1298,10 +1255,10 @@ struct mapbase_evaluator : evaluator_base<Derived> {
   }
 
  protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowStride() const noexcept {
+  EIGEN_DEVICE_FUNC constexpr Index rowStride() const noexcept {
     return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colStride() const noexcept {
+  EIGEN_DEVICE_FUNC constexpr Index colStride() const noexcept {
     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
   }
 
@@ -1313,10 +1270,10 @@ struct mapbase_evaluator : evaluator_base<Derived> {
 template <typename PlainObjectType, int MapOptions, typename StrideType>
 struct evaluator<Map<PlainObjectType, MapOptions, StrideType>>
     : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType> {
-  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
-  typedef typename XprType::Scalar Scalar;
+  using XprType = Map<PlainObjectType, MapOptions, StrideType>;
+  using Scalar = typename XprType::Scalar;
   // TODO: should check for smaller packet types once we can handle multi-sized packet types
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+  using PacketScalar = typename packet_traits<Scalar>::type;
 
   enum {
     InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
@@ -1338,7 +1295,8 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType>>
     Alignment = int(MapOptions) & int(AlignedMask)
   };
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map) : mapbase_evaluator<XprType, PlainObjectType>(map) {}
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const XprType& map)
+      : mapbase_evaluator<XprType, PlainObjectType>(map) {}
 };
 
 // -------------------- Ref --------------------
@@ -1346,14 +1304,14 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType>>
 template <typename PlainObjectType, int RefOptions, typename StrideType>
 struct evaluator<Ref<PlainObjectType, RefOptions, StrideType>>
     : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType> {
-  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+  using XprType = Ref<PlainObjectType, RefOptions, StrideType>;
 
   enum {
     Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType>>::Flags,
     Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType>>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& ref)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit evaluator(const XprType& ref)
       : mapbase_evaluator<XprType, PlainObjectType>(ref) {}
 };
 
@@ -1366,10 +1324,10 @@ struct block_evaluator;
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>>
     : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> {
-  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
-  typedef typename XprType::Scalar Scalar;
+  using XprType = Block<ArgType, BlockRows, BlockCols, InnerPanel>;
+  using Scalar = typename XprType::Scalar;
   // TODO: should check for smaller packet types once we can handle multi-sized packet types
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+  using PacketScalar = typename packet_traits<Scalar>::type;
 
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
@@ -1406,8 +1364,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>>
                      : 0,
     Alignment = plain_enum_min(evaluator<ArgType>::Alignment, Alignment0)
   };
-  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& block) : block_evaluator_type(block) {
+  using block_evaluator_type = block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit evaluator(const XprType& block)
+      : block_evaluator_type(block) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 };
@@ -1416,18 +1375,18 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>>
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
     : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>> {
-  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  using XprType = Block<ArgType, BlockRows, BlockCols, InnerPanel>;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit block_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit block_evaluator(const XprType& block)
       : unary_evaluator<XprType>(block) {}
 };
 
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
     : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel>> {
-  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  using XprType = Block<ArgType, BlockRows, BlockCols, InnerPanel>;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& block)
       : m_argImpl(block.nestedExpression()),
         m_startRow(block.startRow()),
         m_startCol(block.startCol()),
@@ -1437,8 +1396,8 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
                                    : block.startCol() * block.nestedExpression().rows() + block.startRow())
                             : 0) {}
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using Scalar = typename XprType::Scalar;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
@@ -1446,19 +1405,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
                           bool(evaluator<ArgType>::Flags & LinearAccessBit)
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return linear_coeff_impl(index, bool_constant<ForwardLinearAccess>());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
     return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
     return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
   }
 
@@ -1469,10 +1428,9 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
-    if (ForwardLinearAccess)
-      return m_argImpl.template packet<LoadMode, PacketType>(m_linear_offset.value() + index);
-    else
-      return packet<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+    EIGEN_IF_CONSTEXPR(ForwardLinearAccess)
+    return m_argImpl.template packet<LoadMode, PacketType>(m_linear_offset.value() + index);
+    else return packet<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
 
   template <int StoreMode, typename PacketType>
@@ -1482,11 +1440,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 
   template <int StoreMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
-    if (ForwardLinearAccess)
-      return m_argImpl.template writePacket<StoreMode, PacketType>(m_linear_offset.value() + index, x);
-    else
-      return writePacket<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
-                                                x);
+    EIGEN_IF_CONSTEXPR(ForwardLinearAccess)
+    return m_argImpl.template writePacket<StoreMode, PacketType>(m_linear_offset.value() + index, x);
+    else return writePacket<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                                   RowsAtCompileTime == 1 ? index : 0, x);
   }
 
   template <int LoadMode, typename PacketType>
@@ -1497,11 +1454,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
-    if (ForwardLinearAccess)
-      return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
-    else
-      return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
-                                                 begin, count);
+    EIGEN_IF_CONSTEXPR(ForwardLinearAccess)
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
+    else return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                                    RowsAtCompileTime == 1 ? index : 0, begin, count);
   }
 
   template <int StoreMode, typename PacketType>
@@ -1514,29 +1470,28 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   template <int StoreMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
                                                                 Index count) {
-    if (ForwardLinearAccess)
-      return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
-                                                                          count);
-    else
-      return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
-                                                       RowsAtCompileTime == 1 ? index : 0, x, begin, count);
+    EIGEN_IF_CONSTEXPR(ForwardLinearAccess)
+    return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
+                                                                        count);
+    else return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                                          RowsAtCompileTime == 1 ? index : 0, x, begin, count);
   }
 
  protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType
   linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const {
     return m_argImpl.coeff(m_linear_offset.value() + index);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType
   linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const {
     return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& linear_coeffRef_impl(Index index,
-                                                                     internal::true_type /* ForwardLinearAccess */) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& linear_coeffRef_impl(
+      Index index, internal::true_type /* ForwardLinearAccess */) {
     return m_argImpl.coeffRef(m_linear_offset.value() + index);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& linear_coeffRef_impl(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& linear_coeffRef_impl(
       Index index, internal::false_type /* not ForwardLinearAccess */) {
     return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
@@ -1554,10 +1509,10 @@ template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
     : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
                         typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject> {
-  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
-  typedef typename XprType::Scalar Scalar;
+  using XprType = Block<ArgType, BlockRows, BlockCols, InnerPanel>;
+  using Scalar = typename XprType::Scalar;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit block_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit block_evaluator(const XprType& block)
       : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) {
     eigen_internal_assert((internal::is_constant_evaluated() ||
                            (std::uintptr_t(block.data()) % plain_enum_max(1, evaluator<XprType>::Alignment)) == 0) &&
@@ -1565,60 +1520,16 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
   }
 };
 
-// -------------------- Select --------------------
-// NOTE shall we introduce a ternary_evaluator?
-
-// TODO enable vectorization for Select
-template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType>>
-    : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType>> {
-  typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
-  enum {
-    CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost +
-                    plain_enum_max(evaluator<ThenMatrixType>::CoeffReadCost, evaluator<ElseMatrixType>::CoeffReadCost),
-
-    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
-
-    Alignment = plain_enum_min(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& select)
-      : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) {
-    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-  }
-
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
-    if (m_conditionImpl.coeff(row, col))
-      return m_thenImpl.coeff(row, col);
-    else
-      return m_elseImpl.coeff(row, col);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    if (m_conditionImpl.coeff(index))
-      return m_thenImpl.coeff(index);
-    else
-      return m_elseImpl.coeff(index);
-  }
-
- protected:
-  evaluator<ConditionMatrixType> m_conditionImpl;
-  evaluator<ThenMatrixType> m_thenImpl;
-  evaluator<ElseMatrixType> m_elseImpl;
-};
-
 // -------------------- Replicate --------------------
 
 template <typename ArgType, int RowFactor, int ColFactor>
 struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
     : evaluator_base<Replicate<ArgType, RowFactor, ColFactor>> {
-  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using XprType = Replicate<ArgType, RowFactor, ColFactor>;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
   enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor };
-  typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested;
-  typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
+  using ArgTypeNested = typename nested_eval<ArgType, Factor>::type;
+  using ArgTypeNestedCleaned = remove_all_t<ArgTypeNested>;
 
   enum {
     CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
@@ -1629,13 +1540,13 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
     Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& replicate)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& replicate)
       : m_arg(replicate.nestedExpression()),
         m_argImpl(m_arg),
         m_rows(replicate.nestedExpression().rows()),
         m_cols(replicate.nestedExpression().cols()) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     // try to avoid using modulo; this is a pure optimization strategy
     const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
     const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
@@ -1643,7 +1554,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
     return m_argImpl.coeff(actual_row, actual_col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     // try to avoid using modulo; this is a pure optimization strategy
     const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
                                    ? (ColFactor == 1 ? index : index % m_cols.value())
@@ -1687,7 +1598,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
   }
 
  protected:
-  const ArgTypeNested m_arg;
+  ArgTypeNested m_arg;
   evaluator<ArgTypeNestedCleaned> m_argImpl;
   const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows;
   const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
@@ -1700,7 +1611,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
 
 template <typename XprType>
 struct evaluator_wrapper_base : evaluator_base<XprType> {
-  typedef remove_all_t<typename XprType::NestedExpressionType> ArgType;
+  using ArgType = remove_all_t<typename XprType::NestedExpressionType>;
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
     Flags = evaluator<ArgType>::Flags,
@@ -1709,18 +1620,22 @@ struct evaluator_wrapper_base : evaluator_base<XprType> {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
 
-  typedef typename ArgType::Scalar Scalar;
-  typedef typename ArgType::CoeffReturnType CoeffReturnType;
+  using Scalar = typename ArgType::Scalar;
+  using CoeffReturnType = typename ArgType::CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_argImpl.coeff(row, col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_argImpl.coeff(index);
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(row, col); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    return m_argImpl.coeffRef(row, col);
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); }
 
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
@@ -1770,17 +1685,17 @@ struct evaluator_wrapper_base : evaluator_base<XprType> {
 
 template <typename TArgType>
 struct unary_evaluator<MatrixWrapper<TArgType>> : evaluator_wrapper_base<MatrixWrapper<TArgType>> {
-  typedef MatrixWrapper<TArgType> XprType;
+  using XprType = MatrixWrapper<TArgType>;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
       : evaluator_wrapper_base<MatrixWrapper<TArgType>>(wrapper.nestedExpression()) {}
 };
 
 template <typename TArgType>
 struct unary_evaluator<ArrayWrapper<TArgType>> : evaluator_wrapper_base<ArrayWrapper<TArgType>> {
-  typedef ArrayWrapper<TArgType> XprType;
+  using XprType = ArrayWrapper<TArgType>;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
       : evaluator_wrapper_base<ArrayWrapper<TArgType>>(wrapper.nestedExpression()) {}
 };
 
@@ -1792,9 +1707,9 @@ struct reverse_packet_cond;
 
 template <typename ArgType, int Direction>
 struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<ArgType, Direction>> {
-  typedef Reverse<ArgType, Direction> XprType;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using XprType = Reverse<ArgType, Direction>;
+  using Scalar = typename XprType::Scalar;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
   enum {
     IsRowMajor = XprType::IsRowMajor,
@@ -1807,7 +1722,7 @@ struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<Arg
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
 
     // let's enable LinearAccess only with vectorization because of the product overhead
-    // FIXME enable DirectAccess with negative strides?
+    // FIXME: consider enabling DirectAccess with negative strides.
     Flags0 = evaluator<ArgType>::Flags,
     LinearAccess =
         ((Direction == BothDirections) && (int(Flags0) & PacketAccessBit)) ||
@@ -1817,27 +1732,27 @@ struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<Arg
 
     Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
 
-    Alignment = 0  // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
+    Alignment = 0  // FIXME: in some rare cases, Alignment could be preserved.
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& reverse)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& reverse)
       : m_argImpl(reverse.nestedExpression()),
         m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
         m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
     return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
     return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
   }
 
@@ -1949,36 +1864,36 @@ struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<Arg
 
 template <typename ArgType, int DiagIndex>
 struct evaluator<Diagonal<ArgType, DiagIndex>> : evaluator_base<Diagonal<ArgType, DiagIndex>> {
-  typedef Diagonal<ArgType, DiagIndex> XprType;
+  using XprType = Diagonal<ArgType, DiagIndex>;
 
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
 
-    Flags =
-        (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
+    Flags = static_cast<unsigned int>(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) |
+            LinearAccessBit,
 
     Alignment = 0
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& diagonal)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit evaluator(const XprType& diagonal)
       : m_argImpl(diagonal.nestedExpression()), m_index(diagonal.index()) {}
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using Scalar = typename XprType::Scalar;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index) const {
     return m_argImpl.coeff(row + rowOffset(), row + colOffset());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_argImpl.coeff(index + rowOffset(), index + colOffset());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index) {
     return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
     return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
   }
 
@@ -1987,12 +1902,8 @@ struct evaluator<Diagonal<ArgType, DiagIndex>> : evaluator_base<Diagonal<ArgType
   const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const {
-    return m_index.value() > 0 ? 0 : -m_index.value();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const {
-    return m_index.value() > 0 ? m_index.value() : 0;
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
 };
 
 //----------------------------------------------------------------------
@@ -2012,7 +1923,7 @@ struct traits<EvalToTemp<ArgType>> : public traits<ArgType> {};
 template <typename ArgType>
 class EvalToTemp : public dense_xpr_base<EvalToTemp<ArgType>>::type {
  public:
-  typedef typename dense_xpr_base<EvalToTemp>::type Base;
+  using Base = typename dense_xpr_base<EvalToTemp>::type;
   EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
 
   explicit EvalToTemp(const ArgType& arg) : m_arg(arg) {}
@@ -2029,16 +1940,18 @@ class EvalToTemp : public dense_xpr_base<EvalToTemp<ArgType>>::type {
 
 template <typename ArgType>
 struct evaluator<EvalToTemp<ArgType>> : public evaluator<typename ArgType::PlainObject> {
-  typedef EvalToTemp<ArgType> XprType;
-  typedef typename ArgType::PlainObject PlainObject;
-  typedef evaluator<PlainObject> Base;
+  using XprType = EvalToTemp<ArgType>;
+  using PlainObject = typename ArgType::PlainObject;
+  using Base = evaluator<PlainObject>;
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.arg()) {
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const XprType& xpr) : m_result(xpr.arg()) {
     internal::construct_at<Base>(this, m_result);
   }
 
   // This constructor is used when nesting an EvalTo evaluator in another evaluator
-  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg) : m_result(arg) { internal::construct_at<Base>(this, m_result); }
+  EIGEN_DEVICE_FUNC constexpr evaluator(const ArgType& arg) : m_result(arg) {
+    internal::construct_at<Base>(this, m_result);
+  }
 
  protected:
   PlainObject m_result;
diff --git a/Eigen/src/Core/CoreIterators.h b/Eigen/src/Core/CoreIterators.h
index f62cf238e75..3143726867b 100644
--- a/Eigen/src/Core/CoreIterators.h
+++ b/Eigen/src/Core/CoreIterators.h
@@ -57,7 +57,7 @@ class InnerIterator {
     m_iter.operator+=(i);
     return *this;
   }
-  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) {
+  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) const {
     InnerIterator result(*this);
     result += i;
     return result;
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index e2b2da5a643..27fd4340f8a 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -98,33 +98,33 @@ class CwiseBinaryOp : public CwiseBinaryOpImpl<BinaryOp, LhsType, RhsType,
   typedef std::remove_reference_t<RhsNested> RhsNested_;
 
 #if EIGEN_COMP_MSVC
-  // Required for Visual Studio or the Copy constructor will probably not get inlined!
+  // Required for Visual Studio, which may fail to inline the copy constructor otherwise.
   EIGEN_STRONG_INLINE CwiseBinaryOp(const CwiseBinaryOp<BinaryOp, LhsType, RhsType>&) = default;
 #endif
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs,
-                                                      const BinaryOp& func = BinaryOp())
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs,
+                                                                const BinaryOp& func = BinaryOp())
       : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) {
     eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept {
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
     // return the fixed size type if available to enable compile time optimizations
     return internal::traits<internal::remove_all_t<LhsNested>>::RowsAtCompileTime == Dynamic ? m_rhs.rows()
                                                                                              : m_lhs.rows();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept {
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept {
     // return the fixed size type if available to enable compile time optimizations
     return internal::traits<internal::remove_all_t<LhsNested>>::ColsAtCompileTime == Dynamic ? m_rhs.cols()
                                                                                              : m_lhs.cols();
   }
 
   /** \returns the left hand side nested expression */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNested_& lhs() const { return m_lhs; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const LhsNested_& lhs() const { return m_lhs; }
   /** \returns the right hand side nested expression */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNested_& rhs() const { return m_rhs; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const RhsNested_& rhs() const { return m_rhs; }
   /** \returns the functor representing the binary operation */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; }
 
  protected:
   LhsNested m_lhs;
@@ -145,7 +145,7 @@ class CwiseBinaryOpImpl : public internal::generic_xpr_base<CwiseBinaryOp<Binary
  */
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Derived& MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived>& other) {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
@@ -156,7 +156,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator-=(c
  */
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Derived& MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other) {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 13a542a023f..bf6b6f30499 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -50,7 +50,7 @@ struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectT
   for vectors.
   *
   * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
-  * C++11 random number generators.
+  * C++ standard library random number generators.
   *
   * A nullary expression can also be used to implement custom sophisticated matrix manipulations
   * that cannot be covered by the existing set of natively supported matrix manipulations.
@@ -66,21 +66,21 @@ class CwiseNullaryOp : public internal::dense_xpr_base<CwiseNullaryOp<NullaryOp,
   typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
 
-  EIGEN_DEVICE_FUNC CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+  EIGEN_DEVICE_FUNC constexpr CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
       : m_rows(rows), m_cols(cols), m_functor(func) {
     eigen_assert(rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) && cols >= 0 &&
                  (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
   }
-  EIGEN_DEVICE_FUNC CwiseNullaryOp(Index size, const NullaryOp& func = NullaryOp())
+  EIGEN_DEVICE_FUNC constexpr CwiseNullaryOp(Index size, const NullaryOp& func = NullaryOp())
       : CwiseNullaryOp(RowsAtCompileTime == 1 ? 1 : size, RowsAtCompileTime == 1 ? size : 1, func) {
     EIGEN_STATIC_ASSERT(CwiseNullaryOp::IsVectorAtCompileTime, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols.value(); }
 
   /** \returns the functor representing the nullary operation */
-  EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC constexpr const NullaryOp& functor() const { return m_functor; }
 
  protected:
   const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
@@ -94,7 +94,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base<CwiseNullaryOp<NullaryOp,
  * the returned matrix. Must be compatible with this MatrixBase type.
  *
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
- * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+ * it is redundant to pass \a rows and \a cols as arguments, so NullaryExpr(const CustomNullaryOp&) should be used
  * instead.
  *
  * The template parameter \a CustomNullaryOp is the type of the functor.
@@ -121,12 +121,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  * \only_for_vectors
  *
  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
- * it is redundant to pass \a size as argument, so Zero() should be used
+ * it is redundant to pass \a size as argument, so NullaryExpr(const CustomNullaryOp&) should be used
  * instead.
  *
  * The template parameter \a CustomNullaryOp is the type of the functor.
  *
- * Here is an example with C++11 random generators: \include random_cpp11.cpp
+ * Here is an example with C++ standard library random generators: \include random_cpp11.cpp
  * Output: \verbinclude random_cpp11.out
  *
  * \sa class CwiseNullaryOp
@@ -174,7 +174,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  * the returned matrix. Must be compatible with this DenseBase type.
  *
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
- * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+ * it is redundant to pass \a rows and \a cols as arguments, so Constant(const Scalar&) should be used
  * instead.
  *
  * The template parameter \a CustomNullaryOp is the type of the functor.
@@ -195,7 +195,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value) {
  * \only_for_vectors
  *
  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
- * it is redundant to pass \a size as argument, so Zero() should be used
+ * it is redundant to pass \a size as argument, so Constant(const Scalar&) should be used
  * instead.
  *
  * The template parameter \a CustomNullaryOp is the type of the functor.
@@ -235,8 +235,7 @@ DenseBase<Derived>::Constant(const Scalar& value) {
  * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
  */
 template <typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<
-    Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low, high, size));
@@ -247,8 +246,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
  * \sa LinSpaced(const Scalar&, const Scalar&)
  */
 template <typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<
-    Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
diff --git a/Eigen/src/Core/CwiseTernaryOp.h b/Eigen/src/Core/CwiseTernaryOp.h
index 9bb0d4075c8..87377917dca 100644
--- a/Eigen/src/Core/CwiseTernaryOp.h
+++ b/Eigen/src/Core/CwiseTernaryOp.h
@@ -118,7 +118,7 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl<TernaryOp, Arg1Type, Arg2Type,
     eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() && a1.rows() == a3.rows() && a1.cols() == a3.cols());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Index rows() const {
     // return the fixed size type if available to enable compile time
     // optimizations
     if (internal::traits<internal::remove_all_t<Arg1Nested>>::RowsAtCompileTime == Dynamic &&
@@ -130,7 +130,7 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl<TernaryOp, Arg1Type, Arg2Type,
     else
       return m_arg1.rows();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Index cols() const {
     // return the fixed size type if available to enable compile time
     // optimizations
     if (internal::traits<internal::remove_all_t<Arg1Nested>>::ColsAtCompileTime == Dynamic &&
@@ -144,13 +144,13 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl<TernaryOp, Arg1Type, Arg2Type,
   }
 
   /** \returns the first argument nested expression */
-  EIGEN_DEVICE_FUNC const Arg1Nested_& arg1() const { return m_arg1; }
+  EIGEN_DEVICE_FUNC constexpr const Arg1Nested_& arg1() const { return m_arg1; }
   /** \returns the first argument nested expression */
-  EIGEN_DEVICE_FUNC const Arg2Nested_& arg2() const { return m_arg2; }
+  EIGEN_DEVICE_FUNC constexpr const Arg2Nested_& arg2() const { return m_arg2; }
   /** \returns the third argument nested expression */
-  EIGEN_DEVICE_FUNC const Arg3Nested_& arg3() const { return m_arg3; }
+  EIGEN_DEVICE_FUNC constexpr const Arg3Nested_& arg3() const { return m_arg3; }
   /** \returns the functor representing the ternary operation */
-  EIGEN_DEVICE_FUNC const TernaryOp& functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC constexpr const TernaryOp& functor() const { return m_functor; }
 
  protected:
   Arg1Nested m_arg1;
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 94ec1a0fe2e..fabf4fc7ce5 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -57,22 +57,26 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal
   typedef typename internal::ref_selector<XprType>::type XprTypeNested;
   typedef internal::remove_all_t<XprType> NestedExpression;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr,
+                                                                        const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.cols(); }
 
   /** \returns the functor representing the unary operation */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; }
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression()
+      const {
     return m_xpr;
   }
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::remove_all_t<XprTypeNested>& nestedExpression() { return m_xpr; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE internal::remove_all_t<XprTypeNested>& nestedExpression() {
+    return m_xpr;
+  }
 
  protected:
   XprTypeNested m_xpr;
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 7dd7623fc48..384c8b1542b 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -100,6 +100,7 @@ class CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType, Dense, true>
   EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
 
+  using Base::coeffRef;
   using Base::data;
   EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
 
@@ -140,22 +141,24 @@ class CwiseUnaryView : public internal::CwiseUnaryViewImpl<ViewOp, MatrixType, S
   typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
   typedef internal::remove_all_t<MatrixType> NestedExpression;
 
-  explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
+  explicit EIGEN_DEVICE_FUNC constexpr inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
       : m_matrix(mat), m_functor(func) {}
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** \returns the functor representing unary operation */
-  EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC constexpr const ViewOp& functor() const { return m_functor; }
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const { return m_matrix; }
+  EIGEN_DEVICE_FUNC constexpr const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const {
+    return m_matrix;
+  }
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC std::remove_reference_t<MatrixTypeNested>& nestedExpression() { return m_matrix; }
+  EIGEN_DEVICE_FUNC constexpr std::remove_reference_t<MatrixTypeNested>& nestedExpression() { return m_matrix; }
 
  protected:
   MatrixTypeNested m_matrix;
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 4f6894280e1..a24fc4766d2 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -260,21 +260,21 @@ class DenseBase
 
   /** Copies \a other into *this. \returns a reference to *this. */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other);
 
   /** Special case of the template operator=, in order to prevent the compiler
    * from generating a default operator= (issue hit with g++ 4.1)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other);
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other);
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC Derived& operator=(const EigenBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC constexpr Derived& operator=(const EigenBase<OtherDerived>& other);
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC Derived& operator+=(const EigenBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC constexpr Derived& operator+=(const EigenBase<OtherDerived>& other);
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC Derived& operator-=(const EigenBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC constexpr Derived& operator-=(const EigenBase<OtherDerived>& other);
 
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue<OtherDerived>& func);
@@ -283,7 +283,7 @@ class DenseBase
    * Copies \a other into *this without evaluating other. \returns a reference to *this. */
   template <typename OtherDerived>
   /** \deprecated */
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC Derived& lazyAssign(const DenseBase<OtherDerived>& other);
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC constexpr Derived& lazyAssign(const DenseBase<OtherDerived>& other);
 
   EIGEN_DEVICE_FUNC CommaInitializer<Derived> operator<<(const Scalar& s);
 
@@ -306,12 +306,12 @@ class DenseBase
   EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(Index size, const Scalar& value);
   EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(const Scalar& value);
 
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size,
-                                                                                            const Scalar& low,
-                                                                                            const Scalar& high);
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t,
-                                                                                            const Scalar& low,
-                                                                                            const Scalar& high);
+  EIGEN_DEPRECATED_WITH_REASON("The method may result in accuracy loss. Use .EqualSpaced() instead.")
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size, const Scalar& low,
+                                                                           const Scalar& high);
+  EIGEN_DEPRECATED_WITH_REASON("The method may result in accuracy loss. Use .EqualSpaced() instead.")
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, const Scalar& low,
+                                                                           const Scalar& high);
 
   EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Index size, const Scalar& low,
                                                                            const Scalar& high);
@@ -348,13 +348,13 @@ class DenseBase
   EIGEN_DEVICE_FUNC Derived& setRandom();
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC bool isApprox(const DenseBase<OtherDerived>& other,
-                                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-  EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const RealScalar& other,
-                                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC constexpr bool isApprox(const DenseBase<OtherDerived>& other,
+                                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC constexpr bool isMuchSmallerThan(
+      const RealScalar& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
-                                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC constexpr bool isMuchSmallerThan(
+      const DenseBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
 
   EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value,
                                             const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -366,8 +366,13 @@ class DenseBase
   EIGEN_DEVICE_FUNC inline bool hasNaN() const;
   EIGEN_DEVICE_FUNC inline bool allFinite() const;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other);
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other);
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other);
+  template <bool Enable = internal::complex_array_access<Scalar>::value, typename = std::enable_if_t<Enable>>
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator*=(const RealScalar& other);
+
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other);
+  template <bool Enable = internal::complex_array_access<Scalar>::value, typename = std::enable_if_t<Enable>>
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator/=(const RealScalar& other);
 
   typedef internal::add_const_on_value_type_t<typename internal::eval<Derived>::type> EvalReturnType;
   /** \returns the matrix or vector obtained by evaluating this expression.
@@ -404,7 +409,7 @@ class DenseBase
     call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
   }
 
-  EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
+  EIGEN_DEVICE_FUNC constexpr inline const NestByValue<Derived> nestByValue() const;
   EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
   EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
   template <bool Enable>
@@ -419,49 +424,21 @@ class DenseBase
 
   EIGEN_DEVICE_FUNC Scalar prod() const;
 
-  template <int NaNPropagation>
+  // By default (PropagateFast) the fastest version is used; its NaN propagation semantics are undefined.
+  template <int NaNPropagation = PropagateFast>
   EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
-  template <int NaNPropagation>
+  template <int NaNPropagation = PropagateFast>
   EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
 
-  // By default, the fastest version with undefined NaN propagation semantics is
-  // used.
-  // TODO(rmlarsen): Replace with default template argument when we move to
-  // c++11 or beyond.
-  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff() const {
-    return minCoeff<PropagateFast>();
-  }
-  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff() const {
-    return maxCoeff<PropagateFast>();
-  }
-
-  template <int NaNPropagation, typename IndexType>
+  template <int NaNPropagation = PropagateFast, typename IndexType>
   EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-  template <int NaNPropagation, typename IndexType>
+  template <int NaNPropagation = PropagateFast, typename IndexType>
   EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-  template <int NaNPropagation, typename IndexType>
+  template <int NaNPropagation = PropagateFast, typename IndexType>
   EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-  template <int NaNPropagation, typename IndexType>
+  template <int NaNPropagation = PropagateFast, typename IndexType>
   EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
 
-  // TODO(rmlarsen): Replace these methods with a default template argument.
-  template <typename IndexType>
-  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const {
-    return minCoeff<PropagateFast>(row, col);
-  }
-  template <typename IndexType>
-  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const {
-    return maxCoeff<PropagateFast>(row, col);
-  }
-  template <typename IndexType>
-  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const {
-    return minCoeff<PropagateFast>(index);
-  }
-  template <typename IndexType>
-  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const {
-    return maxCoeff<PropagateFast>(index);
-  }
-
   template <typename BinaryOp>
   EIGEN_DEVICE_FUNC Scalar redux(const BinaryOp& func) const;
 
@@ -519,25 +496,25 @@ class DenseBase
   static const RandomReturnType Random();
 
   template <typename ThenDerived, typename ElseDerived>
-  inline EIGEN_DEVICE_FUNC
-      CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
-                                                        typename DenseBase<ElseDerived>::Scalar, Scalar>,
-                     ThenDerived, ElseDerived, Derived>
-      select(const DenseBase<ThenDerived>& thenMatrix, const DenseBase<ElseDerived>& elseMatrix) const;
+  inline EIGEN_DEVICE_FUNC constexpr CwiseTernaryOp<
+      internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
+                                         typename DenseBase<ElseDerived>::Scalar, Scalar>,
+      ThenDerived, ElseDerived, Derived>
+  select(const DenseBase<ThenDerived>& thenMatrix, const DenseBase<ElseDerived>& elseMatrix) const;
 
   template <typename ThenDerived>
-  inline EIGEN_DEVICE_FUNC
-      CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
-                                                        typename DenseBase<ThenDerived>::Scalar, Scalar>,
-                     ThenDerived, typename DenseBase<ThenDerived>::ConstantReturnType, Derived>
-      select(const DenseBase<ThenDerived>& thenMatrix, const typename DenseBase<ThenDerived>::Scalar& elseScalar) const;
+  inline EIGEN_DEVICE_FUNC constexpr CwiseTernaryOp<
+      internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
+                                         typename DenseBase<ThenDerived>::Scalar, Scalar>,
+      ThenDerived, typename DenseBase<ThenDerived>::ConstantReturnType, Derived>
+  select(const DenseBase<ThenDerived>& thenMatrix, const typename DenseBase<ThenDerived>::Scalar& elseScalar) const;
 
   template <typename ElseDerived>
-  inline EIGEN_DEVICE_FUNC
-      CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ElseDerived>::Scalar,
-                                                        typename DenseBase<ElseDerived>::Scalar, Scalar>,
-                     typename DenseBase<ElseDerived>::ConstantReturnType, ElseDerived, Derived>
-      select(const typename DenseBase<ElseDerived>::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
+  inline EIGEN_DEVICE_FUNC constexpr CwiseTernaryOp<
+      internal::scalar_boolean_select_op<typename DenseBase<ElseDerived>::Scalar,
+                                         typename DenseBase<ElseDerived>::Scalar, Scalar>,
+      typename DenseBase<ElseDerived>::ConstantReturnType, ElseDerived, Derived>
+  select(const typename DenseBase<ElseDerived>::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
 
   template <int p>
   RealScalar lpNorm() const;
@@ -575,12 +552,12 @@ class DenseBase
 #else
   typedef std::conditional_t<(Flags & DirectAccessBit) == DirectAccessBit,
                              internal::pointer_based_stl_iterator<Derived>,
-                             internal::generic_randaccess_stl_iterator<Derived> >
+                             internal::generic_randaccess_stl_iterator<Derived>>
       iterator_type;
 
   typedef std::conditional_t<(Flags & DirectAccessBit) == DirectAccessBit,
                              internal::pointer_based_stl_iterator<const Derived>,
-                             internal::generic_randaccess_stl_iterator<const Derived> >
+                             internal::generic_randaccess_stl_iterator<const Derived>>
       const_iterator_type;
 
   // Stl-style iterators are supported only for vectors.
@@ -597,12 +574,20 @@ class DenseBase
   inline const_iterator end() const;
   inline const_iterator cend() const;
 
+  using RealViewReturnType = std::conditional_t<NumTraits<Scalar>::IsComplex, RealView<Derived>, Derived&>;
+  using ConstRealViewReturnType =
+      std::conditional_t<NumTraits<Scalar>::IsComplex, RealView<const Derived>, const Derived&>;
+
+  EIGEN_DEVICE_FUNC RealViewReturnType realView();
+  EIGEN_DEVICE_FUNC ConstRealViewReturnType realView() const;
+
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
 #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #define EIGEN_DOC_UNARY_ADDONS(X, Y)
 #include "../plugins/CommonCwiseUnaryOps.inc"
 #include "../plugins/BlockMethods.inc"
+// Defines operator()(const RowIndices&, const ColIndices&) and other indexed view methods.
 #include "../plugins/IndexedViewMethods.inc"
 #include "../plugins/ReshapedMethods.inc"
 #ifdef EIGEN_DENSEBASE_PLUGIN
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 377df574ffd..c5284169423 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -67,14 +67,14 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
   using Base::rows;
   using Base::size;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const {
     return int(Derived::RowsAtCompileTime) == 1   ? 0
            : int(Derived::ColsAtCompileTime) == 1 ? inner
            : int(Derived::Flags) & RowMajorBit    ? outer
                                                   : inner;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const {
     return int(Derived::ColsAtCompileTime) == 1   ? 0
            : int(Derived::RowsAtCompileTime) == 1 ? inner
            : int(Derived::Flags) & RowMajorBit    ? inner
@@ -95,12 +95,12 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    *
    * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType coeff(Index row, Index col) const {
     eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return internal::evaluator<Derived>(derived()).coeff(row, col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
     return coeff(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
   }
 
@@ -108,11 +108,19 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    *
    * \sa operator()(Index,Index), operator[](Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType operator()(Index row, Index col) const {
     eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return coeff(row, col);
   }
 
+#ifdef EIGEN_MULTIDIMENSIONAL_SUBSCRIPT
+  /** \returns the coefficient at the given row and column.
+   *
+   * \sa operator()(Index,Index), operator[](Index)
+   */
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType operator[](Index row, Index col) const { return operator()(row, col); }
+#endif
+
   /** Short version: don't use this function, use
    * \link operator[](Index) const \endlink instead.
    *
@@ -128,7 +136,7 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType coeff(Index index) const {
     EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
                         THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
     eigen_internal_assert(index >= 0 && index < size());
@@ -143,7 +151,7 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    * z() const, w() const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator[](Index index) const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType operator[](Index index) const {
     EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                         THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
     eigen_assert(index >= 0 && index < size());
@@ -160,32 +168,32 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    * z() const, w() const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType operator()(Index index) const {
     eigen_assert(index >= 0 && index < size());
     return coeff(index);
   }
 
   /** equivalent to operator[](0).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType x() const { return (*this)[0]; }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType x() const { return (*this)[0]; }
 
   /** equivalent to operator[](1).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType y() const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType y() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
     return (*this)[1];
   }
 
   /** equivalent to operator[](2).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType z() const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType z() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
     return (*this)[2];
   }
 
   /** equivalent to operator[](3).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType w() const {
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType w() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
     return (*this)[3];
   }
@@ -303,12 +311,12 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    *
    * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr Scalar& coeffRef(Index row, Index col) {
     eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return internal::evaluator<Derived>(derived()).coeffRef(row, col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRefByOuterInner(Index outer, Index inner) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRefByOuterInner(Index outer, Index inner) {
     return coeffRef(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
   }
 
@@ -316,12 +324,19 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    *
    * \sa operator[](Index)
    */
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr Scalar& operator()(Index row, Index col) {
     eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return coeffRef(row, col);
   }
 
+#ifdef EIGEN_MULTIDIMENSIONAL_SUBSCRIPT
+  /** \returns a reference to the coefficient at the given row and column.
+   *
+   * \sa operator[](Index)
+   */
+  EIGEN_DEVICE_FUNC constexpr Scalar& operator[](Index row, Index col) { return operator()(row, col); }
+#endif
+
   /** Short version: don't use this function, use
    * \link operator[](Index) \endlink instead.
    *
@@ -337,7 +352,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr Scalar& coeffRef(Index index) {
     EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
                         THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
     eigen_internal_assert(index >= 0 && index < size());
@@ -351,7 +366,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator[](Index index) {
+  EIGEN_DEVICE_FUNC constexpr Scalar& operator[](Index index) {
     EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                         THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
     eigen_assert(index >= 0 && index < size());
@@ -367,32 +382,32 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index index) {
+  EIGEN_DEVICE_FUNC constexpr Scalar& operator()(Index index) {
     eigen_assert(index >= 0 && index < size());
     return coeffRef(index);
   }
 
   /** equivalent to operator[](0).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& x() { return (*this)[0]; }
+  EIGEN_DEVICE_FUNC constexpr Scalar& x() { return (*this)[0]; }
 
   /** equivalent to operator[](1).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& y() {
+  EIGEN_DEVICE_FUNC constexpr Scalar& y() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
     return (*this)[1];
   }
 
   /** equivalent to operator[](2).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& z() {
+  EIGEN_DEVICE_FUNC constexpr Scalar& z() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
     return (*this)[2];
   }
 
   /** equivalent to operator[](3).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& w() {
+  EIGEN_DEVICE_FUNC constexpr Scalar& w() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
     return (*this)[3];
   }
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index d62586c99b5..8f2d1b12022 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -54,7 +54,7 @@ template <typename T, int Size, int MatrixOrArrayOptions,
 struct plain_array {
   EIGEN_ALIGN_TO_BOUNDARY(Alignment) T array[Size];
 #if defined(EIGEN_NO_DEBUG) || defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
+  EIGEN_DEVICE_FUNC constexpr plain_array() = default;
 #else
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(Alignment)
@@ -65,20 +65,15 @@ struct plain_array {
 
 template <typename T, int Size, int MatrixOrArrayOptions>
 struct plain_array<T, Size, MatrixOrArrayOptions, 0> {
-  T array[Size];
+  // on some 32-bit platforms, stack-allocated arrays are aligned to 4 bytes, not the preferred alignment of T
+  EIGEN_ALIGN_TO_BOUNDARY(alignof(T)) T array[Size];
 #if defined(EIGEN_NO_DEBUG) || defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
+  EIGEN_DEVICE_FUNC constexpr plain_array() = default;
 #else
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() { EIGEN_MAKE_STACK_ALLOCATION_ASSERT(Size * sizeof(T)) }
 #endif
 };
 
-template <typename T, int MatrixOrArrayOptions, int Alignment>
-struct plain_array<T, 0, MatrixOrArrayOptions, Alignment> {
-  T array[1];
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
-};
-
 template <typename T, int Size, int Options, int Alignment>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap_plain_array(plain_array<T, Size, Options, Alignment>& a,
                                                                       plain_array<T, Size, Options, Alignment>& b,
@@ -97,8 +92,8 @@ class DenseStorage_impl {
 
  public:
 #ifndef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
 #else
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
@@ -108,19 +103,18 @@ class DenseStorage_impl {
     smart_copy(other.m_data.array, other.m_data.array + Size, m_data.array);
   }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
     numext::swap(m_data, other.m_data);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/,
-                                                                          Index /*cols*/) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return Rows * Cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
 };
 template <typename T, int Size, int Cols, int Options>
 class DenseStorage_impl<T, Size, Dynamic, Cols, Options> {
@@ -128,7 +122,7 @@ class DenseStorage_impl<T, Size, Dynamic, Cols, Options> {
   Index m_rows = 0;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
       : m_rows(other.m_rows) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
@@ -137,7 +131,7 @@ class DenseStorage_impl<T, Size, Dynamic, Cols, Options> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index /*cols*/)
       : m_rows(rows) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-    EIGEN_UNUSED_VARIABLE(size)
+    EIGEN_UNUSED_VARIABLE(size);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
     smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
@@ -148,17 +142,13 @@ class DenseStorage_impl<T, Size, Dynamic, Cols, Options> {
     swap_plain_array(m_data, other.m_data, size(), other.size());
     numext::swap(m_rows, other.m_rows);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) {
-    m_rows = rows;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) {
-    m_rows = rows;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) { m_rows = rows; }
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) { m_rows = rows; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return m_rows * Cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
 };
 template <typename T, int Size, int Rows, int Options>
 class DenseStorage_impl<T, Size, Rows, Dynamic, Options> {
@@ -166,7 +156,7 @@ class DenseStorage_impl<T, Size, Rows, Dynamic, Options> {
   Index m_cols = 0;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
       : m_cols(other.m_cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
@@ -175,7 +165,7 @@ class DenseStorage_impl<T, Size, Rows, Dynamic, Options> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index /*rows*/, Index cols)
       : m_cols(cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-    EIGEN_UNUSED_VARIABLE(size)
+    EIGEN_UNUSED_VARIABLE(size);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
     smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
@@ -186,17 +176,13 @@ class DenseStorage_impl<T, Size, Rows, Dynamic, Options> {
     swap_plain_array(m_data, other.m_data, size(), other.size());
     numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) {
-    m_cols = cols;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) {
-    m_cols = cols;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) { m_cols = cols; }
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) { m_cols = cols; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return Rows * m_cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
 };
 template <typename T, int Size, int Options>
 class DenseStorage_impl<T, Size, Dynamic, Dynamic, Options> {
@@ -205,7 +191,7 @@ class DenseStorage_impl<T, Size, Dynamic, Dynamic, Options> {
   Index m_cols = 0;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
       : m_rows(other.m_rows), m_cols(other.m_cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
@@ -214,7 +200,7 @@ class DenseStorage_impl<T, Size, Dynamic, Dynamic, Options> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index cols)
       : m_rows(rows), m_cols(cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-    EIGEN_UNUSED_VARIABLE(size)
+    EIGEN_UNUSED_VARIABLE(size);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
     smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
@@ -227,87 +213,72 @@ class DenseStorage_impl<T, Size, Dynamic, Dynamic, Options> {
     numext::swap(m_rows, other.m_rows);
     numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
     m_rows = rows;
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index rows, Index cols) {
     m_rows = rows;
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
 };
 // null matrix variants
 template <typename T, int Rows, int Cols, int Options>
 class DenseStorage_impl<T, 0, Rows, Cols, Options> {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl&) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/,
-                                                                          Index /*cols*/) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage_impl&) {}
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return Rows * Cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return nullptr; }
 };
 template <typename T, int Cols, int Options>
 class DenseStorage_impl<T, 0, Dynamic, Cols, Options> {
   Index m_rows = 0;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index rows, Index /*cols*/)
-      : m_rows(rows) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
-    numext::swap(m_rows, other.m_rows);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) {
-    m_rows = rows;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) {
-    m_rows = rows;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(Index /*size*/, Index rows, Index /*cols*/) : m_rows(rows) {}
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage_impl& other) noexcept { numext::swap(m_rows, other.m_rows); }
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) { m_rows = rows; }
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) { m_rows = rows; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return m_rows * Cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return nullptr; }
 };
 template <typename T, int Rows, int Options>
 class DenseStorage_impl<T, 0, Rows, Dynamic, Options> {
   Index m_cols = 0;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index cols)
-      : m_cols(cols) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
-    numext::swap(m_cols, other.m_cols);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) {
-    m_cols = cols;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) {
-    m_cols = cols;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index cols) : m_cols(cols) {}
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage_impl& other) noexcept { numext::swap(m_cols, other.m_cols); }
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) { m_cols = cols; }
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) { m_cols = cols; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return Rows * m_cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return nullptr; }
 };
 template <typename T, int Options>
 class DenseStorage_impl<T, 0, Dynamic, Dynamic, Options> {
@@ -315,28 +286,27 @@ class DenseStorage_impl<T, 0, Dynamic, Dynamic, Options> {
   Index m_cols = 0;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index rows, Index cols)
-      : m_rows(rows), m_cols(cols) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(Index /*size*/, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage_impl& other) noexcept {
     numext::swap(m_rows, other.m_rows);
     numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
     m_rows = rows;
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC constexpr void resize(Index /*size*/, Index rows, Index cols) {
     m_rows = rows;
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return nullptr; }
 };
 // fixed-size matrix with dynamic memory allocation not currently supported
 template <typename T, int Rows, int Cols, int Options>
@@ -350,7 +320,7 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Cols, Options> {
 
  public:
   static constexpr int Size = Dynamic;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
       : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_rows(other.m_rows) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
@@ -360,7 +330,7 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Cols, Options> {
       : m_data(conditional_aligned_new_auto<T, Align>(size)), m_rows(rows) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
       : m_data(other.m_data), m_rows(other.m_rows) {
     other.m_data = nullptr;
     other.m_rows = 0;
@@ -371,11 +341,11 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Cols, Options> {
     smart_copy(other.m_data, other.m_data + other.size(), m_data);
     return *this;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
     this->swap(other);
     return *this;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage_impl& other) noexcept {
     numext::swap(m_data, other.m_data);
     numext::swap(m_rows, other.m_rows);
   }
@@ -392,11 +362,11 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Cols, Options> {
     }
     m_rows = rows;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return m_rows * Cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data; }
 };
 template <typename T, int Rows, int Options>
 class DenseStorage_impl<T, Dynamic, Rows, Dynamic, Options> {
@@ -406,7 +376,7 @@ class DenseStorage_impl<T, Dynamic, Rows, Dynamic, Options> {
 
  public:
   static constexpr int Size = Dynamic;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
       : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_cols(other.m_cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
@@ -416,7 +386,7 @@ class DenseStorage_impl<T, Dynamic, Rows, Dynamic, Options> {
       : m_data(conditional_aligned_new_auto<T, Align>(size)), m_cols(cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
       : m_data(other.m_data), m_cols(other.m_cols) {
     other.m_data = nullptr;
     other.m_cols = 0;
@@ -427,11 +397,11 @@ class DenseStorage_impl<T, Dynamic, Rows, Dynamic, Options> {
     smart_copy(other.m_data, other.m_data + other.size(), m_data);
     return *this;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
     this->swap(other);
     return *this;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage_impl& other) noexcept {
     numext::swap(m_data, other.m_data);
     numext::swap(m_cols, other.m_cols);
   }
@@ -448,11 +418,11 @@ class DenseStorage_impl<T, Dynamic, Rows, Dynamic, Options> {
     }
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return Rows * m_cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data; }
 };
 template <typename T, int Options>
 class DenseStorage_impl<T, Dynamic, Dynamic, Dynamic, Options> {
@@ -463,7 +433,7 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Dynamic, Options> {
 
  public:
   static constexpr int Size = Dynamic;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl() = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
       : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_rows(other.m_rows), m_cols(other.m_cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
@@ -473,7 +443,7 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Dynamic, Options> {
       : m_data(conditional_aligned_new_auto<T, Align>(size)), m_rows(rows), m_cols(cols) {
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
       : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {
     other.m_data = nullptr;
     other.m_rows = 0;
@@ -485,11 +455,11 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Dynamic, Options> {
     smart_copy(other.m_data, other.m_data + other.size(), m_data);
     return *this;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+  EIGEN_DEVICE_FUNC constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
     this->swap(other);
     return *this;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage_impl& other) noexcept {
     numext::swap(m_data, other.m_data);
     numext::swap(m_rows, other.m_rows);
     numext::swap(m_cols, other.m_cols);
@@ -509,11 +479,11 @@ class DenseStorage_impl<T, Dynamic, Dynamic, Dynamic, Options> {
     m_rows = rows;
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data; }
 };
 template <typename T, int Size, int Rows, int Cols>
 struct use_default_move {
@@ -542,15 +512,14 @@ class DenseStorage : public internal::DenseStorage_impl<T, Size, Rows, Cols, Opt
   using Base = internal::DenseStorage_impl<T, Size, Rows, Cols, Options>;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(const DenseStorage&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(Index size, Index rows, Index cols)
-      : Base(size, rows, cols) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols) : Base(size, rows, cols) {}
+  EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(const DenseStorage&) = default;
   // if DenseStorage meets the requirements of use_default_move, then use the move construction and move assignment
   // operation defined in DenseStorage_impl, or the compiler-generated version if none is defined
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(DenseStorage&&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(DenseStorage&&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage(DenseStorage&&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(DenseStorage&&) = default;
 };
 template <typename T, int Size, int Rows, int Cols, int Options>
 class DenseStorage<T, Size, Rows, Cols, Options, false>
@@ -558,16 +527,15 @@ class DenseStorage<T, Size, Rows, Cols, Options, false>
   using Base = internal::DenseStorage_impl<T, Size, Rows, Cols, Options>;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(const DenseStorage&) = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(Index size, Index rows, Index cols)
-      : Base(size, rows, cols) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage() = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols) : Base(size, rows, cols) {}
+  EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(const DenseStorage&) = default;
   // if DenseStorage does not meet the requirements of use_default_move, then defer to the copy construction and copy
   // assignment behavior
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(DenseStorage&& other)
+  EIGEN_DEVICE_FUNC constexpr DenseStorage(DenseStorage&& other)
       : DenseStorage(static_cast<const DenseStorage&>(other)) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(DenseStorage&& other) {
+  EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(DenseStorage&& other) {
     *this = other;
     return *this;
   }
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h
index 012dce10d1d..1a326ae3bae 100644
--- a/Eigen/src/Core/DeviceWrapper.h
+++ b/Eigen/src/Core/DeviceWrapper.h
@@ -87,7 +87,7 @@ template <typename Kernel, typename Device, int Traversal = Kernel::AssignmentTr
           int Unrolling = Kernel::AssignmentTraits::Unrolling>
 struct dense_assignment_loop_with_device {
   using Base = dense_assignment_loop<Kernel, Traversal, Unrolling>;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Device&) { Base::run(kernel); }
+  static EIGEN_DEVICE_FUNC constexpr void run(Kernel& kernel, Device&) { Base::run(kernel); }
 };
 
 // entry point for a generic expression with device
@@ -104,7 +104,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(De
   using ActualDstType = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&>;
   ActualDstType actualDst(dst.derived());
 
-  // TODO check whether this is the right place to perform these checks:
+  // TODO: check whether this is the right place to perform these checks:
   EIGEN_STATIC_ASSERT_LVALUE(Dst)
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned, Src)
   EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename ActualDstTypeCleaned::Scalar, typename Src::Scalar);
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index ff8611c6071..61a47842e01 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -71,14 +71,14 @@ class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_
   typedef typename internal::dense_xpr_base<Diagonal>::type Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
 
-  EIGEN_DEVICE_FUNC explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex)
+  EIGEN_DEVICE_FUNC constexpr explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex)
       : m_matrix(matrix), m_index(a_index) {
     eigen_assert(a_index <= m_matrix.cols() && -a_index <= m_matrix.rows());
   }
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
 
-  EIGEN_DEVICE_FUNC inline Index rows() const {
+  EIGEN_DEVICE_FUNC constexpr inline Index rows() const {
     return m_index.value() < 0 ? numext::mini<Index>(m_matrix.cols(), m_matrix.rows() + m_index.value())
                                : numext::mini<Index>(m_matrix.rows(), m_matrix.cols() - m_index.value());
   }
@@ -91,8 +91,12 @@ class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_
 
   typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
 
-  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
-  EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
+  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() {
+    return rows() > 0 ? &(m_matrix.coeffRef(rowOffset(), colOffset())) : nullptr;
+  }
+  EIGEN_DEVICE_FUNC inline const Scalar* data() const {
+    return rows() > 0 ? &(m_matrix.coeffRef(rowOffset(), colOffset())) : nullptr;
+  }
 
   EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index) {
     EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
@@ -120,11 +124,12 @@ class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_
     return m_matrix.coeff(idx + rowOffset(), idx + colOffset());
   }
 
-  EIGEN_DEVICE_FUNC inline const internal::remove_all_t<typename MatrixType::Nested>& nestedExpression() const {
+  EIGEN_DEVICE_FUNC constexpr inline const internal::remove_all_t<typename MatrixType::Nested>& nestedExpression()
+      const {
     return m_matrix;
   }
 
-  EIGEN_DEVICE_FUNC inline Index index() const { return m_index.value(); }
+  EIGEN_DEVICE_FUNC constexpr inline Index index() const { return m_index.value(); }
 
  protected:
   typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
@@ -132,15 +137,11 @@ class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_
 
  private:
   // some compilers may fail to optimize std::max etc in case of compile-time constants...
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index absDiagIndex() const noexcept {
+  EIGEN_DEVICE_FUNC constexpr Index absDiagIndex() const noexcept {
     return m_index.value() > 0 ? m_index.value() : -m_index.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const noexcept {
-    return m_index.value() > 0 ? 0 : -m_index.value();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const noexcept {
-    return m_index.value() > 0 ? m_index.value() : 0;
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rowOffset() const noexcept { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index colOffset() const noexcept { return m_index.value() > 0 ? m_index.value() : 0; }
   // trigger a compile-time error if someone try to call packet
   template <int LoadMode>
   typename MatrixType::PacketReturnType packet(Index) const;
@@ -157,13 +158,13 @@ class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_
  *
  * \sa class Diagonal */
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType MatrixBase<Derived>::diagonal() {
+EIGEN_DEVICE_FUNC constexpr typename MatrixBase<Derived>::DiagonalReturnType MatrixBase<Derived>::diagonal() {
   return DiagonalReturnType(derived());
 }
 
 /** This is the const version of diagonal(). */
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::ConstDiagonalReturnType MatrixBase<Derived>::diagonal()
+EIGEN_DEVICE_FUNC constexpr const typename MatrixBase<Derived>::ConstDiagonalReturnType MatrixBase<Derived>::diagonal()
     const {
   return ConstDiagonalReturnType(derived());
 }
@@ -180,13 +181,14 @@ EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::ConstDiagonalReturn
  *
  * \sa MatrixBase::diagonal(), class Diagonal */
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline Diagonal<Derived, DynamicIndex> MatrixBase<Derived>::diagonal(Index index) {
+EIGEN_DEVICE_FUNC constexpr Diagonal<Derived, DynamicIndex> MatrixBase<Derived>::diagonal(Index index) {
   return Diagonal<Derived, DynamicIndex>(derived(), index);
 }
 
 /** This is the const version of diagonal(Index). */
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline const Diagonal<const Derived, DynamicIndex> MatrixBase<Derived>::diagonal(Index index) const {
+EIGEN_DEVICE_FUNC constexpr const Diagonal<const Derived, DynamicIndex> MatrixBase<Derived>::diagonal(
+    Index index) const {
   return Diagonal<const Derived, DynamicIndex>(derived(), index);
 }
 
@@ -203,14 +205,14 @@ EIGEN_DEVICE_FUNC inline const Diagonal<const Derived, DynamicIndex> MatrixBase<
  * \sa MatrixBase::diagonal(), class Diagonal */
 template <typename Derived>
 template <int Index_>
-EIGEN_DEVICE_FUNC inline Diagonal<Derived, Index_> MatrixBase<Derived>::diagonal() {
+EIGEN_DEVICE_FUNC constexpr Diagonal<Derived, Index_> MatrixBase<Derived>::diagonal() {
   return Diagonal<Derived, Index_>(derived());
 }
 
 /** This is the const version of diagonal<int>(). */
 template <typename Derived>
 template <int Index_>
-EIGEN_DEVICE_FUNC inline const Diagonal<const Derived, Index_> MatrixBase<Derived>::diagonal() const {
+EIGEN_DEVICE_FUNC constexpr const Diagonal<const Derived, Index_> MatrixBase<Derived>::diagonal() const {
   return Diagonal<const Derived, Index_>(derived());
 }
 
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 52630d9297e..dda6c8c2fe9 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -61,7 +61,7 @@ class DiagonalBase : public EigenBase<Derived> {
   /**
    * Constructs a dense matrix from \c *this. Note, this directly returns a dense matrix type,
    * not an expression.
-   * \returns A dense matrix, with its diagonal entries set from the the derived object. */
+   * \returns A dense matrix, with its diagonal entries set from the derived object. */
   EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); }
 
   /** \returns a reference to the derived object's vector of diagonal coefficients. */
@@ -184,21 +184,22 @@ class DiagonalMatrix : public DiagonalBase<DiagonalMatrix<Scalar_, SizeAtCompile
 
  public:
   /** const version of diagonal(). */
-  EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
+  EIGEN_DEVICE_FUNC constexpr inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
   /** \returns a reference to the stored vector of diagonal coefficients. */
-  EIGEN_DEVICE_FUNC inline DiagonalVectorType& diagonal() { return m_diagonal; }
+  EIGEN_DEVICE_FUNC constexpr inline DiagonalVectorType& diagonal() { return m_diagonal; }
 
   /** Default constructor without initialization */
-  EIGEN_DEVICE_FUNC inline DiagonalMatrix() {}
+  EIGEN_DEVICE_FUNC constexpr inline DiagonalMatrix() {}
 
   /** Constructs a diagonal matrix with given dimension  */
-  EIGEN_DEVICE_FUNC explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
 
   /** 2D constructor. */
-  EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x, y) {}
+  EIGEN_DEVICE_FUNC constexpr inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x, y) {}
 
   /** 3D constructor. */
-  EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x, y, z) {}
+  EIGEN_DEVICE_FUNC constexpr inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z)
+      : m_diagonal(x, y, z) {}
 
   /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients.
    *
@@ -209,23 +210,24 @@ class DiagonalMatrix : public DiagonalBase<DiagonalMatrix<Scalar_, SizeAtCompile
    * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
    */
   template <typename... ArgTypes>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,
-                                                       const ArgTypes&... args)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,
+                                                                 const ArgTypes&... args)
       : m_diagonal(a0, a1, a2, args...) {}
 
   /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
-   * lists \cpp11
+   * lists
    */
   EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE DiagonalMatrix(
       const std::initializer_list<std::initializer_list<Scalar>>& list)
       : m_diagonal(list) {}
 
   /** \brief Constructs a DiagonalMatrix from an r-value diagonal vector type */
-  EIGEN_DEVICE_FUNC explicit inline DiagonalMatrix(DiagonalVectorType&& diag) : m_diagonal(std::move(diag)) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline DiagonalMatrix(DiagonalVectorType&& diag) : m_diagonal(std::move(diag)) {}
 
   /** Copy constructor. */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
+  EIGEN_DEVICE_FUNC constexpr inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other)
+      : m_diagonal(other.diagonal()) {}
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   /** copy constructor. prevent a default copy constructor from hiding the other templated constructor */
@@ -234,7 +236,8 @@ class DiagonalMatrix : public DiagonalBase<DiagonalMatrix<Scalar_, SizeAtCompile
 
   /** generic constructor from expression of the diagonal coefficients */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other)
+      : m_diagonal(other) {}
 
   /** Copy operator. */
   template <typename OtherDerived>
@@ -325,10 +328,11 @@ class DiagonalWrapper : public DiagonalBase<DiagonalWrapper<DiagonalVectorType_>
 #endif
 
   /** Constructor from expression of diagonal coefficients to wrap. */
-  EIGEN_DEVICE_FUNC explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal)
+      : m_diagonal(a_diagonal) {}
 
   /** \returns a const reference to the wrapped expression of diagonal coefficients. */
-  EIGEN_DEVICE_FUNC const DiagonalVectorType& diagonal() const { return m_diagonal; }
+  EIGEN_DEVICE_FUNC constexpr const DiagonalVectorType& diagonal() const { return m_diagonal; }
 
  protected:
   typename DiagonalVectorType::Nested m_diagonal;
@@ -344,7 +348,7 @@ class DiagonalWrapper : public DiagonalBase<DiagonalWrapper<DiagonalVectorType_>
  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
  **/
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived> MatrixBase<Derived>::asDiagonal() const {
+EIGEN_DEVICE_FUNC constexpr const DiagonalWrapper<const Derived> MatrixBase<Derived>::asDiagonal() const {
   return DiagonalWrapper<const Derived>(derived());
 }
 
@@ -372,6 +376,55 @@ bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const {
   return true;
 }
 
+/** \returns DiagonalWrapper.
+ *
+ * Example: \include MatrixBase_diagonalView.cpp
+ * Output: \verbinclude MatrixBase_diagonalView.out
+ *
+ * \sa diagonalView()
+ */
+
+/** This is the non-const version of diagonalView() with DiagIndex_ . */
+template <typename Derived>
+template <int DiagIndex_>
+EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<Derived, DiagIndex_>> MatrixBase<Derived>::diagonalView() {
+  typedef Diagonal<Derived, DiagIndex_> DiagType;
+  typedef DiagonalWrapper<DiagType> ReturnType;
+  DiagType diag(this->derived());
+  return ReturnType(diag);
+}
+
+/** This is the const version of diagonalView() with DiagIndex_ . */
+template <typename Derived>
+template <int DiagIndex_>
+EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<const Derived, DiagIndex_>> MatrixBase<Derived>::diagonalView()
+    const {
+  typedef Diagonal<const Derived, DiagIndex_> DiagType;
+  typedef DiagonalWrapper<DiagType> ReturnType;
+  DiagType diag(this->derived());
+  return ReturnType(diag);
+}
+
+/** This is the non-const version of diagonalView() with dynamic index. */
+template <typename Derived>
+EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<Derived, DynamicIndex>> MatrixBase<Derived>::diagonalView(
+    Index index) {
+  typedef Diagonal<Derived, DynamicIndex> DiagType;
+  typedef DiagonalWrapper<DiagType> ReturnType;
+  DiagType diag(this->derived(), index);
+  return ReturnType(diag);
+}
+
+/** This is the const version of diagonalView() with dynamic index. */
+template <typename Derived>
+EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<const Derived, DynamicIndex>> MatrixBase<Derived>::diagonalView(
+    Index index) const {
+  typedef Diagonal<const Derived, DynamicIndex> DiagType;
+  typedef DiagonalWrapper<DiagType> ReturnType;
+  DiagType diag(this->derived(), index);
+  return ReturnType(diag);
+}
+
 namespace internal {
 
 template <>
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 059527c85f8..d0a30dd8210 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -20,15 +20,14 @@ namespace internal {
 template <typename Derived, typename Scalar = typename traits<Derived>::Scalar>
 struct squared_norm_impl {
   using Real = typename NumTraits<Scalar>::Real;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Real run(const Derived& a) {
-    Scalar result = a.unaryExpr(squared_norm_functor<Scalar>()).sum();
-    return numext::real(result) + numext::imag(result);
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Real run(const Derived& a) {
+    return a.realView().cwiseAbs2().sum();
   }
 };
 
 template <typename Derived>
 struct squared_norm_impl<Derived, bool> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Derived& a) { return a.any(); }
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE bool run(const Derived& a) { return a.any(); }
 };
 
 }  // end namespace internal
@@ -46,7 +45,7 @@ struct squared_norm_impl<Derived, bool> {
  */
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
     typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
                                   typename internal::traits<OtherDerived>::Scalar>::ReturnType
     MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const {
@@ -57,19 +56,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
  * In both cases, it consists in the sum of the square of all the matrix entries.
- * For vectors, this is also equals to the dot product of \c *this with itself.
+ * For vectors, this is also equal to the dot product of \c *this with itself.
  *
  * \sa dot(), norm(), lpNorm()
  */
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::squaredNorm() const {
   return internal::squared_norm_impl<Derived>::run(derived());
 }
 
 /** \returns, for vectors, the \em l2 norm of \c *this, and for matrices the Frobenius norm.
  * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
- * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
+ * For vectors, this is also equal to the square root of the dot product of \c *this with itself.
  *
  * \sa lpNorm(), dot(), squaredNorm()
  */
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index c9a6e88e2f7..dfe4a640431 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -53,7 +53,7 @@ struct EigenBase {
   EIGEN_DEVICE_FUNC inline constexpr Derived& const_cast_derived() const {
     return *static_cast<Derived*>(const_cast<EigenBase*>(this));
   }
-  EIGEN_DEVICE_FUNC inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC constexpr inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
   EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
@@ -65,13 +65,13 @@ struct EigenBase {
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
   template <typename Dest>
-  EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const {
+  EIGEN_DEVICE_FUNC constexpr inline void evalTo(Dest& dst) const {
     derived().evalTo(dst);
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
   template <typename Dest>
-  EIGEN_DEVICE_FUNC inline void addTo(Dest& dst) const {
+  EIGEN_DEVICE_FUNC constexpr inline void addTo(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
     typename Dest::PlainObject res(rows(), cols());
@@ -81,7 +81,7 @@ struct EigenBase {
 
   /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
   template <typename Dest>
-  EIGEN_DEVICE_FUNC inline void subTo(Dest& dst) const {
+  EIGEN_DEVICE_FUNC constexpr inline void subTo(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
     typename Dest::PlainObject res(rows(), cols());
@@ -91,7 +91,7 @@ struct EigenBase {
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
   template <typename Dest>
-  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const {
+  EIGEN_DEVICE_FUNC constexpr inline void applyThisOnTheRight(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
     dst = dst * this->derived();
@@ -99,7 +99,7 @@ struct EigenBase {
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
   template <typename Dest>
-  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const {
+  EIGEN_DEVICE_FUNC constexpr inline void applyThisOnTheLeft(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
     dst = this->derived() * dst;
@@ -125,21 +125,21 @@ struct EigenBase {
  */
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC constexpr Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived>& other) {
   call_assignment(derived(), other.derived());
   return derived();
 }
 
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC constexpr Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived>& other) {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
 
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived>& other) {
+EIGEN_DEVICE_FUNC constexpr Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived>& other) {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
diff --git a/Eigen/src/Core/Fill.h b/Eigen/src/Core/Fill.h
index 9d4ecd445a4..ccbeb50f0c7 100644
--- a/Eigen/src/Core/Fill.h
+++ b/Eigen/src/Core/Fill.h
@@ -20,11 +20,14 @@ namespace internal {
 template <typename Xpr>
 struct eigen_fill_helper : std::false_type {};
 
+// Only enable std::fill_n for trivially copyable scalars.  GCC's libstdc++
+// fill_n pessimizes non-trivially-copyable types (extra moves per iteration),
+// causing measurable regressions for types like AutoDiffScalar (issue #2956).
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct eigen_fill_helper<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+struct eigen_fill_helper<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::is_trivially_copyable<Scalar> {};
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct eigen_fill_helper<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+struct eigen_fill_helper<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::is_trivially_copyable<Scalar> {};
 
 template <typename Xpr, int BlockRows, int BlockCols>
 struct eigen_fill_helper<Block<Xpr, BlockRows, BlockCols, /*InnerPanel*/ true>> : eigen_fill_helper<Xpr> {};
@@ -60,12 +63,12 @@ struct eigen_fill_impl<Xpr, /*use_fill*/ false> {
   using Func = scalar_constant_op<Scalar>;
   using PlainObject = typename Xpr::PlainObject;
   using Constant = typename PlainObject::ConstantReturnType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const Scalar& val) {
+  static EIGEN_DEVICE_FUNC constexpr void run(Xpr& dst, const Scalar& val) {
     const Constant src(dst.rows(), dst.cols(), val);
     run(dst, src);
   }
   template <typename SrcXpr>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
+  static EIGEN_DEVICE_FUNC constexpr void run(Xpr& dst, const SrcXpr& src) {
     call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
   }
 };
@@ -78,8 +81,9 @@ template <typename Xpr>
 struct eigen_fill_impl<Xpr, /*use_fill*/ true> {
   using Scalar = typename Xpr::Scalar;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const Scalar& val) {
+    const Scalar val_copy = val;
     using std::fill_n;
-    fill_n(dst.data(), dst.size(), val);
+    fill_n(dst.data(), dst.size(), val_copy);
   }
   template <typename SrcXpr>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
@@ -92,8 +96,10 @@ struct eigen_fill_impl<Xpr, /*use_fill*/ true> {
 
 template <typename Xpr>
 struct eigen_memset_helper {
-  static constexpr bool value =
-      std::is_trivially_copyable<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
+  using Scalar = typename Xpr::Scalar;
+  static constexpr bool value = std::is_trivially_copyable<Scalar>::value &&
+                                !static_cast<bool>(NumTraits<Scalar>::RequireInitialization) &&
+                                eigen_fill_helper<Xpr>::value;
 };
 
 template <typename Xpr>
@@ -101,12 +107,12 @@ struct eigen_zero_impl<Xpr, /*use_memset*/ false> {
   using Scalar = typename Xpr::Scalar;
   using PlainObject = typename Xpr::PlainObject;
   using Zero = typename PlainObject::ZeroReturnType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst) {
+  static EIGEN_DEVICE_FUNC constexpr void run(Xpr& dst) {
     const Zero src(dst.rows(), dst.cols());
     run(dst, src);
   }
   template <typename SrcXpr>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
+  static EIGEN_DEVICE_FUNC constexpr void run(Xpr& dst, const SrcXpr& src) {
     call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
   }
 };
@@ -114,17 +120,15 @@ struct eigen_zero_impl<Xpr, /*use_memset*/ false> {
 template <typename Xpr>
 struct eigen_zero_impl<Xpr, /*use_memset*/ true> {
   using Scalar = typename Xpr::Scalar;
-  static constexpr size_t max_bytes = (std::numeric_limits<std::ptrdiff_t>::max)();
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst) {
-    const size_t num_bytes = dst.size() * sizeof(Scalar);
-    if (num_bytes == 0) return;
+    const std::ptrdiff_t num_bytes = dst.size() * static_cast<std::ptrdiff_t>(sizeof(Scalar));
+    if (num_bytes <= 0) return;
     void* dst_ptr = static_cast<void*>(dst.data());
 #ifndef EIGEN_NO_DEBUG
-    if (num_bytes > max_bytes) throw_std_bad_alloc();
     eigen_assert((dst_ptr != nullptr) && "null pointer dereference error!");
 #endif
     EIGEN_USING_STD(memset);
-    memset(dst_ptr, 0, num_bytes);
+    memset(dst_ptr, 0, static_cast<std::size_t>(num_bytes));
   }
   template <typename SrcXpr>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
diff --git a/Eigen/src/Core/FindCoeff.h b/Eigen/src/Core/FindCoeff.h
index 0102e8af3ab..b2645d8e83e 100644
--- a/Eigen/src/Core/FindCoeff.h
+++ b/Eigen/src/Core/FindCoeff.h
@@ -34,11 +34,11 @@ struct max_coeff_functor {
 
 template <typename Scalar>
 struct max_coeff_functor<Scalar, PropagateNaN, false> {
-  EIGEN_DEVICE_FUNC inline Scalar compareCoeff(const Scalar& incumbent, const Scalar& candidate) {
+  EIGEN_DEVICE_FUNC inline Scalar compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
     return (candidate > incumbent) || ((candidate != candidate) && (incumbent == incumbent));
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) {
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
     return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(incumbent));
   }
   template <typename Packet>
@@ -79,11 +79,11 @@ struct min_coeff_functor {
 
 template <typename Scalar>
 struct min_coeff_functor<Scalar, PropagateNaN, false> {
-  EIGEN_DEVICE_FUNC inline Scalar compareCoeff(const Scalar& incumbent, const Scalar& candidate) {
+  EIGEN_DEVICE_FUNC inline Scalar compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
     return (candidate < incumbent) || ((candidate != candidate) && (incumbent == incumbent));
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) {
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
     return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(incumbent));
   }
   template <typename Packet>
@@ -173,6 +173,10 @@ struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ true> {
                                            Index& inner) {
     Index outerSize = eval.outerSize();
     Index innerSize = eval.innerSize();
+    if (innerSize < PacketSize) {
+      ScalarImpl::run(eval, func, result, outer, inner);
+      return;
+    }
     Index packetEnd = numext::round_down(innerSize, PacketSize);
 
     /* initialization performed in calling function */
@@ -229,6 +233,10 @@ struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ true> {
 
   static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& index) {
     Index size = eval.size();
+    if (size < PacketSize) {
+      ScalarImpl::run(eval, func, result, index);
+      return;
+    }
     Index packetEnd = numext::round_down(size, PacketSize);
 
     /* initialization performed in calling function */
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index 55beab35a19..4f69c20f93a 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -39,7 +39,7 @@ class ForceAlignedAccess : public internal::dense_xpr_base<ForceAlignedAccess<Ex
   typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
 
-  EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+  EIGEN_DEVICE_FUNC explicit constexpr ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
   EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
   EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
@@ -103,25 +103,6 @@ inline ForceAlignedAccess<Derived> MatrixBase<Derived>::forceAlignedAccess() {
   return ForceAlignedAccess<Derived>(derived());
 }
 
-/** \returns an expression of *this with forced aligned access if \a Enable is true.
- * \sa forceAlignedAccess(), class ForceAlignedAccess
- */
-template <typename Derived>
-template <bool Enable>
-inline add_const_on_value_type_t<std::conditional_t<Enable, ForceAlignedAccess<Derived>, Derived&>>
-MatrixBase<Derived>::forceAlignedAccessIf() const {
-  return derived();  // FIXME This should not work but apparently is never used
-}
-
-/** \returns an expression of *this with forced aligned access if \a Enable is true.
- * \sa forceAlignedAccess(), class ForceAlignedAccess
- */
-template <typename Derived>
-template <bool Enable>
-inline std::conditional_t<Enable, ForceAlignedAccess<Derived>, Derived&> MatrixBase<Derived>::forceAlignedAccessIf() {
-  return derived();  // FIXME This should not work but apparently is never used
-}
-
 }  // end namespace Eigen
 
 #endif  // EIGEN_FORCEALIGNEDACCESS_H
diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h
index ed6b4ffead7..eaa553c93cd 100644
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -86,8 +86,8 @@ struct isMuchSmallerThan_scalar_selector<Derived, true> {
  */
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(const DenseBase<OtherDerived>& other,
-                                                    const RealScalar& prec) const {
+EIGEN_DEVICE_FUNC constexpr bool DenseBase<Derived>::isApprox(const DenseBase<OtherDerived>& other,
+                                                              const RealScalar& prec) const {
   return internal::isApprox_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
 }
 
@@ -105,8 +105,8 @@ EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(const DenseBase<OtherDerived
  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
  */
 template <typename Derived>
-EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(const typename NumTraits<Scalar>::Real& other,
-                                                             const RealScalar& prec) const {
+EIGEN_DEVICE_FUNC constexpr bool DenseBase<Derived>::isMuchSmallerThan(const typename NumTraits<Scalar>::Real& other,
+                                                                       const RealScalar& prec) const {
   return internal::isMuchSmallerThan_scalar_selector<Derived>::run(derived(), other, prec);
 }
 
@@ -122,8 +122,8 @@ EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(const typename NumT
  */
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(const DenseBase<OtherDerived>& other,
-                                                             const RealScalar& prec) const {
+EIGEN_DEVICE_FUNC constexpr bool DenseBase<Derived>::isMuchSmallerThan(const DenseBase<OtherDerived>& other,
+                                                                       const RealScalar& prec) const {
   return internal::isMuchSmallerThan_object_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
 }
 
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index e4c51d2a6f6..707611a82ff 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -89,7 +89,7 @@ struct product_type {
 /* The following allows to select the kind of product at compile time
  * based on the three dimensions of the product.
  * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
-// FIXME I'm not sure the current mapping is the ideal one.
+// FIXME: the current compile-time product-type mapping may not be optimal.
 template <int M, int N>
 struct product_type_selector<M, N, 1> {
   enum { ret = OuterProduct };
@@ -193,12 +193,11 @@ struct product_type_selector<Large, Large, Small> {
  *  Implementation of Inner Vector Vector Product
  ***********************************************************************/
 
-// FIXME : maybe the "inner product" could return a Scalar
-// instead of a 1x1 matrix ??
-// Pro: more natural for the user
-// Cons: this could be a problem if in a meta unrolled algorithm a matrix-matrix
-// product ends up to a row-vector times col-vector product... To tackle this use
-// case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
+// FIXME: consider returning a Scalar instead of a 1x1 matrix for inner products.
+// Pro: more natural for the user.
+// Con: in a meta-unrolled algorithm a matrix-matrix product may reduce to a
+// row-vector times column-vector product. To handle this, we could specialize
+// Block<MatrixType,1,1> with operator=(Scalar x).
 
 /***********************************************************************
  *  Implementation of Outer Vector Vector Product
@@ -208,7 +207,7 @@ struct product_type_selector<Large, Large, Small> {
  *  Implementation of General Matrix Vector Product
  ***********************************************************************/
 
-/*  According to the shape/flags of the matrix we have to distinghish 3 different cases:
+/*  According to the shape/flags of the matrix we have to distinguish 3 different cases:
  *   1 - the matrix is col-major, BLAS compatible and M is large => call fast BLAS-like colmajor routine
  *   2 - the matrix is row-major, BLAS compatible and N is large => call fast BLAS-like rowmajor routine
  *   3 - all other cases are handled using a simple loop along the outer-storage direction.
@@ -229,7 +228,7 @@ struct gemv_static_vector_if;
 
 template <typename Scalar, int Size, int MaxSize>
 struct gemv_static_vector_if<Scalar, Size, MaxSize, false> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Scalar* data() {
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() {
     eigen_internal_assert(false && "should never be called");
     return 0;
   }
@@ -237,19 +236,19 @@ struct gemv_static_vector_if<Scalar, Size, MaxSize, false> {
 
 template <typename Scalar, int Size>
 struct gemv_static_vector_if<Scalar, Size, Dynamic, true> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Scalar* data() { return 0; }
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() { return 0; }
 };
 
 template <typename Scalar, int Size, int MaxSize>
 struct gemv_static_vector_if<Scalar, Size, MaxSize, true> {
 #if EIGEN_MAX_STATIC_ALIGN_BYTES != 0
   internal::plain_array<Scalar, internal::min_size_prefer_fixed(Size, MaxSize), 0, AlignedMax> m_data;
-  EIGEN_STRONG_INLINE constexpr Scalar* data() { return m_data.array; }
+  constexpr Scalar* data() { return m_data.array; }
 #else
   // Some architectures cannot align on the stack,
   // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   internal::plain_array<Scalar, internal::min_size_prefer_fixed(Size, MaxSize) + EIGEN_MAX_ALIGN_BYTES, 0> m_data;
-  EIGEN_STRONG_INLINE constexpr Scalar* data() {
+  constexpr Scalar* data() {
     return reinterpret_cast<Scalar*>((std::uintptr_t(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES - 1))) +
                                      EIGEN_MAX_ALIGN_BYTES);
   }
@@ -293,7 +292,7 @@ struct gemv_dense_selector<OnTheRight, ColMajor, true> {
     typedef std::conditional_t<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr> ActualDest;
 
     enum {
-      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // FIXME: find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
       EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime == 1),
       ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
@@ -376,7 +375,7 @@ struct gemv_dense_selector<OnTheRight, RowMajor, true> {
     ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
 
     enum {
-      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // FIXME: find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
       DirectlyUseRhs =
           ActualRhsTypeCleaned::InnerStrideAtCompileTime == 1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime == 0
@@ -417,7 +416,7 @@ struct gemv_dense_selector<OnTheRight, ColMajor, false> {
   static void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
     EIGEN_STATIC_ASSERT((!nested_eval<Lhs, 1>::Evaluate),
                         EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
-    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory,
+    // TODO: if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory,
     // otherwise use a temp
     typename nested_eval<Rhs, 1>::type actual_rhs(rhs);
     const Index size = rhs.rows();
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index e1d62fa170d..4ef92dbf132 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -57,15 +57,14 @@ struct default_packet_traits {
     HasConj = 1,
     HasSetLinear = 1,
     HasSign = 1,
+    HasAbsDiff = 1,
     // By default, the nearest integer functions (rint, round, floor, ceil, trunc) are enabled for all scalar and packet
     // types
     HasRound = 1,
 
     HasArg = 0,
-    HasAbsDiff = 0,
-    HasBlend = 0,
     // This flag is used to indicate whether packet comparison is supported.
-    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    // pcmp_eq and pcmp_lt should be defined for it to be true.
     HasCmp = 0,
 
     HasDiv = 0,
@@ -88,6 +87,8 @@ struct default_packet_traits {
     HasATanh = 0,
     HasSinh = 0,
     HasCosh = 0,
+    HasASinh = 0,
+    HasACosh = 0,
     HasTanh = 0,
     HasLGamma = 0,
     HasDiGamma = 0,
@@ -117,6 +118,7 @@ struct packet_traits : default_packet_traits {
   enum {
     HasAdd = 0,
     HasSub = 0,
+    HasAbsDiff = 0,
     HasMul = 0,
     HasNegate = 0,
     HasAbs = 0,
@@ -131,17 +133,18 @@ struct packet_traits : default_packet_traits {
 template <typename T>
 struct packet_traits<const T> : packet_traits<T> {};
 
+struct default_unpacket_traits {  // capability flags inherited by the generic unpacket_traits<T> below
+  enum { vectorizable = false, masked_load_available = false, masked_store_available = false };
+};
+
 template <typename T>
-struct unpacket_traits {
+struct unpacket_traits : default_unpacket_traits {
   typedef T type;
   typedef T half;
   typedef typename numext::get_integer_by_size<sizeof(T)>::signed_type integer_packet;
   enum {
     size = 1,
     alignment = alignof(T),
-    vectorizable = false,
-    masked_load_available = false,
-    masked_store_available = false
   };
 };
 
@@ -253,6 +256,12 @@ struct preinterpret_generic<Packet, Packet, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& a) { return a; }
 };
 
+template <typename ComplexPacket>  // specialization: Target is unpacket_traits<ComplexPacket>::as_real
+struct preinterpret_generic<typename unpacket_traits<ComplexPacket>::as_real, ComplexPacket, false> {
+  using RealPacket = typename unpacket_traits<ComplexPacket>::as_real;  // result type; run() returns the member a.v
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealPacket run(const ComplexPacket& a) { return a.v; }
+};
+
 /** \internal \returns reinterpret_cast<Target>(a) */
 template <typename Target, typename Packet>
 EIGEN_DEVICE_FUNC inline Target preinterpret(const Packet& a) {
@@ -426,30 +435,6 @@ EIGEN_DEVICE_FUNC inline Packet pzero(const Packet& a) {
   return pzero_impl<Packet>::run(a);
 }
 
-/** \internal \returns a <= b as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_le(const Packet& a, const Packet& b) {
-  return a <= b ? ptrue(a) : pzero(a);
-}
-
-/** \internal \returns a < b as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_lt(const Packet& a, const Packet& b) {
-  return a < b ? ptrue(a) : pzero(a);
-}
-
-/** \internal \returns a == b as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_eq(const Packet& a, const Packet& b) {
-  return a == b ? ptrue(a) : pzero(a);
-}
-
-/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) {
-  return a >= b ? pzero(a) : ptrue(a);
-}
-
 template <typename T>
 struct bit_and {
   EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
@@ -576,6 +561,30 @@ EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) {
   return pand(a, pnot(b));
 }
 
+/** \internal \returns a < b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_lt(const Packet& a, const Packet& b) {
+  return a < b ? ptrue(a) : pzero(a);
+}
+
+/** \internal \returns a == b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_eq(const Packet& a, const Packet& b) {
+  return a == b ? ptrue(a) : pzero(a);
+}
+
+/** \internal \returns a <= b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_le(const Packet& a, const Packet& b) {
+  return por(pcmp_eq(a, b), pcmp_lt(a, b));
+}
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) {
+  return a >= b ? pzero(a) : ptrue(a);
+}
+
 // In the general case, use bitwise select.
 template <typename Packet, bool is_scalar = is_scalar<Packet>::value>
 struct pselect_impl {
@@ -603,7 +612,7 @@ EIGEN_DEVICE_FUNC inline bool pselect<bool>(const bool& cond, const bool& a, con
   return cond ? a : b;
 }
 
-/** \internal \returns the min or of \a a and \a b (coeff-wise)
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
     If either \a a or \a b are NaN, the result is implementation defined. */
 template <int NaNPropagation, bool IsInteger>
 struct pminmax_impl {
@@ -641,7 +650,7 @@ struct pminmax_impl<PropagateNumbers, false> {
 #define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& aa, const Type& bb) { return Func(aa, bb); }
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise).
-    If \a a or \b b is NaN, the return value is implementation defined. */
+    If \a a or \a b is NaN, the return value is implementation defined. */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
   return numext::mini(a, b);
@@ -656,7 +665,7 @@ EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
 }
 
 /** \internal \returns the max of \a a and \a b  (coeff-wise)
-    If \a a or \b b is NaN, the return value is implementation defined. */
+    If \a a or \a b is NaN, the return value is implementation defined. */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
   return numext::maxi(a, b);
@@ -742,9 +751,15 @@ EIGEN_DEVICE_FUNC inline Packet pldexp(const Packet& a, const Packet& exponent)
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
 template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pabsdiff(const Packet& a, const Packet& b) {
+EIGEN_DEVICE_FUNC inline std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsInteger, Packet>
+pabsdiff(const Packet& a, const Packet& b) {
   return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b));
 }
+template <typename Packet>  // overload selected when NumTraits<scalar>::IsInteger is false
+EIGEN_DEVICE_FUNC inline std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsInteger, Packet>
+pabsdiff(const Packet& a, const Packet& b) {
+  return pabs(psub(a, b));  // absolute value of the difference a - b
+}
 
 /** \internal \returns a packet version of \a *from, from must be properly aligned */
 template <typename Packet>
@@ -808,10 +823,24 @@ EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits<Packet>::ty
 template <typename Packet, typename BitsType>
 EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a);
 
+template <typename Scalar, std::enable_if_t<std::is_trivially_copyable<Scalar>::value, int> = 0>
+EIGEN_DEVICE_FUNC inline Scalar pload1_scalar(const Scalar* a) {  // overload for trivially copyable Scalar
+  Scalar scalar;  // fully overwritten by the memcpy below
+  EIGEN_USING_STD(memcpy)
+  memcpy(&scalar, a, sizeof(Scalar));  // byte-wise read of *a rather than a typed dereference
+  return scalar;
+}
+
+template <typename Scalar, std::enable_if_t<!std::is_trivially_copyable<Scalar>::value, int> = 0>
+EIGEN_DEVICE_FUNC inline Scalar pload1_scalar(const Scalar* a) {  // overload for non-trivially-copyable Scalar
+  return Scalar(*a);  // copy-construct the result from the pointee
+}
+
 /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits<Packet>::type* a) {
-  return pset1<Packet>(*a);
+  using Scalar = typename unpacket_traits<Packet>::type;
+  return pset1<Packet>(pload1_scalar<Scalar>(a));
 }
 
 /** \internal \returns a packet with elements of \a *from duplicated.
@@ -821,7 +850,7 @@ EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits<Packet>::t
  */
 template <typename Packet>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits<Packet>::type* from) {
-  return *from;
+  return pload1<Packet>(from);
 }
 
 /** \internal \returns a packet with elements of \a *from quadrupled.
@@ -997,12 +1026,26 @@ EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) {
   return a;
 }
 
-/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
+/** \internal \returns \a a with real and imaginary parts flipped (for complex types only) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) {
   return Packet(numext::imag(a), numext::real(a));
 }
 
+/** \internal \returns Packet(real(a), real(a)), i.e. \a a with real part duplicated (for complex types only) */
+// TODO(rmlarsen): Define and use in all complex backends.
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pdupreal(const Packet& a) {
+  return Packet(numext::real(a), numext::real(a));
+}
+
+/** \internal \returns Packet(imag(a), imag(a)), i.e. \a a with imaginary part duplicated (for complex types only) */
+// TODO(rmlarsen): Define and use in all complex backends.
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pdupimag(const Packet& a) {
+  return Packet(numext::imag(a), numext::imag(a));
+}
+
 /**************************
  * Special math functions
  ***************************/
@@ -1091,6 +1134,20 @@ EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh(const Packet&
   return atanh(a);
 }
 
+/** \internal \returns the inverse hyperbolic sine of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasinh(const Packet& a) {
+  EIGEN_USING_STD(asinh);
+  return asinh(a);  // generic version: one unqualified asinh call on the whole argument
+}
+
+/** \internal \returns the inverse hyperbolic cosine of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacosh(const Packet& a) {
+  EIGEN_USING_STD(acosh);
+  return acosh(a);  // generic version: one unqualified acosh call on the whole argument
+}
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template <typename Packet>
 EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet& a) {
@@ -1219,7 +1276,7 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Pac
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline std::conditional_t<(unpacket_traits<Packet>::size % 8) == 0,
                                             typename unpacket_traits<Packet>::half, Packet>
-predux_half_dowto4(const Packet& a) {
+predux_half(const Packet& a) {
   return a;
 }
 
@@ -1302,9 +1359,7 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const
 /** \internal \returns true if all coeffs of \a a means "true"
  * It is supposed to be called on values returned by pcmp_*.
  */
-// not needed yet
-// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
-// { return bool(a); }
+// TODO: implement predux_all when needed.
 
 /** \internal \returns true if any coeffs of \a a means "true"
  * It is supposed to be called on values returned by pcmp_*.
@@ -1337,27 +1392,27 @@ struct pmadd_impl {
     return psub(c, pmul(a, b));
   }
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
-    return pnegate(pmadd(a, b, c));
+    return pnegate(padd(pmul(a, b), c));
   }
 };
 
 template <typename Scalar>
 struct pmadd_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value && NumTraits<Scalar>::IsSigned>> {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
-    return numext::fma(a, b, c);
+    return numext::madd<Scalar>(a, b, c);
   }
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
-    return numext::fma(a, b, Scalar(-c));
+    return numext::madd<Scalar>(a, b, Scalar(-c));
   }
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
-    return numext::fma(Scalar(-a), b, c);
+    return numext::madd<Scalar>(Scalar(-a), b, c);
   }
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
-    return -Scalar(numext::fma(a, b, c));
+    return -Scalar(numext::madd<Scalar>(a, b, c));
   }
 };
 
-// FMA instructions.
+// Multiply-add instructions.
 /** \internal \returns a * b + c (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
@@ -1471,26 +1526,11 @@ struct PacketBlock {
   Packet packet[N];
 };
 
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet, 1>& /*kernel*/) {
+template <typename Packet, int size = 1>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet, size>& /*kernel*/) {
   // Nothing to do in the scalar case, i.e. a 1x1 matrix.
 }
 
-/***************************************************************************
- * Selector, i.e. vector of N boolean values used to select (i.e. blend)
- * words from 2 packets.
- ***************************************************************************/
-template <size_t N>
-struct Selector {
-  bool select[N];
-};
-
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket,
-                                       const Packet& thenPacket, const Packet& elsePacket) {
-  return ifPacket.select[0] ? thenPacket : elsePacket;
-}
-
 /** \internal \returns 1 / a (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet preciprocal(const Packet& a) {
@@ -1596,9 +1636,10 @@ EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Pac
   using Scalar = typename unpacket_traits<Packet>::type;
   constexpr Index PacketSize = unpacket_traits<Packet>::size;
   eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
-  Scalar aux[PacketSize];
-  memset(static_cast<void*>(aux), 0x00, sizeof(Scalar) * PacketSize);
-  smart_copy(from + begin, from + begin + count, aux + begin);
+  Scalar aux[PacketSize] = {};
+  for (Index k = begin; k < begin + count; k++) {
+    aux[k] = from[k];
+  }
   return ploadu<Packet>(aux);
 }
 
@@ -1619,7 +1660,9 @@ EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Ind
   eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
   Scalar aux[PacketSize];
   pstoreu<Scalar, Packet>(aux, from);
-  smart_copy(aux + begin, aux + begin + count, to + begin);
+  for (Index k = begin; k < begin + count; k++) {
+    to[k] = aux[k];
+  }
 }
 
 /** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index df1098e27e6..c32aac9acc5 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -130,12 +130,12 @@ using GlobalUnaryPowReturnType = std::enable_if_t<
  */
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 template <typename Derived, typename ScalarExponent>
-EIGEN_DEVICE_FUNC inline const GlobalUnaryPowReturnType<Derived, ScalarExponent> pow(const Eigen::ArrayBase<Derived>& x,
-                                                                                     const ScalarExponent& exponent);
+EIGEN_DEVICE_FUNC constexpr inline const GlobalUnaryPowReturnType<Derived, ScalarExponent> pow(
+    const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
 #else
 template <typename Derived, typename ScalarExponent>
-EIGEN_DEVICE_FUNC inline const GlobalUnaryPowReturnType<Derived, ScalarExponent> pow(const Eigen::ArrayBase<Derived>& x,
-                                                                                     const ScalarExponent& exponent) {
+EIGEN_DEVICE_FUNC constexpr inline const GlobalUnaryPowReturnType<Derived, ScalarExponent> pow(
+    const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
   return GlobalUnaryPowReturnType<Derived, ScalarExponent>(
       x.derived(), internal::scalar_unary_pow_op<typename Derived::Scalar, ScalarExponent>(exponent));
 }
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index 0a1b583d6cc..e2362fcdc55 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -65,7 +65,7 @@ struct IOFormat {
         fill(_fill),
         precision(_precision),
         flags(_flags) {
-    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
+    // TODO: check if rowPrefix, rowSuffix or rowSeparator contains a newline
     // don't add rowSpacer if columns are not to be aligned
     if ((flags & DontAlignCols)) return;
     int i = int(matPrefix.length()) - 1;
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index 358239ca86a..150565ad6a0 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -59,7 +59,7 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices>> : traits<XprType> {
     ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
     ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
 
-    // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
+    // FIXME: we deal with compile-time strides if and only if we have DirectAccessBit flag,
     // but this is too strict regarding negative strides...
     DirectAccessMask = (int(InnerIncr) != Undefined && int(OuterIncr) != Undefined && InnerIncr >= 0 && OuterIncr >= 0)
                            ? DirectAccessBit
@@ -259,26 +259,27 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
     Alignment = 0
   };
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) {
+  EIGEN_DEVICE_FUNC constexpr explicit unary_evaluator(const XprType& xpr)
+      : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
                  m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
     return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
     eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
                  m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
     return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
     EIGEN_STATIC_ASSERT_LVALUE(XprType)
     Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
     Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
@@ -287,7 +288,7 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
     return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const {
     Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
     Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
     eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
@@ -295,7 +296,7 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
     return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index index) const {
     Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
     Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
     eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
@@ -308,6 +309,10 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
   const XprType& m_xpr;
 };
 
+// Catch assignments to an IndexedView.
+template <typename ArgType, typename RowIndices, typename ColIndices>
+struct evaluator_assume_aliasing<IndexedView<ArgType, RowIndices, ColIndices>> : std::true_type {};
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/InnerProduct.h b/Eigen/src/Core/InnerProduct.h
index 9849d9b1fe4..9c57e7a7aa2 100644
--- a/Eigen/src/Core/InnerProduct.h
+++ b/Eigen/src/Core/InnerProduct.h
@@ -142,31 +142,36 @@ struct inner_product_impl<Evaluator, true> {
     const UnsignedIndex numPackets = size / PacketSize;
     const UnsignedIndex numRemPackets = (packetEnd - quadEnd) / PacketSize;
 
-    Packet presult0, presult1, presult2, presult3;
-
-    presult0 = eval.template packet<Packet>(0 * PacketSize);
-    if (numPackets >= 2) presult1 = eval.template packet<Packet>(1 * PacketSize);
-    if (numPackets >= 3) presult2 = eval.template packet<Packet>(2 * PacketSize);
-    if (numPackets >= 4) {
-      presult3 = eval.template packet<Packet>(3 * PacketSize);
-
-      for (UnsignedIndex k = 4 * PacketSize; k < quadEnd; k += 4 * PacketSize) {
-        presult0 = eval.packet(presult0, k + 0 * PacketSize);
-        presult1 = eval.packet(presult1, k + 1 * PacketSize);
-        presult2 = eval.packet(presult2, k + 2 * PacketSize);
-        presult3 = eval.packet(presult3, k + 3 * PacketSize);
+    Packet presult0 = eval.template packet<Packet>(0 * PacketSize);
+    if (numPackets >= 2) {
+      Packet presult1 = eval.template packet<Packet>(1 * PacketSize);
+      if (numPackets >= 3) {
+        Packet presult2 = eval.template packet<Packet>(2 * PacketSize);
+        if (numPackets >= 4) {
+          Packet presult3 = eval.template packet<Packet>(3 * PacketSize);
+
+          for (UnsignedIndex k = 4 * PacketSize; k < quadEnd; k += 4 * PacketSize) {
+            presult0 = eval.packet(presult0, k + 0 * PacketSize);
+            presult1 = eval.packet(presult1, k + 1 * PacketSize);
+            presult2 = eval.packet(presult2, k + 2 * PacketSize);
+            presult3 = eval.packet(presult3, k + 3 * PacketSize);
+          }
+
+          if (numRemPackets >= 1) {
+            presult0 = eval.packet(presult0, quadEnd + 0 * PacketSize);
+            if (numRemPackets >= 2) {
+              presult1 = eval.packet(presult1, quadEnd + 1 * PacketSize);
+              if (numRemPackets == 3) presult2 = eval.packet(presult2, quadEnd + 2 * PacketSize);
+            }
+          }
+
+          presult2 = padd(presult2, presult3);
+        }
+        presult1 = padd(presult1, presult2);
       }
-
-      if (numRemPackets >= 1) presult0 = eval.packet(presult0, quadEnd + 0 * PacketSize);
-      if (numRemPackets >= 2) presult1 = eval.packet(presult1, quadEnd + 1 * PacketSize);
-      if (numRemPackets == 3) presult2 = eval.packet(presult2, quadEnd + 2 * PacketSize);
-
-      presult2 = padd(presult2, presult3);
+      presult0 = padd(presult0, presult1);
     }
 
-    if (numPackets >= 3) presult1 = padd(presult1, presult2);
-    if (numPackets >= 2) presult0 = padd(presult0, presult1);
-
     Scalar result = predux(presult0);
     for (UnsignedIndex k = packetEnd; k < size; k++) {
       result = eval.coeff(result, k);
@@ -211,8 +216,14 @@ struct scalar_inner_product_op {
   static constexpr bool PacketAccess = false;
 };
 
+// Partial specialization for packet access if and only if
+// LhsScalar == RhsScalar == ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType.
 template <typename Scalar, bool Conj>
-struct scalar_inner_product_op<Scalar, Scalar, Conj> {
+struct scalar_inner_product_op<
+    Scalar,
+    std::enable_if_t<internal::is_same<typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType, Scalar>::value,
+                     Scalar>,
+    Conj> {
   using result_type = Scalar;
   using conj_helper = conditional_conj<Scalar, Conj>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& a, const Scalar& b) const {
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index 79fc3ab6a5a..855e3b3e2d4 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -49,12 +49,12 @@ class Inverse : public InverseImpl<XprType, typename internal::traits<XprType>::
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef internal::remove_all_t<XprType> NestedExpression;
 
-  explicit EIGEN_DEVICE_FUNC Inverse(const XprType& xpr) : m_xpr(xpr) {}
+  explicit EIGEN_DEVICE_FUNC constexpr Inverse(const XprType& xpr) : m_xpr(xpr) {}
 
   EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.cols(); }
   EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.rows(); }
 
-  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
+  EIGEN_DEVICE_FUNC constexpr const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
 
  protected:
   XprTypeNested m_xpr;
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index c740da72603..33b62c2028b 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -100,7 +100,7 @@ class Map : public MapBase<Map<PlainObjectType, MapOptions, StrideType> > {
 
   typedef typename Base::PointerType PointerType;
   typedef PointerType PointerArgType;
-  EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
+  EIGEN_DEVICE_FUNC constexpr inline PointerType cast_to_pointer_type(PointerArgType ptr) const { return ptr; }
 
   EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
@@ -120,7 +120,7 @@ class Map : public MapBase<Map<PlainObjectType, MapOptions, StrideType> > {
    * \param dataPtr pointer to the array to map
    * \param stride optional Stride object, passing the strides.
    */
-  EIGEN_DEVICE_FUNC explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+  EIGEN_DEVICE_FUNC constexpr explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
       : Base(cast_to_pointer_type(dataPtr)), m_stride(stride) {}
 
   /** Constructor in the dynamic-size vector case.
@@ -129,7 +129,7 @@ class Map : public MapBase<Map<PlainObjectType, MapOptions, StrideType> > {
    * \param size the size of the vector expression
    * \param stride optional Stride object, passing the strides.
    */
-  EIGEN_DEVICE_FUNC inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+  EIGEN_DEVICE_FUNC constexpr inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
       : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride) {}
 
   /** Constructor in the dynamic-size matrix case.
@@ -139,7 +139,8 @@ class Map : public MapBase<Map<PlainObjectType, MapOptions, StrideType> > {
    * \param cols the number of columns of the matrix expression
    * \param stride optional Stride object, passing the strides.
    */
-  EIGEN_DEVICE_FUNC inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+  EIGEN_DEVICE_FUNC constexpr inline Map(PointerArgType dataPtr, Index rows, Index cols,
+                                         const StrideType& stride = StrideType())
       : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride) {}
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 5e3d746bcd2..3e117253a93 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -97,23 +97,23 @@ class MapBase<Derived, ReadOnlyAccessors> : public internal::dense_xpr_base<Deri
   EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_data; }
 
   /** \copydoc PlainObjectBase::coeff(Index,Index) const */
-  EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index rowId, Index colId) const {
+  EIGEN_DEVICE_FUNC constexpr inline const Scalar& coeff(Index rowId, Index colId) const {
     return m_data[colId * colStride() + rowId * rowStride()];
   }
 
   /** \copydoc PlainObjectBase::coeff(Index) const */
-  EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr inline const Scalar& coeff(Index index) const {
     EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
     return m_data[index * innerStride()];
   }
 
   /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
-  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
+  EIGEN_DEVICE_FUNC constexpr inline const Scalar& coeffRef(Index rowId, Index colId) const {
     return this->m_data[colId * colStride() + rowId * rowStride()];
   }
 
   /** \copydoc PlainObjectBase::coeffRef(Index) const */
-  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr inline const Scalar& coeffRef(Index index) const {
     EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
     return this->m_data[index * innerStride()];
   }
@@ -132,14 +132,14 @@ class MapBase<Derived, ReadOnlyAccessors> : public internal::dense_xpr_base<Deri
   }
 
   /** \internal Constructor for fixed size matrices or vectors */
-  EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr)
+  EIGEN_DEVICE_FUNC constexpr explicit inline MapBase(PointerType dataPtr)
       : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) {
     EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
     checkSanity<Derived>();
   }
 
   /** \internal Constructor for dynamically sized vectors */
-  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize)
+  EIGEN_DEVICE_FUNC constexpr inline MapBase(PointerType dataPtr, Index vecSize)
       : m_data(dataPtr),
         m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
         m_cols(ColsAtCompileTime == Dynamic ? vecSize : Index(ColsAtCompileTime)) {
@@ -150,7 +150,7 @@ class MapBase<Derived, ReadOnlyAccessors> : public internal::dense_xpr_base<Deri
   }
 
   /** \internal Constructor for dynamically sized matrices */
-  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols)
+  EIGEN_DEVICE_FUNC constexpr inline MapBase(PointerType dataPtr, Index rows, Index cols)
       : m_data(dataPtr), m_rows(rows), m_cols(cols) {
     eigen_assert((dataPtr == 0) || (rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) &&
                                     cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
@@ -238,11 +238,11 @@ class MapBase<Derived, WriteAccessors> : public MapBase<Derived, ReadOnlyAccesso
     return this->m_data;
   }  // no const-cast here so non-const-correct code will give a compile error
 
-  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC constexpr inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col) {
     return this->m_data[col * colStride() + row * rowStride()];
   }
 
-  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr inline ScalarWithConstIfNotLvalue& coeffRef(Index index) {
     EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
     return this->m_data[index * innerStride()];
   }
@@ -258,9 +258,9 @@ class MapBase<Derived, WriteAccessors> : public MapBase<Derived, ReadOnlyAccesso
     internal::pstoret<Scalar, PacketScalar, StoreMode>(this->m_data + index * innerStride(), val);
   }
 
-  EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
-  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+  EIGEN_DEVICE_FUNC constexpr inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+  EIGEN_DEVICE_FUNC constexpr inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
 
   EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) {
     ReadOnlyMapBase::Base::operator=(other);
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 481e057d03e..83e22147dc2 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H
 
-// TODO this should better be moved to NumTraits
+// TODO: consider moving these constants to NumTraits.
 // Source: WolframAlpha
 #define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
 #define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L
@@ -74,7 +74,7 @@ struct global_math_functions_filtering_base<
 template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct real_default_impl {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { return x; }
+  EIGEN_DEVICE_FUNC static constexpr RealScalar run(const Scalar& x) { return x; }
 };
 
 template <typename Scalar>
@@ -170,18 +170,24 @@ struct imag_ref_default_impl {
 
 template <typename Scalar>
 struct imag_ref_default_impl<Scalar, false> {
-  EIGEN_DEVICE_FUNC constexpr static Scalar run(Scalar&) { return Scalar(0); }
-  EIGEN_DEVICE_FUNC constexpr static const Scalar run(const Scalar&) { return Scalar(0); }
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC constexpr static inline RealScalar run(Scalar&) { return RealScalar(0); }
+  EIGEN_DEVICE_FUNC constexpr static inline RealScalar run(const Scalar&) { return RealScalar(0); }
 };
 
 template <typename Scalar>
 struct imag_ref_impl : imag_ref_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
 
-template <typename Scalar>
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct imag_ref_retval {
   typedef typename NumTraits<Scalar>::Real& type;
 };
 
+template <typename Scalar>
+struct imag_ref_retval<Scalar, false> {
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
 }  // namespace internal
 
 namespace numext {
@@ -222,7 +228,7 @@ namespace internal {
 
 template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct conj_default_impl {
-  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { return x; }
+  EIGEN_DEVICE_FUNC static constexpr Scalar run(const Scalar& x) { return x; }
 };
 
 template <typename Scalar>
@@ -287,7 +293,7 @@ struct sqrt_impl {
 
 // Complex sqrt defined in MathFunctionsImpl.h.
 template <typename ComplexT>
-EIGEN_DEVICE_FUNC ComplexT complex_sqrt(const ComplexT& a_x);
+EIGEN_DEVICE_FUNC constexpr ComplexT complex_sqrt(const ComplexT& a_x);
 
 // Custom implementation is faster than `std::sqrt`, works on
 // GPU, and correctly handles special cases (unlike MSVC).
@@ -307,7 +313,7 @@ struct rsqrt_impl;
 
 // Complex rsqrt defined in MathFunctionsImpl.h.
 template <typename ComplexT>
-EIGEN_DEVICE_FUNC ComplexT complex_rsqrt(const ComplexT& a_x);
+EIGEN_DEVICE_FUNC constexpr ComplexT complex_rsqrt(const ComplexT& a_x);
 
 template <typename T>
 struct rsqrt_impl<std::complex<T>> {
@@ -390,7 +396,7 @@ struct cast_impl<OldType, NewType,
   }
 };
 
-// here, for once, we're plainly returning NewType: we don't want cast to do weird things.
+// Returns NewType directly to avoid unintended intermediate conversions.
 
 template <typename OldType, typename NewType>
 EIGEN_DEVICE_FUNC inline NewType cast(const OldType& x) {
@@ -504,7 +510,7 @@ struct expm1_retval {
 
 // Complex log defined in MathFunctionsImpl.h.
 template <typename ComplexT>
-EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z);
+EIGEN_DEVICE_FUNC constexpr ComplexT complex_log(const ComplexT& z);
 
 template <typename Scalar>
 struct log_impl {
@@ -832,8 +838,8 @@ EIGEN_DEVICE_FUNC std::enable_if_t<(std::numeric_limits<T>::has_infinity && !Num
 
 template <typename T>
 EIGEN_DEVICE_FUNC
-std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
-isnan_impl(const T&) {
+    std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
+    isnan_impl(const T&) {
   return false;
 }
 
@@ -895,6 +901,37 @@ struct sign_retval {
   typedef Scalar type;
 };
 
+template <typename Scalar, bool IsComplex = (NumTraits<Scalar>::IsComplex != 0),
+          bool IsInteger = (NumTraits<Scalar>::IsInteger != 0)>
+struct copysign_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& a, const Scalar& b) {
+    EIGEN_USING_STD(copysign);
+    return Scalar(copysign(a, b));
+  }
+};
+
+template <typename Scalar, bool IsInteger>
+struct copysign_impl<Scalar, true, IsInteger> {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& a, const Scalar& b) {
+    EIGEN_USING_STD(copysign);
+    return Scalar(copysign(numext::real(a), numext::real(b)), copysign(numext::imag(a), numext::imag(b)));
+  }
+};
+
+template <typename Scalar>
+struct copysign_impl<Scalar, false, true> {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& a, const Scalar& b) {
+    EIGEN_IF_CONSTEXPR(!NumTraits<Scalar>::IsSigned) return a;
+    const Scalar abs_a = a < Scalar(0) ? -a : a;
+    return b < Scalar(0) ? -abs_a : abs_a;
+  }
+};
+
+template <typename Scalar>
+struct copysign_retval {
+  typedef Scalar type;
+};
+
 // suppress "unary minus operator applied to unsigned type, result still unsigned" warnings on MSVC
 // note: `0 - a` is distinct from `-a` when Scalar is a floating point type and `a` is zero
 
@@ -941,23 +978,43 @@ struct nearest_integer_impl<Scalar, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) { return x; }
 };
 
+// Extra namespace to prevent leaking std::fma into Eigen::internal.
+namespace has_fma_detail {
+
+template <typename T, typename EnableIf = void>
+struct has_fma_impl : public std::false_type {};
+
+using std::fma;
+
+template <typename T>
+struct has_fma_impl<
+    T, std::enable_if_t<std::is_same<T, decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()))>::value>>
+    : public std::true_type {};
+
+}  // namespace has_fma_detail
+
+template <typename T>
+struct has_fma : public has_fma_detail::has_fma_impl<T> {};
+
 // Default implementation.
-template <typename Scalar, typename Enable = void>
+template <typename T, typename Enable = void>
 struct fma_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& a, const Scalar& b, const Scalar& c) {
-    return a * b + c;
-  }
+  static_assert(has_fma<T>::value, "No function fma(...) for type.  Please provide an implementation.");
 };
 
-// ADL version if it exists.
+// STD or ADL version if it exists.
 template <typename T>
-struct fma_impl<
-    T,
-    std::enable_if_t<std::is_same<T, decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()))>::value>> {
-  static T run(const T& a, const T& b, const T& c) { return fma(a, b, c); }
+struct fma_impl<T, std::enable_if_t<has_fma<T>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T run(const T& a, const T& b, const T& c) {
+    using std::fma;
+    return fma(a, b, c);
+  }
 };
 
 #if defined(EIGEN_GPUCC)
+template <>
+struct has_fma<float> : public true_type {};
+
 template <>
 struct fma_impl<float, void> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float run(const float& a, const float& b, const float& c) {
@@ -965,6 +1022,9 @@ struct fma_impl<float, void> {
   }
 };
 
+template <>
+struct has_fma<double> : public true_type {};
+
 template <>
 struct fma_impl<double, void> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double run(const double& a, const double& b, const double& c) {
@@ -973,6 +1033,23 @@ struct fma_impl<double, void> {
 };
 #endif
 
+// Basic multiply-add.
+template <typename Scalar, typename EnableIf = void>
+struct madd_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& x, const Scalar& y, const Scalar& z) {
+    return x * y + z;
+  }
+};
+
+#if EIGEN_SCALAR_MADD_USE_FMA
+template <typename Scalar>
+struct madd_impl<Scalar, std::enable_if_t<has_fma<Scalar>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& x, const Scalar& y, const Scalar& z) {
+    return fma_impl<Scalar>::run(x, y, z);
+  }
+};
+#endif
+
 }  // end namespace internal
 
 /****************************************************************************
@@ -983,13 +1060,13 @@ namespace numext {
 
 #if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
   EIGEN_USING_STD(min)
   return min EIGEN_NOT_A_MACRO(x, y);
 }
 
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
   EIGEN_USING_STD(max)
   return max EIGEN_NOT_A_MACRO(x, y);
 }
@@ -1134,6 +1211,11 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(sign, Scalar) sign(const Scalar&
   return EIGEN_MATHFUNC_IMPL(sign, Scalar)::run(x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(copysign, Scalar) copysign(const Scalar& x, const Scalar& y) {
+  return EIGEN_MATHFUNC_IMPL(copysign, Scalar)::run(x, y);
+}
+
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(negate, Scalar) negate(const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(negate, Scalar)::run(x);
@@ -1294,7 +1376,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double trunc(const double& x) {
 // T is assumed to be an integer type with a>=0, and b>0
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T div_ceil(T a, T b) {
-  using UnsignedT = typename internal::make_unsigned<T>::type;
+  using UnsignedT = std::make_unsigned_t<T>;
   EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
   // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
   const UnsignedT ua = UnsignedT(a);
@@ -1307,8 +1389,8 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T div_ceil(T a, T b) {
 // T is assumed to be an integer type with a>=0, and b>0
 template <typename T, typename U>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T round_down(T a, U b) {
-  using UnsignedT = typename internal::make_unsigned<T>::type;
-  using UnsignedU = typename internal::make_unsigned<U>::type;
+  using UnsignedT = std::make_unsigned_t<T>;
+  using UnsignedU = std::make_unsigned_t<U>;
   EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
   EIGEN_STATIC_ASSERT((NumTraits<U>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
   // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
@@ -1317,6 +1399,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T round_down(T a, U b) {
   return ub * (ua / ub);
 }
 
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log2(T x) {
+  EIGEN_USING_STD(log2);
+  return log2(x);
+}
+
 /** Log base 2 for 32 bits positive integers.
  * Conveniently returns 0 for x==0. */
 constexpr int log2(int x) {
@@ -1396,17 +1484,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double& x) {
 #endif
 
 template <typename T>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
-abs(const T& x) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
+    abs(const T& x) {
   EIGEN_USING_STD(abs);
   return abs(x);
 }
 
 template <typename T>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
-abs(const T& x) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
+    abs(const T& x) {
   return x;
 }
 
@@ -1871,7 +1959,8 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double fmod(const double& a, const double&
 
 template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar logical_shift_left(const Scalar& a, int n) {
-  return a << n;
+  using UnsignedScalar = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
+  return bit_cast<Scalar, UnsignedScalar>(bit_cast<UnsignedScalar, Scalar>(a) << n);
 }
 
 template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
@@ -1886,15 +1975,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar arithmetic_shift_right(const Scalar
   return bit_cast<Scalar, SignedScalar>(bit_cast<SignedScalar, Scalar>(a) >> n);
 }
 
-// Use std::fma if available.
-using std::fma;
-
-// Otherwise, rely on template implementation.
 template <typename Scalar>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar fma(const Scalar& x, const Scalar& y, const Scalar& z) {
   return internal::fma_impl<Scalar>::run(x, y, z);
 }
 
+// Multiply-add.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar madd(const Scalar& x, const Scalar& y, const Scalar& z) {
+  return internal::madd_impl<Scalar>::run(x, y, z);
+}
+
 }  // end namespace numext
 
 namespace internal {
@@ -2042,7 +2133,15 @@ struct expm1_impl<std::complex<RealScalar>> {
 
 template <typename T>
 struct rsqrt_impl {
+// C4804: unsafe use of type 'bool' in operation. Unavoidable when instantiated with T=bool.
+#if EIGEN_COMP_MSVC
+#pragma warning(push)
+#pragma warning(disable : 4804)
+#endif
   EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE T run(const T& x) { return T(1) / numext::sqrt(x); }
+#if EIGEN_COMP_MSVC
+#pragma warning(pop)
+#endif
 };
 
 #if defined(EIGEN_GPU_COMPILE_PHASE)
@@ -2054,6 +2153,57 @@ struct conj_impl<std::complex<T>, true> {
 };
 #endif
 
+// Complex multiply and division operators.
+// Note that these do not handle the case of inf+NaNi, which is considered an infinity.
+// This is for consistency with our standard pmul, pdiv implementations.
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_multiply(const std::complex<T>& a,
+                                                                       const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  return std::complex<T>(a_real * b_real - a_imag * b_imag, a_imag * b_real + a_real * b_imag);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_fast(const std::complex<T>& a,
+                                                                          const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  const T norm = (b_real * b_real + b_imag * b_imag);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm, (a_imag * b_real - a_real * b_imag) / norm);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_smith(const std::complex<T>& a,
+                                                                           const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
+  // guards against over/under-flow.
+  const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real);
+  const T rscale = scale_imag ? T(1) : b_real / b_imag;
+  const T iscale = scale_imag ? b_imag / b_real : T(1);
+  const T denominator = b_real * rscale + b_imag * iscale;
+  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator,
+                         (a_imag * rscale - a_real * iscale) / denominator);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::complex<T>& a,
+                                                                     const std::complex<T>& b) {
+#if EIGEN_FAST_MATH
+  return complex_divide_fast(a, b);
+#else
+  return complex_divide_smith(a, b);
+#endif
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index cf8dcc3b893..243318762e4 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -28,7 +28,7 @@ namespace internal {
    2. If a is zero, approx_a_recip must be infinite with the same sign as a.
    3. If a is infinite, approx_a_recip must be zero with the same sign as a.
 
-   If the preconditions are satisfied, which they are for for the _*_rcp_ps
+   If the preconditions are satisfied, which they are for the _*_rcp_ps
    instructions on x86, the result has a maximum relative error of 2 ulps,
    and correctly handles reciprocals of zero, infinity, and NaN.
 */
@@ -37,15 +37,16 @@ struct generic_reciprocal_newton_step {
   static_assert(Steps > 0, "Steps must be at least 1.");
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_a_recip) {
     using Scalar = typename unpacket_traits<Packet>::type;
-    const Packet two = pset1<Packet>(Scalar(2));
+    const Packet one = pset1<Packet>(Scalar(1));
     // Refine the approximation using one Newton-Raphson step:
     //   x_{i} = x_{i-1} * (2 - a * x_{i-1})
     const Packet x = generic_reciprocal_newton_step<Packet, Steps - 1>::run(a, approx_a_recip);
-    const Packet tmp = pnmadd(a, x, two);
+    const Packet tmp = pnmadd(a, x, one);
     // If tmp is NaN, it means that a is either +/-0 or +/-Inf.
     // In this case return the approximation directly.
     const Packet is_not_nan = pcmp_eq(tmp, tmp);
-    return pselect(is_not_nan, pmul(x, tmp), x);
+    // Use two FMAs instead of FMA+FMUL to improve precision.
+    return pselect(is_not_nan, pmadd(x, tmp, x), x);
   }
 };
 
@@ -66,7 +67,7 @@ struct generic_reciprocal_newton_step<Packet, 0> {
    2. If a is zero, approx_a_recip must be infinite with the same sign as a.
    3. If a is infinite, approx_a_recip must be zero with the same sign as a.
 
-   If the preconditions are satisfied, which they are for for the _*_rcp_ps
+   If the preconditions are satisfied, which they are for the _*_rcp_ps
    instructions on x86, the result has a maximum relative error of 2 ulps,
    and correctly handles zero, infinity, and NaN. Positive denormals are
    treated as zero.
@@ -116,7 +117,7 @@ struct generic_rsqrt_newton_step<Packet, 0> {
    2. If a is zero, approx_rsqrt must be infinite.
    3. If a is infinite, approx_rsqrt must be zero.
 
-   If the preconditions are satisfied, which they are for for the _*_rsqrt_ps
+   If the preconditions are satisfied, which they are for the _*_rsqrt_ps
    instructions on x86, the result has a maximum relative error of 2 ulps,
    and correctly handles zero and infinity, and NaN. Positive denormal inputs
    are treated as zero.
@@ -147,16 +148,16 @@ struct generic_sqrt_newton_step {
 };
 
 template <typename RealScalar>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE RealScalar positive_real_hypot(const RealScalar& x,
+                                                                               const RealScalar& y) {
   // IEEE IEC 6059 special cases.
   if ((numext::isinf)(x) || (numext::isinf)(y)) return NumTraits<RealScalar>::infinity();
   if ((numext::isnan)(x) || (numext::isnan)(y)) return NumTraits<RealScalar>::quiet_NaN();
 
   EIGEN_USING_STD(sqrt);
-  RealScalar p, qp;
-  p = numext::maxi(x, y);
+  RealScalar p = numext::maxi(x, y);
   if (numext::is_exactly_zero(p)) return RealScalar(0);
-  qp = numext::mini(y, x) / p;
+  RealScalar qp = numext::mini(y, x) / p;
   return p * sqrt(RealScalar(1) + qp * qp);
 }
 
@@ -172,7 +173,7 @@ struct hypot_impl {
 // Generic complex sqrt implementation that correctly handles corner cases
 // according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
 template <typename ComplexT>
-EIGEN_DEVICE_FUNC ComplexT complex_sqrt(const ComplexT& z) {
+EIGEN_DEVICE_FUNC constexpr ComplexT complex_sqrt(const ComplexT& z) {
   // Computes the principal sqrt of the input.
   //
   // For a complex square root of the number x + i*y. We want to find real
@@ -208,7 +209,7 @@ EIGEN_DEVICE_FUNC ComplexT complex_sqrt(const ComplexT& z) {
 
 // Generic complex rsqrt implementation.
 template <typename ComplexT>
-EIGEN_DEVICE_FUNC ComplexT complex_rsqrt(const ComplexT& z) {
+EIGEN_DEVICE_FUNC constexpr ComplexT complex_rsqrt(const ComplexT& z) {
   // Computes the principal reciprocal sqrt of the input.
   //
   // For a complex reciprocal square root of the number z = x + i*y. We want to
@@ -247,7 +248,7 @@ EIGEN_DEVICE_FUNC ComplexT complex_rsqrt(const ComplexT& z) {
 }
 
 template <typename ComplexT>
-EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z) {
+EIGEN_DEVICE_FUNC constexpr ComplexT complex_log(const ComplexT& z) {
   // Computes complex log.
   using T = typename NumTraits<ComplexT>::Real;
   T a = numext::abs(z);
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index a2c8eba574a..314c01dd9fa 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -207,7 +207,7 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
    *
    * \callgraph
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(const Matrix& other) { return Base::_set(other); }
+  EIGEN_DEVICE_FUNC constexpr Matrix& operator=(const Matrix& other) { return Base::_set(other); }
 
   /** \internal
    * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
@@ -249,16 +249,16 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
    * \sa resize(Index,Index)
    */
 #if defined(EIGEN_INITIALIZE_COEFFS)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+  EIGEN_DEVICE_FUNC constexpr Matrix() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 #else
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix() = default;
+  EIGEN_DEVICE_FUNC constexpr Matrix() = default;
 #endif
   /** \brief Move constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix(Matrix&&) = default;
+  EIGEN_DEVICE_FUNC constexpr Matrix(Matrix&&) = default;
   /** \brief Moves the matrix into the other one.
    *
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other) noexcept(
+  EIGEN_DEVICE_FUNC constexpr Matrix& operator=(Matrix&& other) noexcept(
       std::is_nothrow_move_assignable<Scalar>::value) {
     Base::operator=(std::move(other));
     return *this;
@@ -271,7 +271,7 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
    * This constructor is for 1D array or vectors with more than 4 coefficients.
    *
    * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
-   * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+   * constructor must match the fixed number of rows (resp. columns) of \c *this.
    *
    *
    * Example: \include Matrix_variadic_ctor_cxx11.cpp
@@ -285,7 +285,6 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
       : Base(a0, a1, a2, a3, args...) {}
 
   /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row.
-   * \cpp11
    * \anchor matrix_initializer_list
    *
    * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
@@ -316,12 +315,12 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
 
   // This constructor is for both 1x1 matrices and dynamic vectors
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(const T& x) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit Matrix(const T& x) {
     Base::template _init1<T>(x);
   }
 
   template <typename T0, typename T1>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) {
     Base::template _init2<T0, T1>(x, y);
   }
 
@@ -367,7 +366,7 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
   /** \brief Constructs an initialized 3D vector with given coefficients
    * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 3)
     m_storage.data()[0] = x;
     m_storage.data()[1] = y;
@@ -376,7 +375,8 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
   /** \brief Constructs an initialized 4D vector with given coefficients
    * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z,
+                                                         const Scalar& w) {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 4)
     m_storage.data()[0] = x;
     m_storage.data()[1] = y;
@@ -385,13 +385,14 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
   }
 
   /** \brief Copy constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix(const Matrix&) = default;
+  EIGEN_DEVICE_FUNC constexpr Matrix(const Matrix&) = default;
 
   /** \brief Copy constructor for generic expressions.
    * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
    */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived>& other) : Base(other.derived()) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived>& other)
+      : Base(other.derived()) {}
 
   EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
   EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
@@ -433,14 +434,14 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
  * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
  * a fixed-size vector of 4 complex floats.
  *
- * With \cpp11, template alias are also defined for common sizes.
+ * Template aliases are also defined for common sizes.
  * They follow the same pattern as above except that the scalar type suffix is replaced by a
  * template parameter, i.e.:
  *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
  *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
  *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
  *
- * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and
+ * You can also use fully generic column and row vector types: `Vector<Type,Size>` and
  * `RowVector<Type,Size>`.
  *
  * \sa class Matrix
@@ -484,28 +485,28 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 #undef EIGEN_MAKE_TYPEDEFS
 #undef EIGEN_MAKE_FIXED_TYPEDEFS
 
-#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)                    \
-  /** \ingroup matrixtypedefs */                                 \
-  /** \brief \cpp11 `Size`&times;`Size` matrix of type `Type`.*/ \
-  template <typename Type>                                       \
-  using Matrix##SizeSuffix = Matrix<Type, Size, Size>;           \
-  /** \ingroup matrixtypedefs */                                 \
-  /** \brief \cpp11 `Size`&times;`1` vector of type `Type`.*/    \
-  template <typename Type>                                       \
-  using Vector##SizeSuffix = Matrix<Type, Size, 1>;              \
-  /** \ingroup matrixtypedefs */                                 \
-  /** \brief \cpp11 `1`&times;`Size` vector of type `Type`.*/    \
-  template <typename Type>                                       \
+#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)             \
+  /** \ingroup matrixtypedefs */                          \
+  /** \brief `Size`&times;`Size` matrix of type `Type`.*/ \
+  template <typename Type>                                \
+  using Matrix##SizeSuffix = Matrix<Type, Size, Size>;    \
+  /** \ingroup matrixtypedefs */                          \
+  /** \brief `Size`&times;`1` vector of type `Type`.*/    \
+  template <typename Type>                                \
+  using Vector##SizeSuffix = Matrix<Type, Size, 1>;       \
+  /** \ingroup matrixtypedefs */                          \
+  /** \brief `1`&times;`Size` vector of type `Type`.*/    \
+  template <typename Type>                                \
   using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
 
-#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                              \
-  /** \ingroup matrixtypedefs */                                     \
-  /** \brief \cpp11 `Size`&times;`Dynamic` matrix of type `Type` */  \
-  template <typename Type>                                           \
-  using Matrix##Size##X = Matrix<Type, Size, Dynamic>;               \
-  /** \ingroup matrixtypedefs */                                     \
-  /** \brief \cpp11 `Dynamic`&times;`Size` matrix of type `Type`. */ \
-  template <typename Type>                                           \
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                       \
+  /** \ingroup matrixtypedefs */                              \
+  /** \brief `Size`&times;`Dynamic` matrix of type `Type` */  \
+  template <typename Type>                                    \
+  using Matrix##Size##X = Matrix<Type, Size, Dynamic>;        \
+  /** \ingroup matrixtypedefs */                              \
+  /** \brief `Dynamic`&times;`Size` matrix of type `Type`. */ \
+  template <typename Type>                                    \
   using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
 
 EIGEN_MAKE_TYPEDEFS(2, 2)
@@ -517,12 +518,12 @@ EIGEN_MAKE_FIXED_TYPEDEFS(3)
 EIGEN_MAKE_FIXED_TYPEDEFS(4)
 
 /** \ingroup matrixtypedefs
- * \brief \cpp11 `Size`&times;`1` vector of type `Type`. */
+ * \brief `Size`&times;`1` vector of type `Type`. */
 template <typename Type, int Size>
 using Vector = Matrix<Type, Size, 1>;
 
 /** \ingroup matrixtypedefs
- * \brief \cpp11 `1`&times;`Size` vector of type `Type`. */
+ * \brief `1`&times;`Size` vector of type `Type`. */
 template <typename Type, int Size>
 using RowVector = Matrix<Type, 1, Size>;
 
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 8d5c47e472f..ff1a2c23264 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -99,7 +99,7 @@ class MatrixBase : public DenseBase<Derived> {
 
   /** \returns the size of the main diagonal, which is min(rows(),cols()).
    * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC inline Index diagonalSize() const { return (numext::mini)(rows(), cols()); }
+  EIGEN_DEVICE_FUNC constexpr Index diagonalSize() const { return (numext::mini)(rows(), cols()); }
 
   typedef typename Base::PlainObject PlainObject;
 
@@ -136,19 +136,19 @@ class MatrixBase : public DenseBase<Derived> {
   /** Special case of the template operator=, in order to prevent the compiler
    * from generating a default operator= (issue hit with g++ 4.1)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const MatrixBase& other);
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator=(const MatrixBase& other);
 
   // We cannot inherit here via Base::operator= since it is causing
   // trouble with MSVC.
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other);
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC Derived& operator=(const EigenBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC constexpr Derived& operator=(const EigenBase<OtherDerived>& other);
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC constexpr Derived& operator=(const ReturnByValue<OtherDerived>& other);
 
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const MatrixBase<OtherDerived>& other);
@@ -180,11 +180,11 @@ class MatrixBase : public DenseBase<Derived> {
       const SkewSymmetricBase<SkewDerived>& skew) const;
 
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
-                                                  typename internal::traits<OtherDerived>::Scalar>::ReturnType
+  EIGEN_DEVICE_FUNC constexpr typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
+                                                            typename internal::traits<OtherDerived>::Scalar>::ReturnType
   dot(const MatrixBase<OtherDerived>& other) const;
 
-  EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
+  EIGEN_DEVICE_FUNC constexpr RealScalar squaredNorm() const;
   EIGEN_DEVICE_FUNC RealScalar norm() const;
   RealScalar stableNorm() const;
   RealScalar blueNorm() const;
@@ -194,23 +194,23 @@ class MatrixBase : public DenseBase<Derived> {
   EIGEN_DEVICE_FUNC void normalize();
   EIGEN_DEVICE_FUNC void stableNormalize();
 
-  EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
+  EIGEN_DEVICE_FUNC constexpr const AdjointReturnType adjoint() const;
   EIGEN_DEVICE_FUNC void adjointInPlace();
 
   typedef Diagonal<Derived> DiagonalReturnType;
-  EIGEN_DEVICE_FUNC DiagonalReturnType diagonal();
+  EIGEN_DEVICE_FUNC constexpr DiagonalReturnType diagonal();
 
   typedef Diagonal<const Derived> ConstDiagonalReturnType;
-  EIGEN_DEVICE_FUNC const ConstDiagonalReturnType diagonal() const;
+  EIGEN_DEVICE_FUNC constexpr const ConstDiagonalReturnType diagonal() const;
 
   template <int Index>
-  EIGEN_DEVICE_FUNC Diagonal<Derived, Index> diagonal();
+  EIGEN_DEVICE_FUNC constexpr Diagonal<Derived, Index> diagonal();
 
   template <int Index>
-  EIGEN_DEVICE_FUNC const Diagonal<const Derived, Index> diagonal() const;
+  EIGEN_DEVICE_FUNC constexpr const Diagonal<const Derived, Index> diagonal() const;
 
-  EIGEN_DEVICE_FUNC Diagonal<Derived, DynamicIndex> diagonal(Index index);
-  EIGEN_DEVICE_FUNC const Diagonal<const Derived, DynamicIndex> diagonal(Index index) const;
+  EIGEN_DEVICE_FUNC constexpr Diagonal<Derived, DynamicIndex> diagonal(Index index);
+  EIGEN_DEVICE_FUNC constexpr const Diagonal<const Derived, DynamicIndex> diagonal(Index index) const;
 
   template <unsigned int Mode>
   struct TriangularViewReturnType {
@@ -222,9 +222,9 @@ class MatrixBase : public DenseBase<Derived> {
   };
 
   template <unsigned int Mode>
-  EIGEN_DEVICE_FUNC typename TriangularViewReturnType<Mode>::Type triangularView();
+  EIGEN_DEVICE_FUNC constexpr typename TriangularViewReturnType<Mode>::Type triangularView();
   template <unsigned int Mode>
-  EIGEN_DEVICE_FUNC typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+  EIGEN_DEVICE_FUNC constexpr typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
 
   template <unsigned int UpLo>
   struct SelfAdjointViewReturnType {
@@ -236,9 +236,9 @@ class MatrixBase : public DenseBase<Derived> {
   };
 
   template <unsigned int UpLo>
-  EIGEN_DEVICE_FUNC typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+  EIGEN_DEVICE_FUNC constexpr typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
   template <unsigned int UpLo>
-  EIGEN_DEVICE_FUNC typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+  EIGEN_DEVICE_FUNC constexpr typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
 
   const SparseView<Derived> sparseView(
       const Scalar& m_reference = Scalar(0),
@@ -252,9 +252,9 @@ class MatrixBase : public DenseBase<Derived> {
   EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
   EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();
 
-  EIGEN_DEVICE_FUNC const DiagonalWrapper<const Derived> asDiagonal() const;
+  EIGEN_DEVICE_FUNC constexpr const DiagonalWrapper<const Derived> asDiagonal() const;
   const PermutationWrapper<const Derived> asPermutation() const;
-  EIGEN_DEVICE_FUNC const SkewSymmetricWrapper<const Derived> asSkewSymmetric() const;
+  EIGEN_DEVICE_FUNC constexpr const SkewSymmetricWrapper<const Derived> asSkewSymmetric() const;
 
   EIGEN_DEVICE_FUNC Derived& setIdentity();
   EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols);
@@ -274,6 +274,17 @@ class MatrixBase : public DenseBase<Derived> {
                     const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
   bool isUnitary(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
 
+  /* diagonalView */
+  template <int DiagIndex_ = 0>
+  EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<Derived, DiagIndex_>> diagonalView();
+
+  template <int DiagIndex_ = 0>
+  EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<const Derived, DiagIndex_>> diagonalView() const;
+
+  EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<Derived, DynamicIndex>> diagonalView(Index index);
+
+  EIGEN_DEVICE_FUNC constexpr DiagonalWrapper<Diagonal<const Derived, DynamicIndex>> diagonalView(Index index) const;
+
   /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
    * \warning When using floating point scalar values you probably should rather use a
    *          fuzzy comparison such as isApprox()
@@ -296,14 +307,14 @@ class MatrixBase : public DenseBase<Derived> {
 
   // TODO forceAlignedAccess is temporarily disabled
   // Need to find a nicer workaround.
-  inline const Derived& forceAlignedAccess() const { return derived(); }
-  inline Derived& forceAlignedAccess() { return derived(); }
+  constexpr const Derived& forceAlignedAccess() const { return derived(); }
+  constexpr Derived& forceAlignedAccess() { return derived(); }
   template <bool Enable>
-  inline const Derived& forceAlignedAccessIf() const {
+  constexpr const Derived& forceAlignedAccessIf() const {
     return derived();
   }
   template <bool Enable>
-  inline Derived& forceAlignedAccessIf() {
+  constexpr Derived& forceAlignedAccessIf() {
     return derived();
   }
 
@@ -312,29 +323,31 @@ class MatrixBase : public DenseBase<Derived> {
   template <int p>
   EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
 
-  EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
-  EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }
+  EIGEN_DEVICE_FUNC constexpr MatrixBase<Derived>& matrix() { return *this; }
+  EIGEN_DEVICE_FUNC constexpr const MatrixBase<Derived>& matrix() const { return *this; }
 
   /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
    * \sa ArrayBase::matrix() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() {
+    return ArrayWrapper<Derived>(derived());
+  }
   /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
    * \sa ArrayBase::matrix() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const {
     return ArrayWrapper<const Derived>(derived());
   }
 
   /////////// LU module ///////////
 
   template <typename PermutationIndex = DefaultPermutationIndex>
-  inline const FullPivLU<PlainObject, PermutationIndex> fullPivLu() const;
+  inline FullPivLU<PlainObject, PermutationIndex> fullPivLu() const;
   template <typename PermutationIndex = DefaultPermutationIndex>
-  inline const PartialPivLU<PlainObject, PermutationIndex> partialPivLu() const;
+  inline PartialPivLU<PlainObject, PermutationIndex> partialPivLu() const;
 
   template <typename PermutationIndex = DefaultPermutationIndex>
-  inline const PartialPivLU<PlainObject, PermutationIndex> lu() const;
+  inline PartialPivLU<PlainObject, PermutationIndex> lu() const;
 
-  EIGEN_DEVICE_FUNC inline const Inverse<Derived> inverse() const;
+  EIGEN_DEVICE_FUNC inline Inverse<Derived> inverse() const;
 
   template <typename ResultType>
   inline void computeInverseAndDetWithCheck(
@@ -350,18 +363,18 @@ class MatrixBase : public DenseBase<Derived> {
 
   /////////// Cholesky module ///////////
 
-  inline const LLT<PlainObject> llt() const;
-  inline const LDLT<PlainObject> ldlt() const;
+  inline LLT<PlainObject> llt() const;
+  inline LDLT<PlainObject> ldlt() const;
 
   /////////// QR module ///////////
 
-  inline const HouseholderQR<PlainObject> householderQr() const;
+  inline HouseholderQR<PlainObject> householderQr() const;
   template <typename PermutationIndex = DefaultPermutationIndex>
-  inline const ColPivHouseholderQR<PlainObject, PermutationIndex> colPivHouseholderQr() const;
+  inline ColPivHouseholderQR<PlainObject, PermutationIndex> colPivHouseholderQr() const;
   template <typename PermutationIndex = DefaultPermutationIndex>
-  inline const FullPivHouseholderQR<PlainObject, PermutationIndex> fullPivHouseholderQr() const;
+  inline FullPivHouseholderQR<PlainObject, PermutationIndex> fullPivHouseholderQr() const;
   template <typename PermutationIndex = DefaultPermutationIndex>
-  inline const CompleteOrthogonalDecomposition<PlainObject, PermutationIndex> completeOrthogonalDecomposition() const;
+  inline CompleteOrthogonalDecomposition<PlainObject, PermutationIndex> completeOrthogonalDecomposition() const;
 
   /////////// Eigenvalues module ///////////
 
@@ -373,12 +386,14 @@ class MatrixBase : public DenseBase<Derived> {
   template <int Options = 0>
   inline JacobiSVD<PlainObject, Options> jacobiSvd() const;
   template <int Options = 0>
-  EIGEN_DEPRECATED inline JacobiSVD<PlainObject, Options> jacobiSvd(unsigned int computationOptions) const;
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using method's template parameter.")
+  inline JacobiSVD<PlainObject, Options> jacobiSvd(unsigned int computationOptions) const;
 
   template <int Options = 0>
   inline BDCSVD<PlainObject, Options> bdcSvd() const;
   template <int Options = 0>
-  EIGEN_DEPRECATED inline BDCSVD<PlainObject, Options> bdcSvd(unsigned int computationOptions) const;
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using method's template parameter.")
+  inline BDCSVD<PlainObject, Options> bdcSvd(unsigned int computationOptions) const;
 
   /////////// Geometry module ///////////
 
@@ -391,11 +406,11 @@ class MatrixBase : public DenseBase<Derived> {
 
   EIGEN_DEVICE_FUNC inline PlainObject unitOrthogonal(void) const;
 
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> eulerAngles(Index a0, Index a1, Index a2) const;
+  EIGEN_DEPRECATED_WITH_REASON("Use .canonicalEulerAngles() instead.")
+  EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> eulerAngles(Index a0, Index a1, Index a2) const;
 
   EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> canonicalEulerAngles(Index a0, Index a1, Index a2) const;
 
-  // put this as separate enum value to work around possible GCC 4.3 bug (?)
   enum {
     HomogeneousReturnTypeDirection =
         ColsAtCompileTime == 1 && RowsAtCompileTime == 1
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 2ce83a8c564..f18559ba18c 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -43,24 +43,24 @@ class NestByValue : public internal::dense_xpr_base<NestByValue<ExpressionType>
 
   EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
 
-  EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
   EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
   EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
 
-  EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+  EIGEN_DEVICE_FUNC constexpr operator const ExpressionType&() const { return m_expression; }
 
-  EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+  EIGEN_DEVICE_FUNC constexpr const ExpressionType& nestedExpression() const { return m_expression; }
 
-  EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, const Scalar*>::type data() const {
+  EIGEN_DEVICE_FUNC constexpr std::enable_if_t<HasDirectAccess, const Scalar*> data() const {
     return m_expression.data();
   }
 
-  EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, Index>::type innerStride() const {
+  EIGEN_DEVICE_FUNC constexpr std::enable_if_t<HasDirectAccess, Index> innerStride() const {
     return m_expression.innerStride();
   }
 
-  EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, Index>::type outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr std::enable_if_t<HasDirectAccess, Index> outerStride() const {
     return m_expression.outerStride();
   }
 
@@ -71,7 +71,7 @@ class NestByValue : public internal::dense_xpr_base<NestByValue<ExpressionType>
 /** \returns an expression of the temporary version of *this.
  */
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline const NestByValue<Derived> DenseBase<Derived>::nestByValue() const {
+EIGEN_DEVICE_FUNC constexpr inline const NestByValue<Derived> DenseBase<Derived>::nestByValue() const {
   return NestByValue<Derived>(derived());
 }
 
@@ -82,7 +82,7 @@ template <typename ArgType>
 struct evaluator<NestByValue<ArgType> > : public evaluator<ArgType> {
   typedef evaluator<ArgType> Base;
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr) : Base(xpr.nestedExpression()) {}
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const NestByValue<ArgType>& xpr) : Base(xpr.nestedExpression()) {}
 };
 }  // namespace internal
 
diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index b6c7209104e..6a882014b40 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -35,7 +35,7 @@ class NoAlias {
  public:
   typedef typename ExpressionType::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
+  EIGEN_DEVICE_FUNC constexpr explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
 
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other) {
@@ -58,7 +58,7 @@ class NoAlias {
     return m_expression;
   }
 
-  EIGEN_DEVICE_FUNC ExpressionType& expression() const { return m_expression; }
+  EIGEN_DEVICE_FUNC constexpr ExpressionType& expression() const { return m_expression; }
 
  protected:
   ExpressionType& m_expression;
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 5e4e5c2ff60..2bb1eedb8fe 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -95,9 +95,22 @@ struct default_max_digits10_impl<T, false, true>  // Integer
 }  // end namespace internal
 
 namespace numext {
-/** \internal bit-wise cast without changing the underlying bit representation. */
 
-// TODO: Replace by std::bit_cast (available in C++20)
+/** \internal bit-wise cast without changing the underlying bit representation. */
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+template <typename Tgt, typename Src>
+EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) {
+  return std::bit_cast<Tgt>(src);
+}
+#elif EIGEN_HAS_BUILTIN(__builtin_bit_cast)
+template <typename Tgt, typename Src>
+EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) {
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value, THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED)
+  return __builtin_bit_cast(Tgt, src);
+}
+#else
 template <typename Tgt, typename Src>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
   // The behaviour of memcpy is not specified for non-trivially copyable types
@@ -113,6 +126,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
   memcpy(static_cast<void*>(&tgt), static_cast<const void*>(&staged), sizeof(Tgt));
   return tgt;
 }
+#endif
 }  // namespace numext
 
 // clang-format off
diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h
index 1f638f9ac5d..18aeca3887d 100644
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -42,12 +42,12 @@ namespace internal {
 /* logic deciding a strategy for unrolling of vectorized paths */
 template <typename Func, typename Evaluator>
 struct packetwise_redux_traits {
-  enum {
-    OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
-    Cost = OuterSize == Dynamic ? HugeCost
-                                : OuterSize * Evaluator::CoeffReadCost + (OuterSize - 1) * functor_traits<Func>::Cost,
-    Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
-  };
+  static constexpr int OuterSize =
+      int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime;
+  static constexpr int Cost = OuterSize == Dynamic
+                                  ? HugeCost
+                                  : OuterSize * Evaluator::CoeffReadCost + (OuterSize - 1) * functor_traits<Func>::Cost;
+  static constexpr int Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling;
 };
 
 /* Value to be returned when size==0 , by default let's return 0 */
@@ -70,8 +70,8 @@ struct packetwise_redux_impl;
 /* Perform the actual reduction with unrolling */
 template <typename Func, typename Evaluator>
 struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling> {
-  typedef redux_novec_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
-  typedef typename Evaluator::Scalar Scalar;
+  using Base = redux_novec_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime>;
+  using Scalar = typename Evaluator::Scalar;
 
   template <typename PacketType>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator& eval, const Func& func, Index /*size*/) {
@@ -96,8 +96,8 @@ struct redux_vec_unroller<Func, Evaluator, Start, 0> {
 /* Perform the actual reduction for dynamic sizes */
 template <typename Func, typename Evaluator>
 struct packetwise_redux_impl<Func, Evaluator, NoUnrolling> {
-  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+  using Scalar = typename Evaluator::Scalar;
+  using PacketScalar = typename redux_traits<Func, Evaluator>::PacketType;
 
   template <typename PacketType>
   EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size) {
@@ -122,8 +122,8 @@ struct packetwise_redux_impl<Func, Evaluator, NoUnrolling> {
 
 template <typename Func, typename Evaluator>
 struct packetwise_segment_redux_impl {
-  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+  using Scalar = typename Evaluator::Scalar;
+  using PacketScalar = typename redux_traits<Func, Evaluator>::PacketType;
 
   template <typename PacketType>
   EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index begin,
@@ -140,16 +140,16 @@ struct packetwise_segment_redux_impl {
 template <typename ArgType, typename MemberOp, int Direction>
 struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
     : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> > {
-  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
-  typedef typename internal::nested_eval<ArgType, 1>::type ArgTypeNested;
-  typedef add_const_on_value_type_t<ArgTypeNested> ConstArgTypeNested;
-  typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
-  typedef typename ArgType::Scalar InputScalar;
-  typedef typename XprType::Scalar Scalar;
+  using XprType = PartialReduxExpr<ArgType, MemberOp, Direction>;
+  using ArgTypeNested = typename internal::nested_eval<ArgType, 1>::type;
+  using ConstArgTypeNested = add_const_on_value_type_t<ArgTypeNested>;
+  using ArgTypeNestedCleaned = internal::remove_all_t<ArgTypeNested>;
+  using InputScalar = typename ArgType::Scalar;
+  using Scalar = typename XprType::Scalar;
   enum {
     TraversalSize = Direction == int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
   };
-  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
+  using CostOpType = typename MemberOp::template Cost<int(TraversalSize)>;
   enum {
     CoeffReadCost = TraversalSize == Dynamic ? HugeCost
                     : TraversalSize == 0
@@ -168,13 +168,13 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
     Alignment = 0  // FIXME this will need to be improved once PartialReduxExpr is vectorized
   };
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) {
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize == Dynamic ? HugeCost
                                                              : (TraversalSize == 0 ? 1 : int(CostOpType::value)));
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const {
     return coeff(Direction == Vertical ? j : i);
@@ -199,11 +199,10 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
     using BinaryOp = typename MemberOp::BinaryOp;
     using Impl = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>;
 
-    // FIXME
-    // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of
-    // panel get reversed and methods like packetByOuterInner do not make sense anymore in this context. So let's just
-    // by pass "vectorization" in this case:
-    if (PacketSize == 1) return internal::pset1<PacketType>(coeff(idx));
+    // Workaround for issue 1612 (closed): when PacketSize==1 (i.e. complex<double> with 128bits registers) the
+    // storage-order of panel gets reversed and methods like packetByOuterInner do not make sense in this context, so
+    // bypass "vectorization":
+    EIGEN_IF_CONSTEXPR(PacketSize == 1) return internal::pset1<PacketType>(coeff(idx));
 
     Index startRow = Direction == Vertical ? 0 : idx;
     Index startCol = Direction == Vertical ? idx : 0;
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index 4748b118a68..77133545165 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -109,6 +109,9 @@ class PermutationBase : public EigenBase<Derived> {
    */
   DenseMatrixType toDenseMatrix() const { return derived(); }
 
+  /** \returns the plain matrix representation of the permutation. */
+  DenseMatrixType eval() const { return toDenseMatrix(); }
+
   /** const version of indices(). */
   const IndicesType& indices() const { return derived().indices(); }
   /** \returns a reference to the stored array representing the permutation. */
@@ -468,17 +471,17 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<IndicesType
 /** \returns the matrix with the permutation applied to the columns.
  */
 template <typename MatrixDerived, typename PermutationDerived>
-EIGEN_DEVICE_FUNC const Product<MatrixDerived, PermutationDerived, AliasFreeProduct> operator*(
+EIGEN_DEVICE_FUNC const Product<MatrixDerived, PermutationDerived, DefaultProduct> operator*(
     const MatrixBase<MatrixDerived>& matrix, const PermutationBase<PermutationDerived>& permutation) {
-  return Product<MatrixDerived, PermutationDerived, AliasFreeProduct>(matrix.derived(), permutation.derived());
+  return Product<MatrixDerived, PermutationDerived, DefaultProduct>(matrix.derived(), permutation.derived());
 }
 
 /** \returns the matrix with the permutation applied to the rows.
  */
 template <typename PermutationDerived, typename MatrixDerived>
-EIGEN_DEVICE_FUNC const Product<PermutationDerived, MatrixDerived, AliasFreeProduct> operator*(
+EIGEN_DEVICE_FUNC const Product<PermutationDerived, MatrixDerived, DefaultProduct> operator*(
     const PermutationBase<PermutationDerived>& permutation, const MatrixBase<MatrixDerived>& matrix) {
-  return Product<PermutationDerived, MatrixDerived, AliasFreeProduct>(permutation.derived(), matrix.derived());
+  return Product<PermutationDerived, MatrixDerived, DefaultProduct>(permutation.derived(), matrix.derived());
 }
 
 template <typename PermutationType>
@@ -520,16 +523,16 @@ class InverseImpl<PermutationType, PermutationStorage> : public EigenBase<Invers
   /** \returns the matrix with the inverse permutation applied to the columns.
    */
   template <typename OtherDerived>
-  friend const Product<OtherDerived, InverseType, AliasFreeProduct> operator*(const MatrixBase<OtherDerived>& matrix,
-                                                                              const InverseType& trPerm) {
-    return Product<OtherDerived, InverseType, AliasFreeProduct>(matrix.derived(), trPerm.derived());
+  friend const Product<OtherDerived, InverseType, DefaultProduct> operator*(const MatrixBase<OtherDerived>& matrix,
+                                                                            const InverseType& trPerm) {
+    return Product<OtherDerived, InverseType, DefaultProduct>(matrix.derived(), trPerm.derived());
   }
 
   /** \returns the matrix with the inverse permutation applied to the rows.
    */
   template <typename OtherDerived>
-  const Product<InverseType, OtherDerived, AliasFreeProduct> operator*(const MatrixBase<OtherDerived>& matrix) const {
-    return Product<InverseType, OtherDerived, AliasFreeProduct>(derived(), matrix.derived());
+  const Product<InverseType, OtherDerived, DefaultProduct> operator*(const MatrixBase<OtherDerived>& matrix) const {
+    return Product<InverseType, OtherDerived, DefaultProduct>(derived(), matrix.derived());
   }
 };
 
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index a78305e2592..2f4c357c5f2 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -159,17 +159,17 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
                       INVALID_MATRIX_TEMPLATE_PARAMETERS)
   EIGEN_STATIC_ASSERT(((Options & (DontAlign | RowMajor)) == Options), INVALID_MATRIX_TEMPLATE_PARAMETERS)
 
-  EIGEN_DEVICE_FUNC Base& base() { return *static_cast<Base*>(this); }
-  EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast<const Base*>(this); }
+  EIGEN_DEVICE_FUNC constexpr Base& base() { return *static_cast<Base*>(this); }
+  EIGEN_DEVICE_FUNC constexpr const Base& base() const { return *static_cast<const Base*>(this); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_storage.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_storage.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_storage.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_storage.cols(); }
 
   /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
    *
    * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index rowId, Index colId) const {
+  EIGEN_DEVICE_FUNC constexpr const Scalar& coeff(Index rowId, Index colId) const {
     if (Flags & RowMajorBit)
       return m_storage.data()[colId + rowId * m_storage.cols()];
     else  // column-major
@@ -180,15 +180,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
    *
    * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index index) const {
-    return m_storage.data()[index];
-  }
+  EIGEN_DEVICE_FUNC constexpr const Scalar& coeff(Index index) const { return m_storage.data()[index]; }
 
   /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
    *
    * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index rowId, Index colId) {
+  EIGEN_DEVICE_FUNC constexpr Scalar& coeffRef(Index rowId, Index colId) {
     if (Flags & RowMajorBit)
       return m_storage.data()[colId + rowId * m_storage.cols()];
     else  // column-major
@@ -199,11 +197,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
    *
    * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { return m_storage.data()[index]; }
+  EIGEN_DEVICE_FUNC constexpr Scalar& coeffRef(Index index) { return m_storage.data()[index]; }
 
   /** This is the const version of coeffRef(Index,Index) which is thus synonym of coeff(Index,Index).
    * It is provided for convenience. */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index rowId, Index colId) const {
+  EIGEN_DEVICE_FUNC constexpr const Scalar& coeffRef(Index rowId, Index colId) const {
     if (Flags & RowMajorBit)
       return m_storage.data()[colId + rowId * m_storage.cols()];
     else  // column-major
@@ -212,9 +210,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
 
   /** This is the const version of coeffRef(Index) which is thus synonym of coeff(Index).
    * It is provided for convenience. */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index index) const {
-    return m_storage.data()[index];
-  }
+  EIGEN_DEVICE_FUNC constexpr const Scalar& coeffRef(Index index) const { return m_storage.data()[index]; }
 
   /** \internal */
   template <int LoadMode>
@@ -343,7 +339,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
    * remain row-vectors and vectors remain vectors.
    */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other) {
     const OtherDerived& other = _other.derived();
 #ifndef EIGEN_NO_DEBUG
     internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime>::run(
@@ -426,9 +422,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   /** This is a special case of the templated operator=. Its purpose is to
    * prevent a default operator= from hiding the templated operator=.
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Derived& operator=(const PlainObjectBase& other) {
-    return _set(other);
-  }
+  EIGEN_DEVICE_FUNC constexpr Derived& operator=(const PlainObjectBase& other) { return _set(other); }
 
   /** \sa MatrixBase::lazyAssign() */
   template <typename OtherDerived>
@@ -446,9 +440,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   // Prevent user from trying to instantiate PlainObjectBase objects
   // by making all its constructor protected. See bug 1074.
  protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase() = default;
+  EIGEN_DEVICE_FUNC constexpr PlainObjectBase() = default;
   /** \brief Move constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(PlainObjectBase&&) = default;
+  EIGEN_DEVICE_FUNC constexpr PlainObjectBase(PlainObjectBase&&) = default;
   /** \brief Move assignment operator */
   EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) noexcept {
     m_storage = std::move(other.m_storage);
@@ -456,7 +450,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   }
 
   /** Copy constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(const PlainObjectBase&) = default;
+  EIGEN_DEVICE_FUNC constexpr PlainObjectBase(const PlainObjectBase&) = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
       : m_storage(size, rows, cols) {}
 
@@ -467,7 +461,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
    * This constructor is for 1D array or vectors with more than 4 coefficients.
    *
    * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
-   * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+   * constructor must match the fixed number of rows (resp. columns) of \c *this.
    */
   template <typename... ArgTypes>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,
@@ -524,14 +518,14 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
 
   /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived>& other) : m_storage() {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived>& other) : m_storage() {
     resizeLike(other);
     _set_noalias(other);
   }
 
   /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived>& other) : m_storage() {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived>& other) : m_storage() {
     resizeLike(other);
     *this = other.derived();
   }
@@ -691,6 +685,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
     eigen_assert((this->size() == 0 || (IsVectorAtCompileTime ? (this->size() == other.size())
                                                               : (rows() == other.rows() && cols() == other.cols()))) &&
                  "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+    if (this->size() == 0) resizeLike(other);
     EIGEN_ONLY_USED_FOR_DEBUG(other);
 #else
     resizeLike(other);
@@ -714,7 +709,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   // aliasing is dealt once in internal::call_assignment
   // so at this stage we have to assume aliasing... and resising has to be done later.
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Derived& _set(const DenseBase<OtherDerived>& other) {
+  EIGEN_DEVICE_FUNC constexpr Derived& _set(const DenseBase<OtherDerived>& other) {
     internal::call_assignment(this->derived(), other.derived());
     return this->derived();
   }
@@ -725,7 +720,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
    * \sa operator=(const MatrixBase<OtherDerived>&), _set()
    */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Derived& _set_noalias(const DenseBase<OtherDerived>& other) {
+  EIGEN_DEVICE_FUNC constexpr Derived& _set_noalias(const DenseBase<OtherDerived>& other) {
     // I don't think we need this resize call since the lazyAssign will anyways resize
     // and lazyAssign will be called by the assign selector.
     //_resize_to_match(other);
@@ -737,23 +732,23 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   }
 
   template <typename T0, typename T1>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(Index rows, Index cols,
-                                                    std::enable_if_t<Base::SizeAtCompileTime != 2, T0>* = 0) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init2(Index rows, Index cols,
+                                                              std::enable_if_t<Base::SizeAtCompileTime != 2, T0>* = 0) {
     EIGEN_STATIC_ASSERT(internal::is_valid_index_type<T0>::value && internal::is_valid_index_type<T1>::value,
                         T0 AND T1 MUST BE INTEGER TYPES)
     resize(rows, cols);
   }
 
   template <typename T0, typename T1>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1,
-                                                    std::enable_if_t<Base::SizeAtCompileTime == 2, T0>* = 0) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1,
+                                                              std::enable_if_t<Base::SizeAtCompileTime == 2, T0>* = 0) {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
     m_storage.data()[0] = Scalar(val0);
     m_storage.data()[1] = Scalar(val1);
   }
 
   template <typename T0, typename T1>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init2(
       const Index& val0, const Index& val1,
       std::enable_if_t<(!internal::is_same<Index, Scalar>::value) && (internal::is_same<T0, Index>::value) &&
                            (internal::is_same<T1, Index>::value) && Base::SizeAtCompileTime == 2,
@@ -766,7 +761,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   // The argument is convertible to the Index type and we either have a non 1x1 Matrix, or a dynamic-sized Array,
   // then the argument is meant to be the size of the object.
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(
       Index size,
       std::enable_if_t<(Base::SizeAtCompileTime != 1 || !internal::is_convertible<T, Scalar>::value) &&
                            ((!internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value ||
@@ -782,7 +777,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar
   // type can be implicitly converted)
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(
       const Scalar& val0,
       std::enable_if_t<Base::SizeAtCompileTime == 1 && internal::is_convertible<T, Scalar>::value, T>* = 0) {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
@@ -792,7 +787,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
   // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar
   // type match the index type)
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(
       const Index& val0,
       std::enable_if_t<(!internal::is_same<Index, Scalar>::value) && (internal::is_same<Index, T>::value) &&
                            Base::SizeAtCompileTime == 1 && internal::is_convertible<T, Scalar>::value,
@@ -803,42 +798,42 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
 
   // Initialize a fixed size matrix from a pointer to raw data
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar* data) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(const Scalar* data) {
     this->_set_noalias(ConstMapType(data));
   }
 
   // Initialize an arbitrary matrix from a dense expression
   template <typename T, typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other) {
     this->_set_noalias(other);
   }
 
   // Initialize an arbitrary matrix from an object convertible to the Derived type.
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Derived& other) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(const Derived& other) {
     this->_set_noalias(other);
   }
 
   // Initialize an arbitrary matrix from a generic Eigen expression
   template <typename T, typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other) {
     this->derived() = other;
   }
 
   template <typename T, typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other) {
     resize(other.rows(), other.cols());
     other.evalTo(this->derived());
   }
 
   template <typename T, typename OtherDerived, int ColsAtCompileTime>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived, ColsAtCompileTime>& r) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived, ColsAtCompileTime>& r) {
     this->derived() = r;
   }
 
   // For fixed-size Array<Scalar,...>
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(
       const Scalar& val0,
       std::enable_if_t<Base::SizeAtCompileTime != Dynamic && Base::SizeAtCompileTime != 1 &&
                            internal::is_convertible<T, Scalar>::value &&
@@ -849,7 +844,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
 
   // For fixed-size Array<Index,...>
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void _init1(
       const Index& val0,
       std::enable_if_t<(!internal::is_same<Index, Scalar>::value) && (internal::is_same<Index, T>::value) &&
                            Base::SizeAtCompileTime != Dynamic && Base::SizeAtCompileTime != 1 &&
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index e16c7cc963b..bc8b9dad360 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -219,16 +219,16 @@ class Product
   using TransposeReturnType = typename internal::product_transpose_helper<Lhs, Rhs, Option>::TransposeType;
   using AdjointReturnType = typename internal::product_transpose_helper<Lhs, Rhs, Option>::AdjointType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {
     eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" &&
                  "if you wanted a coeff-wise or a dot product use the respective explicit functions");
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNestedCleaned& lhs() const { return m_lhs; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNestedCleaned& rhs() const { return m_rhs; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const LhsNestedCleaned& lhs() const { return m_lhs; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const RhsNestedCleaned& rhs() const { return m_rhs; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeReturnType transpose() const {
     return internal::product_transpose_helper<Lhs, Rhs, Option>::run_transpose(*this);
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index ce8d954bffc..d789f7501cb 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -15,6 +15,13 @@
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"
 
+// C4804: unsafe use of type 'bool' in operation. Unavoidable in generic code
+// instantiated with bool scalars (e.g. += and * on bool).
+#if EIGEN_COMP_MSVC
+#pragma warning(push)
+#pragma warning(disable : 4804)
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -29,30 +36,27 @@ namespace internal {
  */
 template <typename Lhs, typename Rhs, int Options>
 struct evaluator<Product<Lhs, Rhs, Options>> : public product_evaluator<Product<Lhs, Rhs, Options>> {
-  typedef Product<Lhs, Rhs, Options> XprType;
-  typedef product_evaluator<XprType> Base;
+  using XprType = Product<Lhs, Rhs, Options>;
+  using Base = product_evaluator<XprType>;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
-// TODO we should apply that rule only if that's really helpful
+// TODO: we should apply that rule only if that's really helpful
 template <typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
 struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
                                                const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
-                                               const Product<Lhs, Rhs, DefaultProduct>>> {
-  static const bool value = true;
-};
+                                               const Product<Lhs, Rhs, DefaultProduct>>> : std::true_type {};
 template <typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
 struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
                                const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
                                const Product<Lhs, Rhs, DefaultProduct>>>
     : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1, Lhs, product), Rhs, DefaultProduct>> {
-  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
-                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
-                        const Product<Lhs, Rhs, DefaultProduct>>
-      XprType;
-  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1, Lhs, product), Rhs, DefaultProduct>> Base;
+  using XprType = CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
+                                const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                                const Product<Lhs, Rhs, DefaultProduct>>;
+  using Base = evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1, Lhs, product), Rhs, DefaultProduct>>;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
       : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {}
@@ -61,8 +65,8 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
 template <typename Lhs, typename Rhs, int DiagIndex>
 struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex>>
     : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>> {
-  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
-  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>> Base;
+  using XprType = Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex>;
+  using Base = evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>>;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
       : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
@@ -78,18 +82,16 @@ template <typename Lhs, typename Rhs, typename LhsShape = typename evaluator_tra
 struct generic_product_impl;
 
 template <typename Lhs, typename Rhs>
-struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct>> {
-  static const bool value = true;
-};
+struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct>> : std::true_type {};
 
 // This is the default evaluator implementation for products:
 // It creates a temporary and call generic_product_impl
 template <typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
 struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
     : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject> {
-  typedef Product<Lhs, Rhs, Options> XprType;
-  typedef typename XprType::PlainObject PlainObject;
-  typedef evaluator<PlainObject> Base;
+  using XprType = Product<Lhs, Rhs, Options>;
+  using PlainObject = typename XprType::PlainObject;
+  using Base = evaluator<PlainObject>;
   enum { Flags = Base::Flags | EvalBeforeNestingBit };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
@@ -123,7 +125,7 @@ struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsSh
 template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::assign_op<Scalar, Scalar>, Dense2Dense,
                   std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
-  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  using SrcXprType = Product<Lhs, Rhs, Options>;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
                                                         const internal::assign_op<Scalar, Scalar>&) {
     Index dstRows = src.rows();
@@ -138,7 +140,7 @@ struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::assign_op<Sc
 template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::add_assign_op<Scalar, Scalar>, Dense2Dense,
                   std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
-  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  using SrcXprType = Product<Lhs, Rhs, Options>;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
                                                         const internal::add_assign_op<Scalar, Scalar>&) {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -151,7 +153,7 @@ struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::add_assign_o
 template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::sub_assign_op<Scalar, Scalar>, Dense2Dense,
                   std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
-  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  using SrcXprType = Product<Lhs, Rhs, Options>;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
                                                         const internal::sub_assign_op<Scalar, Scalar>&) {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -161,7 +163,7 @@ struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::sub_assign_o
 };
 
 // Dense ?= scalar * Product
-// TODO we should apply that rule if that's really helpful
+// TODO: we should apply that rule if that's really helpful
 // for instance, this is not good for inner products
 template <typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis,
           typename Plain>
@@ -170,10 +172,9 @@ struct Assignment<DstXprType,
                                 const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>, Plain>,
                                 const Product<Lhs, Rhs, DefaultProduct>>,
                   AssignFunc, Dense2Dense> {
-  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis, Scalar>,
-                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>, Plain>,
-                        const Product<Lhs, Rhs, DefaultProduct>>
-      SrcXprType;
+  using SrcXprType = CwiseBinaryOp<internal::scalar_product_op<ScalarBis, Scalar>,
+                                   const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>, Plain>,
+                                   const Product<Lhs, Rhs, DefaultProduct>>;
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
                                                         const AssignFunc& func) {
     call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs()) * src.rhs().rhs(), func);
@@ -182,25 +183,21 @@ struct Assignment<DstXprType,
 
 //----------------------------------------
 // Catch "Dense ?= xpr + Product<>" expression to save one temporary
-// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
+// FIXME: consider enabling these rules for all product types, not only Dense and DefaultProduct.
 
 template <typename OtherXpr, typename Lhs, typename Rhs>
 struct evaluator_assume_aliasing<
     CwiseBinaryOp<
         internal::scalar_sum_op<typename OtherXpr::Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
         const OtherXpr, const Product<Lhs, Rhs, DefaultProduct>>,
-    DenseShape> {
-  static const bool value = true;
-};
+    DenseShape> : std::true_type {};
 
 template <typename OtherXpr, typename Lhs, typename Rhs>
 struct evaluator_assume_aliasing<
     CwiseBinaryOp<
         internal::scalar_difference_op<typename OtherXpr::Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
         const OtherXpr, const Product<Lhs, Rhs, DefaultProduct>>,
-    DenseShape> {
-  static const bool value = true;
-};
+    DenseShape> : std::true_type {};
 
 template <typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
 struct assignment_from_xpr_op_product {
@@ -237,17 +234,17 @@ template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, InnerProduct> {
   using impl = default_inner_product_impl<Lhs, Rhs, false>;
   template <typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     dst.coeffRef(0, 0) = impl::run(lhs, rhs);
   }
 
   template <typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     dst.coeffRef(0, 0) += impl::run(lhs, rhs);
   }
 
   template <typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     dst.coeffRef(0, 0) -= impl::run(lhs, rhs);
   }
 };
@@ -280,13 +277,48 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, cons
   for (Index i = 0; i < rows; ++i) func(dst.row(i), lhsEval.coeff(i, Index(0)) * actual_rhs);
 }
 
+template <typename Dst>  // Run-time predicate used to pick the per-coefficient outer-product path.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool outer_product_use_small_assignment(const Dst& dst) {
+  return dst.rows() <= 16 && dst.cols() <= 16;  // true only when both dimensions of dst are at most 16
+}
+
+template <typename Dst, typename Lhs, typename Rhs, typename Func, typename Scalar>
+void EIGEN_DEVICE_FUNC outer_product_selector_run_small(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                        const Scalar& alpha, const false_type&) {
+  evaluator<Rhs> rhsEval(rhs);  // rhs is read coefficient by coefficient through its evaluator
+  ei_declare_local_nested_eval(Lhs, lhs, Rhs::SizeAtCompileTime, actual_lhs);  // declares actual_lhs
+  const Index rows = dst.rows();  // loop bounds come from dst, not from lhs/rhs
+  const Index cols = dst.cols();
+  for (Index j = 0; j < cols; ++j) {  // false_type overload: outer loop over columns of dst
+    const Scalar rhs_j = rhsEval.coeff(Index(0), j);  // rhs entry for column j, stored as Scalar
+    for (Index i = 0; i < rows; ++i) {  // func.assignCoeff combines alpha * (rhs_j * lhs_i) into dst(i, j)
+      func.assignCoeff(dst.coeffRef(i, j), alpha * (rhs_j * actual_lhs.coeff(i, Index(0))));
+    }  // NOTE(review): alpha is multiplied in for every coefficient, even if the caller passes Scalar(1)
+  }
+}
+
+template <typename Dst, typename Lhs, typename Rhs, typename Func, typename Scalar>
+void EIGEN_DEVICE_FUNC outer_product_selector_run_small(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                        const Scalar& alpha, const true_type&) {
+  evaluator<Lhs> lhsEval(lhs);  // lhs is read coefficient by coefficient through its evaluator
+  ei_declare_local_nested_eval(Rhs, rhs, Lhs::SizeAtCompileTime, actual_rhs);  // declares actual_rhs
+  const Index rows = dst.rows();  // loop bounds come from dst, not from lhs/rhs
+  const Index cols = dst.cols();
+  for (Index i = 0; i < rows; ++i) {  // true_type overload: outer loop over rows of dst
+    const Scalar lhs_i = lhsEval.coeff(i, Index(0));  // lhs entry for row i, stored as Scalar
+    for (Index j = 0; j < cols; ++j) {  // func.assignCoeff combines alpha * (lhs_i * rhs_j) into dst(i, j)
+      func.assignCoeff(dst.coeffRef(i, j), alpha * (lhs_i * actual_rhs.coeff(Index(0), j)));
+    }  // NOTE(review): alpha is multiplied in for every coefficient, even if the caller passes Scalar(1)
+  }
+}
+
 template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
   template <typename T>
   struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
 
-  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
+  // TODO: it would be nice to be able to exploit our *_assign_op functors for that purpose
   struct set {
     template <typename Dst, typename Src>
     EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
@@ -320,30 +352,50 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {  // run-time size test on dst
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar, Scalar>(),
+                                                 Scalar(1), is_row_major<Dst>());  // alpha = 1: dst = lhs * rhs
+    } else {  // larger dst: general path using the set functor
+      internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+    }
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {  // run-time size test on dst
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>(),
+                                                 Scalar(1), is_row_major<Dst>());  // alpha = 1: dst += lhs * rhs
+    } else {  // larger dst: general path using the add functor
+      internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+    }
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {  // run-time size test on dst
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar, Scalar>(),
+                                                 Scalar(1), is_row_major<Dst>());  // alpha = 1: dst -= lhs * rhs
+    } else {  // larger dst: general path using the sub functor
+      internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+    }
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs,
                                                                   const Scalar& alpha) {
-    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {  // run-time size test on dst
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>(),
+                                                 alpha, is_row_major<Dst>());  // dst += alpha * (lhs * rhs)
+    } else {  // larger dst: general path using the adds(alpha) functor
+      internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+    }
   }
 };
 
 // This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
 template <typename Lhs, typename Rhs, typename Derived>
 struct generic_product_impl_base {
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
@@ -371,11 +423,11 @@ struct generic_product_impl_base {
 template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemvProduct>
     : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemvProduct>> {
-  typedef typename nested_eval<Lhs, 1>::type LhsNested;
-  typedef typename nested_eval<Rhs, 1>::type RhsNested;
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using LhsNested = typename nested_eval<Lhs, 1>::type;
+  using RhsNested = typename nested_eval<Rhs, 1>::type;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
   enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-  typedef internal::remove_all_t<std::conditional_t<int(Side) == OnTheRight, LhsNested, RhsNested>> MatrixType;
+  using MatrixType = internal::remove_all_t<std::conditional_t<int(Side) == OnTheRight, LhsNested, RhsNested>>;
 
   template <typename Dest>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs,
@@ -396,7 +448,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemvProduct>
 
 template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> {
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
@@ -439,7 +491,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductM
       ConjLhs = blas_traits<Lhs>::NeedToConjugate,
       ConjRhs = blas_traits<Rhs>::NeedToConjugate
     };
-    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    // FIXME: this should be auto, and extractScalarFactor should also return auto
     //        this is important for real*complex_mat
     Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
 
@@ -485,9 +537,9 @@ struct etor_product_packet_impl;
 template <typename Lhs, typename Rhs, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
     : evaluator_base<Product<Lhs, Rhs, LazyProduct>> {
-  typedef Product<Lhs, Rhs, LazyProduct> XprType;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  using XprType = Product<Lhs, Rhs, LazyProduct>;
+  using Scalar = typename XprType::Scalar;
+  using CoeffReturnType = typename XprType::CoeffReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
       : m_lhs(xpr.lhs()),
@@ -500,30 +552,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-#if 0
-    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
-    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
-    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
-    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
-    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
-    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
-    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
-    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
-    std::cerr << "Alignment=            " << Alignment << "\n";
-    std::cerr << "Flags=                " << Flags << "\n";
-#endif
   }
 
   // Everything below here is taken from CoeffBasedProduct.h
 
-  typedef typename internal::nested_eval<Lhs, Rhs::ColsAtCompileTime>::type LhsNested;
-  typedef typename internal::nested_eval<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+  using LhsNested = typename internal::nested_eval<Lhs, Rhs::ColsAtCompileTime>::type;
+  using RhsNested = typename internal::nested_eval<Rhs, Lhs::RowsAtCompileTime>::type;
 
-  typedef internal::remove_all_t<LhsNested> LhsNestedCleaned;
-  typedef internal::remove_all_t<RhsNested> RhsNestedCleaned;
+  using LhsNestedCleaned = internal::remove_all_t<LhsNested>;
+  using RhsNestedCleaned = internal::remove_all_t<RhsNested>;
 
-  typedef evaluator<LhsNestedCleaned> LhsEtorType;
-  typedef evaluator<RhsNestedCleaned> RhsEtorType;
+  using LhsEtorType = evaluator<LhsNestedCleaned>;
+  using RhsEtorType = evaluator<RhsNestedCleaned>;
 
   enum {
     RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
@@ -533,78 +573,77 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
   };
 
-  typedef typename find_best_packet<Scalar, RowsAtCompileTime>::type LhsVecPacketType;
-  typedef typename find_best_packet<Scalar, ColsAtCompileTime>::type RhsVecPacketType;
-
-  enum {
-
-    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
-    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
-    CoeffReadCost = InnerSize == 0 ? NumTraits<Scalar>::ReadCost
-                    : InnerSize == Dynamic
-                        ? HugeCost
-                        : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost)) +
-                              (InnerSize - 1) * NumTraits<Scalar>::AddCost,
-
-    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-
-    LhsFlags = LhsEtorType::Flags,
-    RhsFlags = RhsEtorType::Flags,
-
-    LhsRowMajor = LhsFlags & RowMajorBit,
-    RhsRowMajor = RhsFlags & RowMajorBit,
-
-    LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
-    RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
-
-    // Here, we don't care about alignment larger than the usable packet size.
-    LhsAlignment =
-        plain_enum_min(LhsEtorType::Alignment, LhsVecPacketSize* int(sizeof(typename LhsNestedCleaned::Scalar))),
-    RhsAlignment =
-        plain_enum_min(RhsEtorType::Alignment, RhsVecPacketSize* int(sizeof(typename RhsNestedCleaned::Scalar))),
-
-    SameType = is_same<typename LhsNestedCleaned::Scalar, typename RhsNestedCleaned::Scalar>::value,
-
-    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime != 1),
-    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime != 1),
-
-    EvalToRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1) ? 1
-                     : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1)
-                         ? 0
-                         : (bool(RhsRowMajor) && !CanVectorizeLhs),
-
-    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit) |
-            (EvalToRowMajor ? RowMajorBit : 0)
-            // TODO enable vectorization for mixed types
-            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) |
-            (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
-
-    LhsOuterStrideBytes =
-        int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
-    RhsOuterStrideBytes =
-        int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
-
-    Alignment = bool(CanVectorizeLhs)
-                    ? (LhsOuterStrideBytes <= 0 || (int(LhsOuterStrideBytes) % plain_enum_max(1, LhsAlignment)) != 0
-                           ? 0
-                           : LhsAlignment)
-                : bool(CanVectorizeRhs)
-                    ? (RhsOuterStrideBytes <= 0 || (int(RhsOuterStrideBytes) % plain_enum_max(1, RhsAlignment)) != 0
-                           ? 0
-                           : RhsAlignment)
-                    : 0,
-
-    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
-     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
-     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
-     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
-     */
-    CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) &&
-                        (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit) &&
-                        (int(InnerSize) % packet_traits<Scalar>::size == 0)
-  };
+  using LhsVecPacketType = typename find_best_packet<Scalar, RowsAtCompileTime>::type;
+  using RhsVecPacketType = typename find_best_packet<Scalar, ColsAtCompileTime>::type;
+
+  static constexpr int LhsCoeffReadCost = LhsEtorType::CoeffReadCost;
+  static constexpr int RhsCoeffReadCost = RhsEtorType::CoeffReadCost;
+  static constexpr int CoeffReadCost =
+      InnerSize == 0 ? NumTraits<Scalar>::ReadCost
+      : InnerSize == Dynamic
+          ? HugeCost
+          : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost)) +
+                (InnerSize - 1) * NumTraits<Scalar>::AddCost;
+
+  static constexpr bool Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT;
+
+  static constexpr int LhsFlags = LhsEtorType::Flags;
+  static constexpr int RhsFlags = RhsEtorType::Flags;
+
+  static constexpr int LhsRowMajor = LhsFlags & RowMajorBit;
+  static constexpr int RhsRowMajor = RhsFlags & RowMajorBit;
+
+  static constexpr int LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size;
+  static constexpr int RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size;
+
+  // Here, we don't care about alignment larger than the usable packet size.
+  static constexpr int LhsAlignment =
+      plain_enum_min(LhsEtorType::Alignment, LhsVecPacketSize* int(sizeof(typename LhsNestedCleaned::Scalar)));
+  static constexpr int RhsAlignment =
+      plain_enum_min(RhsEtorType::Alignment, RhsVecPacketSize* int(sizeof(typename RhsNestedCleaned::Scalar)));
+
+  static constexpr bool SameType = is_same<typename LhsNestedCleaned::Scalar, typename RhsNestedCleaned::Scalar>::value;
+
+  static constexpr bool CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime != 1);
+  static constexpr bool CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime != 1);
+
+  static constexpr int EvalToRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1) ? 1
+                                        : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1)
+                                            ? 0
+                                            : (bool(RhsRowMajor) && !CanVectorizeLhs);
+
+  static constexpr int Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit) |
+                               (EvalToRowMajor ? RowMajorBit : 0)
+                               // TODO: enable vectorization for mixed types
+                               | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) |
+                               (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0);
+
+  static constexpr int LhsOuterStrideBytes =
+      int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar));
+  static constexpr int RhsOuterStrideBytes =
+      int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar));
+
+  static constexpr int Alignment =
+      bool(CanVectorizeLhs)
+          ? (LhsOuterStrideBytes <= 0 || (int(LhsOuterStrideBytes) % plain_enum_max(1, LhsAlignment)) != 0
+                 ? 0
+                 : LhsAlignment)
+      : bool(CanVectorizeRhs)
+          ? (RhsOuterStrideBytes <= 0 || (int(RhsOuterStrideBytes) % plain_enum_max(1, RhsAlignment)) != 0
+                 ? 0
+                 : RhsAlignment)
+          : 0;
+
+  /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+   * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+   * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+   * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+   */
+  static constexpr bool CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) &&
+                                            (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit) &&
+                                            (int(InnerSize) % packet_traits<Scalar>::size == 0);
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const {
     return (m_lhs.row(row).transpose().cwiseProduct(m_rhs.col(col))).sum();
   }
 
@@ -612,7 +651,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    * which is why we don't set the LinearAccessBit.
    * TODO: this seems possible when the result is a vector
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index index) const {
     const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
     const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
     return (m_lhs.row(row).transpose().cwiseProduct(m_rhs.col(col))).sum();
@@ -621,9 +660,9 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   template <int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const {
     PacketType res;
-    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
-                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
-        PacketImpl;
+    using PacketImpl =
+        etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                 Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>;
     PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
     return res;
   }
@@ -639,9 +678,9 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
                                                                        Index count) const {
     PacketType res;
-    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
-                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
-        PacketImpl;
+    using PacketImpl =
+        etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                 Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>;
     PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);
     return res;
   }
@@ -660,16 +699,15 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   LhsEtorType m_lhsImpl;
   RhsEtorType m_rhsImpl;
 
-  // TODO: Get rid of m_innerDim if known at compile time
-  Index m_innerDim;
+  variable_if_dynamic<Index, InnerSize> m_innerDim;
 };
 
 template <typename Lhs, typename Rhs>
 struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape>
     : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape> {
-  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
-  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
-  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base;
+  using XprType = Product<Lhs, Rhs, DefaultProduct>;
+  using BaseProduct = Product<Lhs, Rhs, LazyProduct>;
+  using Base = product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape>;
   enum { Flags = Base::Flags | EvalBeforeNestingBit };
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
       : Base(BaseProduct(xpr.lhs(), xpr.rhs())) {}
@@ -717,8 +755,8 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
-                                                        Index /*innerDim*/, Packet& res) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                  Index /*innerDim*/, Packet& res) {
     res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
   }
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
@@ -731,8 +769,8 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> {
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
-                                                        Index /*innerDim*/, Packet& res) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                  Index /*innerDim*/, Packet& res) {
     res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
@@ -812,7 +850,7 @@ struct triangular_product_impl;
 template <typename Lhs, typename Rhs, int ProductTag>
 struct generic_product_impl<Lhs, Rhs, TriangularShape, DenseShape, ProductTag>
     : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, TriangularShape, DenseShape, ProductTag>> {
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
 
   template <typename Dest>
   static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
@@ -824,7 +862,7 @@ struct generic_product_impl<Lhs, Rhs, TriangularShape, DenseShape, ProductTag>
 template <typename Lhs, typename Rhs, int ProductTag>
 struct generic_product_impl<Lhs, Rhs, DenseShape, TriangularShape, ProductTag>
     : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, TriangularShape, ProductTag>> {
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
 
   template <typename Dest>
   static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
@@ -842,11 +880,11 @@ struct selfadjoint_product_impl;
 template <typename Lhs, typename Rhs, int ProductTag>
 struct generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>
     : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>> {
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
 
   template <typename Dest>
   static EIGEN_DEVICE_FUNC void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
-    selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::IsVectorAtCompileTime>::run(
+    selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::ColsAtCompileTime == 1>::run(
         dst, lhs.nestedExpression(), rhs, alpha);
   }
 };
@@ -854,11 +892,11 @@ struct generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>
 template <typename Lhs, typename Rhs, int ProductTag>
 struct generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>
     : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>> {
-  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  using Scalar = typename Product<Lhs, Rhs>::Scalar;
 
   template <typename Dest>
   static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
-    selfadjoint_product_impl<Lhs, 0, Lhs::IsVectorAtCompileTime, typename Rhs::MatrixType, Rhs::Mode, false>::run(
+    selfadjoint_product_impl<Lhs, 0, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, Rhs::Mode, false>::run(
         dst, lhs, rhs.nestedExpression(), alpha);
   }
 };
@@ -869,7 +907,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>
 
 template <typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
 struct diagonal_product_evaluator_base : evaluator_base<Derived> {
-  typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+  using Scalar = typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType;
 
  public:
   enum {
@@ -896,8 +934,8 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
                     (ScalarAccessOnDiag_ || (bool(int(DiagFlags) & PacketAccessBit))),
     LinearAccessMask_ =
         (MatrixType::RowsAtCompileTime == 1 || MatrixType::ColsAtCompileTime == 1) ? LinearAccessBit : 0,
-    Flags =
-        ((HereditaryBits | LinearAccessMask_) & (unsigned int)(MatrixFlags)) | (Vectorizable_ ? PacketAccessBit : 0),
+    Flags = ((HereditaryBits | LinearAccessMask_) & static_cast<unsigned int>(MatrixFlags)) |
+            (Vectorizable_ ? PacketAccessBit : 0),
     Alignment = evaluator<MatrixType>::Alignment,
 
     AsScalarProduct =
@@ -913,7 +951,7 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const {
     if (AsScalarProduct)
       return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
     else
@@ -932,8 +970,9 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
     enum {
       InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
       DiagonalPacketLoadMode = plain_enum_min(
-          LoadMode,
-          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
+          LoadMode, ((InnerSize * int(sizeof(Scalar))) % int(unpacket_traits<PacketType>::alignment) == 0)
+                        ? int(unpacket_traits<PacketType>::alignment)
+                        : int(evaluator<DiagonalType>::Alignment))
     };
     return internal::pmul(m_matImpl.template packet<LoadMode, PacketType>(row, col),
                           m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
@@ -952,8 +991,9 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
     enum {
       InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
       DiagonalPacketLoadMode = plain_enum_min(
-          LoadMode,
-          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
+          LoadMode, ((InnerSize * int(sizeof(Scalar))) % int(unpacket_traits<PacketType>::alignment) == 0)
+                        ? int(unpacket_traits<PacketType>::alignment)
+                        : int(evaluator<DiagonalType>::Alignment))
     };
     return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
                           m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
@@ -968,24 +1008,23 @@ template <typename Lhs, typename Rhs, int ProductKind, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
     : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
                                       OnTheLeft> {
-  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
-                                          OnTheLeft>
-      Base;
+  using Base =
+      diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>;
   using Base::coeff;
   using Base::m_diagImpl;
   using Base::m_matImpl;
-  typedef typename Base::Scalar Scalar;
+  using Scalar = typename Base::Scalar;
 
-  typedef Product<Lhs, Rhs, ProductKind> XprType;
-  typedef typename XprType::PlainObject PlainObject;
-  typedef typename Lhs::DiagonalVectorType DiagonalType;
+  using XprType = Product<Lhs, Rhs, ProductKind>;
+  using PlainObject = typename XprType::PlainObject;
+  using DiagonalType = typename Lhs::DiagonalVectorType;
 
   static constexpr int StorageOrder = Base::StorageOrder_;
   using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;
 
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
+  EIGEN_DEVICE_FUNC constexpr explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
     return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
   }
 
@@ -1023,23 +1062,22 @@ template <typename Lhs, typename Rhs, int ProductKind, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
     : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
                                       OnTheRight> {
-  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
-                                          OnTheRight>
-      Base;
+  using Base = diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                               OnTheRight>;
   using Base::coeff;
   using Base::m_diagImpl;
   using Base::m_matImpl;
-  typedef typename Base::Scalar Scalar;
+  using Scalar = typename Base::Scalar;
 
-  typedef Product<Lhs, Rhs, ProductKind> XprType;
-  typedef typename XprType::PlainObject PlainObject;
+  using XprType = Product<Lhs, Rhs, ProductKind>;
+  using PlainObject = typename XprType::PlainObject;
 
   static constexpr int StorageOrder = Base::StorageOrder_;
   using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;
 
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
+  EIGEN_DEVICE_FUNC constexpr explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
     return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
   }
 
@@ -1081,8 +1119,8 @@ struct permutation_matrix_product;
 
 template <typename ExpressionType, int Side, bool Transposed>
 struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape> {
-  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
-  typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+  using MatrixType = typename nested_eval<ExpressionType, 1>::type;
+  using MatrixTypeCleaned = remove_all_t<MatrixType>;
 
   template <typename Dest, typename PermutationType>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm,
@@ -1170,7 +1208,7 @@ struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, Pr
  * Products with transpositions matrices
  ***************************************************************************/
 
-// FIXME could we unify Transpositions and Permutation into a single "shape"??
+// FIXME: consider unifying Transpositions and Permutation into a single shape.
 
 /** \internal
  * \class transposition_matrix_product
@@ -1178,14 +1216,14 @@ struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, Pr
  */
 template <typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
 struct transposition_matrix_product {
-  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
-  typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+  using MatrixType = typename nested_eval<ExpressionType, 1>::type;
+  using MatrixTypeCleaned = remove_all_t<MatrixType>;
 
   template <typename Dest, typename TranspositionType>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr,
                                                         const ExpressionType& xpr) {
     MatrixType mat(xpr);
-    typedef typename TranspositionType::StorageIndex StorageIndex;
+    using StorageIndex = typename TranspositionType::StorageIndex;
     const Index size = tr.size();
     StorageIndex j = 0;
 
@@ -1264,8 +1302,28 @@ struct generic_product_impl<Lhs, Rhs, SkewSymmetricShape, SkewSymmetricShape, Pr
   }
 };
 
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>  // Homogeneous rhs -> dense PlainObject
+struct generic_product_impl<Lhs, Rhs, MatrixShape, HomogeneousShape, ProductTag>
+    : generic_product_impl<Lhs, typename Rhs::PlainObject, MatrixShape, DenseShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>  // Homogeneous lhs -> dense PlainObject
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, MatrixShape, ProductTag>
+    : generic_product_impl<typename Lhs::PlainObject, Rhs, DenseShape, MatrixShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag>  // Permutation * Homogeneous: same as Permutation * Dense
+struct generic_product_impl<Lhs, Rhs, PermutationShape, HomogeneousShape, ProductTag>
+    : generic_product_impl<Lhs, Rhs, PermutationShape, DenseShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag>  // Homogeneous * Permutation: same as Dense * Permutation
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, PermutationShape, ProductTag>
+    : generic_product_impl<Lhs, Rhs, DenseShape, PermutationShape, ProductTag> {};
+
 }  // end namespace internal
 
 }  // end namespace Eigen
 
+#if EIGEN_COMP_MSVC
+#pragma warning(pop)
+#endif
+
 #endif  // EIGEN_PRODUCT_EVALUATORS_H
diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index f8a54356250..91543ebb1a5 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -19,7 +19,7 @@ namespace internal {
 
 template <typename Scalar>
 struct scalar_random_op {
-  inline const Scalar operator()() const { return random<Scalar>(); }
+  inline Scalar operator()() const { return random<Scalar>(); }
 };
 
 template <typename Scalar>
@@ -51,7 +51,7 @@ struct functor_traits<scalar_random_op<Scalar> > {
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
  *
- * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
+ * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using std random generators.
  *
  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
  */
diff --git a/Eigen/src/Core/RandomImpl.h b/Eigen/src/Core/RandomImpl.h
index efba33680d7..4a622fc3733 100644
--- a/Eigen/src/Core/RandomImpl.h
+++ b/Eigen/src/Core/RandomImpl.h
@@ -56,19 +56,21 @@ struct random_bits_impl {
   EIGEN_STATIC_ASSERT(std::is_unsigned<Scalar>::value, SCALAR MUST BE A BUILT - IN UNSIGNED INTEGER)
   using RandomDevice = eigen_random_device;
   using RandomReturnType = typename RandomDevice::ReturnType;
-  static constexpr int kEntropy = RandomDevice::Entropy;
   static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
+  static constexpr int kEntropy = plain_enum_min(kTotalBits, RandomDevice::Entropy);
   // return a Scalar filled with numRandomBits beginning from the least significant bit
   static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
     eigen_assert((numRandomBits >= 0) && (numRandomBits <= kTotalBits));
-    const Scalar mask = Scalar(-1) >> ((kTotalBits - numRandomBits) & (kTotalBits - 1));
     Scalar randomBits = 0;
-    for (int shift = 0; shift < numRandomBits; shift += kEntropy) {
-      RandomReturnType r = RandomDevice::run();
-      randomBits |= static_cast<Scalar>(r) << shift;
+    for (int filledBits = 0; filledBits < numRandomBits; filledBits += kEntropy) {
+      Scalar r = static_cast<Scalar>(RandomDevice::run());
+      int remainingBits = numRandomBits - filledBits;
+      if (remainingBits < kEntropy) {
+        // clear the excess bits to avoid UB and rounding bias
+        r >>= kEntropy - remainingBits;
+      }
+      randomBits |= r << filledBits;
     }
-    // clear the excess bits
-    randomBits &= mask;
     return randomBits;
   }
 };
@@ -131,8 +133,15 @@ struct random_longdouble_impl {
     uint64_t randomBits[2];
     long double result = 2.0L;
     memcpy(&randomBits, &result, Size);
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     randomBits[0] |= getRandomBits<uint64_t>(numLowBits);
     randomBits[1] |= getRandomBits<uint64_t>(numHighBits);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    randomBits[0] |= getRandomBits<uint64_t>(numHighBits);
+    randomBits[1] |= getRandomBits<uint64_t>(numLowBits);
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     memcpy(&result, &randomBits, Size);
     result -= 3.0L;
     return result;
@@ -197,7 +206,8 @@ struct random_int_impl<Scalar, false, true> {
 template <typename Scalar>
 struct random_int_impl<Scalar, true, true> {
   static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
-  using BitsType = typename make_unsigned<Scalar>::type;
+  // avoid implicit integral promotion to `int`
+  using BitsType = std::conditional_t<(sizeof(Scalar) < sizeof(int)), unsigned int, std::make_unsigned_t<Scalar> >;
   static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
     if (y <= x) return x;
     // Avoid overflow by representing `range` as an unsigned type
diff --git a/Eigen/src/Core/RealView.h b/Eigen/src/Core/RealView.h
new file mode 100644
index 00000000000..59cd6d54ee6
--- /dev/null
+++ b/Eigen/src/Core/RealView.h
@@ -0,0 +1,292 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALVIEW_H
+#define EIGEN_REALVIEW_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Write access and vectorization require array-oriented access to the real and imaginary components.
+// From https://en.cppreference.com/w/cpp/numeric/complex.html:
+// For any pointer to an element of an array of std::complex<T> named p and any valid array index i,
+// reinterpret_cast<T*>(p)[2 * i] is the real part of the complex number p[i], and
+// reinterpret_cast<T*>(p)[2 * i + 1] is the imaginary part of the complex number p[i].
+
+template <typename T>
+struct complex_array_access : std::false_type {};  // primary template: array-oriented access is not assumed
+template <typename T>
+struct complex_array_access<std::complex<T>> : std::true_type {};  // std::complex<T> provides the guarantee above
+
+template <typename Xpr>  // traits of RealView<Xpr>: derived from traits<Xpr>, with the real type as Scalar
+struct traits<RealView<Xpr>> : public traits<Xpr> {
+  template <typename T>
+  static constexpr int double_size(T size, bool times_two) {  // size, or 2 * size if times_two; Dynamic is kept
+    int size_as_int = int(size);
+    if (size_as_int == Dynamic) return Dynamic;
+    return times_two ? (2 * size_as_int) : size_as_int;
+  }
+
+  using Base = traits<Xpr>;
+  using ComplexScalar = typename Base::Scalar;
+  using Scalar = typename NumTraits<ComplexScalar>::Real;
+  // Direct access requires array-style complex storage; lvalue access additionally requires a non-const Xpr.
+  static constexpr bool ArrayAccess = complex_array_access<ComplexScalar>::value;
+  static constexpr int ActualDirectAccessBit = ArrayAccess ? DirectAccessBit : 0;
+  static constexpr int ActualLvaluebit = !std::is_const<Xpr>::value && ArrayAccess ? LvalueBit : 0;
+  static constexpr int ActualPacketAccessBit = packet_traits<Scalar>::Vectorizable ? PacketAccessBit : 0;
+  static constexpr int FlagMask =
+      ActualDirectAccessBit | ActualLvaluebit | ActualPacketAccessBit | HereditaryBits | LinearAccessBit;
+  static constexpr int BaseFlags = int(evaluator<Xpr>::Flags) | int(Base::Flags);
+  static constexpr int Flags = BaseFlags & FlagMask;  // bits outside FlagMask are dropped
+  static constexpr bool IsRowMajor = Flags & RowMajorBit;  // cols double if row-major, rows double otherwise
+  static constexpr int RowsAtCompileTime = double_size(Base::RowsAtCompileTime, !IsRowMajor);
+  static constexpr int ColsAtCompileTime = double_size(Base::ColsAtCompileTime, IsRowMajor);
+  static constexpr int SizeAtCompileTime = size_at_compile_time(RowsAtCompileTime, ColsAtCompileTime);
+  static constexpr int MaxRowsAtCompileTime = double_size(Base::MaxRowsAtCompileTime, !IsRowMajor);
+  static constexpr int MaxColsAtCompileTime = double_size(Base::MaxColsAtCompileTime, IsRowMajor);
+  static constexpr int MaxSizeAtCompileTime = size_at_compile_time(MaxRowsAtCompileTime, MaxColsAtCompileTime);
+  static constexpr int OuterStrideAtCompileTime = double_size(outer_stride_at_compile_time<Xpr>::ret, true);
+  static constexpr int InnerStrideAtCompileTime = inner_stride_at_compile_time<Xpr>::ret;  // taken from Xpr as is
+};
+
+template <typename Xpr>
+struct evaluator<RealView<Xpr>> : private evaluator<Xpr> {
+  using BaseEvaluator = evaluator<Xpr>;
+  using XprType = RealView<Xpr>;
+  using ExpressionTraits = traits<XprType>;
+  using ComplexScalar = typename ExpressionTraits::ComplexScalar;
+  using Scalar = typename ExpressionTraits::Scalar;
+
+  static constexpr int Flags = ExpressionTraits::Flags;
+  static constexpr int CoeffReadCost = BaseEvaluator::CoeffReadCost;
+  static constexpr int Alignment = BaseEvaluator::Alignment;
+  static constexpr bool IsRowMajor = ExpressionTraits::IsRowMajor;
+  static constexpr bool DirectAccess = (Flags & DirectAccessBit) != 0;
+
+  using ComplexCoeffReturnType = std::conditional_t<DirectAccess, const ComplexScalar&, ComplexScalar>;
+  using CoeffReturnType = std::conditional_t<DirectAccess, const Scalar&, Scalar>;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(XprType realView) : BaseEvaluator(realView.m_xpr) {}
+
+  template <bool Enable = DirectAccess, std::enable_if_t<!Enable, bool> = true>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const {
+    Index r = IsRowMajor ? row : row / 2;
+    Index c = IsRowMajor ? col / 2 : col;
+    bool p = (IsRowMajor ? col : row) & 1;
+    ComplexScalar ccoeff = BaseEvaluator::coeff(r, c);
+    return p ? numext::imag(ccoeff) : numext::real(ccoeff);
+  }
+  template <bool Enable = DirectAccess, std::enable_if_t<Enable, bool> = true>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    Index r = IsRowMajor ? row : row / 2;
+    Index c = IsRowMajor ? col / 2 : col;
+    Index p = (IsRowMajor ? col : row) & 1;
+    ComplexCoeffReturnType ccoeff = BaseEvaluator::coeff(r, c);
+    return reinterpret_cast<const Scalar(&)[2]>(ccoeff)[p];
+  }
+  template <bool Enable = DirectAccess, std::enable_if_t<!Enable, bool> = true>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index index) const {
+    ComplexScalar ccoeff = BaseEvaluator::coeff(index / 2);
+    bool p = index & 1;
+    return p ? numext::imag(ccoeff) : numext::real(ccoeff);
+  }
+  template <bool Enable = DirectAccess, std::enable_if_t<Enable, bool> = true>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    ComplexCoeffReturnType ccoeff = BaseEvaluator::coeff(index / 2);
+    Index p = index & 1;
+    return reinterpret_cast<const Scalar(&)[2]>(ccoeff)[p];
+  }
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    Index r = IsRowMajor ? row : row / 2;
+    Index c = IsRowMajor ? col / 2 : col;
+    Index p = (IsRowMajor ? col : row) & 1;
+    ComplexScalar& ccoeffRef = BaseEvaluator::coeffRef(r, c);
+    return reinterpret_cast<Scalar(&)[2]>(ccoeffRef)[p];
+  }
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    ComplexScalar& ccoeffRef = BaseEvaluator::coeffRef(index / 2);
+    Index p = index & 1;
+    return reinterpret_cast<Scalar(&)[2]>(ccoeffRef)[p];
+  }
+
+  // If the first index is odd (imaginary), discard the first scalar
+  // in 'result' and assign the missing scalar.
+  // This operation is safe as the real component of the first scalar must exist.
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    Index r = IsRowMajor ? row : row / 2;
+    Index c = IsRowMajor ? col / 2 : col;
+    bool p = (IsRowMajor ? col : row) & 1;
+    ComplexPacket cresult = BaseEvaluator::template packet<LoadMode, ComplexPacket>(r, c);
+    PacketType result = preinterpret<PacketType>(cresult);
+    if (p) {
+      Scalar aux[RealPacketSize + 1];
+      pstoreu(aux, result);
+      Index lastr = IsRowMajor ? row : row + RealPacketSize - 1;
+      Index lastc = IsRowMajor ? col + RealPacketSize - 1 : col;
+      aux[RealPacketSize] = coeff(lastr, lastc);
+      result = ploadu<PacketType>(aux + 1);
+    }
+    return result;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    ComplexPacket cresult = BaseEvaluator::template packet<LoadMode, ComplexPacket>(index / 2);
+    PacketType result = preinterpret<PacketType>(cresult);
+    bool p = index & 1;
+    if (p) {
+      Scalar aux[RealPacketSize + 1];
+      pstoreu(aux, result);
+      aux[RealPacketSize] = coeff(index + RealPacketSize - 1);
+      result = ploadu<PacketType>(aux + 1);
+    }
+    return result;
+  }
+
+  // The requested real packet segment forms the half-open interval [begin, end), where 'end' = 'begin' + 'count'.
+  // In order to access the underlying complex array, even indices must be aligned with the real components
+  // of the complex scalars. 'begin' and 'count' must be modified as follows:
+  // a) 'begin' must be rounded down to the nearest even number; and
+  // b) 'end' must be rounded up to the nearest even number.
+  // c) for an odd start index, the loaded lanes are shifted down by one and the last lane comes from coeff().
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    Index actualBegin = numext::round_down(begin, 2);  // 'begin' rounded down to even
+    Index actualEnd = numext::round_down(begin + count + 1, 2);  // 'end' rounded up to even
+    Index actualCount = actualEnd - actualBegin;
+    Index r = IsRowMajor ? row : row / 2;  // only the inner index is halved
+    Index c = IsRowMajor ? col / 2 : col;
+    ComplexPacket cresult =
+        BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(r, c, actualBegin / 2, actualCount / 2);
+    PacketType result = preinterpret<PacketType>(cresult);
+    bool p = (IsRowMajor ? col : row) & 1;  // odd inner index: the complex load began one real too early
+    if (p) {
+      Scalar aux[RealPacketSize + 1] = {};
+      pstoreu(aux, result);
+      Index lastr = IsRowMajor ? row : row + actualEnd - 1;
+      Index lastc = IsRowMajor ? col + actualEnd - 1 : col;
+      if (actualEnd == begin + count) aux[actualEnd] = coeff(lastr, lastc);  // else lane is unused; coeff may be OOB
+      result = ploadu<PacketType>(aux + 1);  // drop the leading real that precedes the requested start
+    }
+    return result;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    Index actualBegin = numext::round_down(begin, 2);  // 'begin' rounded down to even
+    Index actualEnd = numext::round_down(begin + count + 1, 2);  // 'end' rounded up to even
+    Index actualCount = actualEnd - actualBegin;
+    ComplexPacket cresult =
+        BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(index / 2, actualBegin / 2, actualCount / 2);
+    PacketType result = preinterpret<PacketType>(cresult);
+    bool p = index & 1;  // odd linear index: the complex load began one real too early
+    if (p) {
+      Scalar aux[RealPacketSize + 1] = {};
+      pstoreu(aux, result);
+      if (actualEnd == begin + count) aux[actualEnd] = coeff(index + actualEnd - 1);  // else unused lane, maybe OOB
+      result = ploadu<PacketType>(aux + 1);  // drop the leading real that precedes the requested start
+    }
+    return result;
+  }
+};
+
+}  // namespace internal
+
+template <typename Xpr>
+class RealView : public internal::dense_xpr_base<RealView<Xpr>>::type {  // views complex coefficients as reals
+  using ExpressionTraits = internal::traits<RealView>;
+  EIGEN_STATIC_ASSERT(NumTraits<typename Xpr::Scalar>::IsComplex, SCALAR MUST BE COMPLEX)
+ public:
+  using Scalar = typename ExpressionTraits::Scalar;
+  using Nested = RealView;
+
+  EIGEN_DEVICE_FUNC explicit RealView(Xpr& xpr) : m_xpr(xpr) {}  // keeps a reference to xpr; nothing is copied
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return Xpr::IsRowMajor ? m_xpr.rows() : 2 * m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return Xpr::IsRowMajor ? 2 * m_xpr.cols() : m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return 2 * m_xpr.size(); }  // two reals per complex
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_xpr.innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 2 * m_xpr.outerStride(); }
+  EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {  // arguments are real dimensions
+    m_xpr.resize(Xpr::IsRowMajor ? rows : rows / 2, Xpr::IsRowMajor ? cols / 2 : cols);  // halve the doubled dim
+  }
+  EIGEN_DEVICE_FUNC void resize(Index size) { m_xpr.resize(size / 2); }  // integer division: odd remainder dropped
+  EIGEN_DEVICE_FUNC Scalar* data() { return reinterpret_cast<Scalar*>(m_xpr.data()); }  // complex storage as Scalar
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return reinterpret_cast<const Scalar*>(m_xpr.data()); }
+
+  EIGEN_DEVICE_FUNC RealView(const RealView&) = default;  // copies the reference, not the coefficients
+
+  EIGEN_DEVICE_FUNC RealView& operator=(const RealView& other);  // declared here, defined out of line
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC RealView& operator=(const RealView<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC RealView& operator=(const DenseBase<OtherDerived>& other);
+
+ protected:
+  friend struct internal::evaluator<RealView>;  // the evaluator reads m_xpr directly
+  Xpr& m_xpr;  // the viewed complex expression (referenced, not owned)
+};
+
+template <typename Xpr>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const RealView& other) {
+  internal::call_assignment(*this, other);  // hand the copy to internal::call_assignment
+  return *this;
+}
+
+template <typename Xpr>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const RealView<OtherDerived>& other) {
+  internal::call_assignment(*this, other);  // source is a RealView over a different expression type
+  return *this;
+}
+
+template <typename Xpr>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const DenseBase<OtherDerived>& other) {
+  internal::call_assignment(*this, other.derived());  // pass the concrete expression, not the DenseBase wrapper
+  return *this;
+}
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC typename DenseBase<Derived>::RealViewReturnType DenseBase<Derived>::realView() {
+  return RealViewReturnType(derived());  // mutable overload: build the view from the concrete expression
+}
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC typename DenseBase<Derived>::ConstRealViewReturnType DenseBase<Derived>::realView() const {
+  return ConstRealViewReturnType(derived());  // const overload: same construction with the const return type
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_REALVIEW_H
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 4e9ab0e4f89..18ba1036452 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -101,7 +101,7 @@ struct redux_novec_unroller {
 
   typedef typename Evaluator::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func) {
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func) {
     return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval, func),
                 redux_novec_unroller<Func, Evaluator, Start + HalfLength, Length - HalfLength>::run(eval, func));
   }
@@ -114,7 +114,7 @@ struct redux_novec_unroller<Func, Evaluator, Start, 1> {
 
   typedef typename Evaluator::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func&) {
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func&) {
     return eval.coeffByOuterInner(outer, inner);
   }
 };
@@ -125,7 +125,7 @@ struct redux_novec_unroller<Func, Evaluator, Start, 1> {
 template <typename Func, typename Evaluator, Index Start>
 struct redux_novec_unroller<Func, Evaluator, Start, 0> {
   typedef typename Evaluator::Scalar Scalar;
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 
 template <typename Func, typename Evaluator, Index Start, Index Length>
@@ -134,7 +134,7 @@ struct redux_novec_linear_unroller {
 
   typedef typename Evaluator::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func) {
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func) {
     return func(redux_novec_linear_unroller<Func, Evaluator, Start, HalfLength>::run(eval, func),
                 redux_novec_linear_unroller<Func, Evaluator, Start + HalfLength, Length - HalfLength>::run(eval, func));
   }
@@ -144,7 +144,7 @@ template <typename Func, typename Evaluator, Index Start>
 struct redux_novec_linear_unroller<Func, Evaluator, Start, 1> {
   typedef typename Evaluator::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func&) {
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func&) {
     return eval.coeff(Start);
   }
 };
@@ -155,7 +155,7 @@ struct redux_novec_linear_unroller<Func, Evaluator, Start, 1> {
 template <typename Func, typename Evaluator, Index Start>
 struct redux_novec_linear_unroller<Func, Evaluator, Start, 0> {
   typedef typename Evaluator::Scalar Scalar;
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 
 /*** vectorization ***/
@@ -367,7 +367,7 @@ struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 
   template <typename XprType>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func, const XprType& xpr) {
-    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+    EIGEN_ONLY_USED_FOR_DEBUG(xpr);
     eigen_assert(xpr.rows() > 0 && xpr.cols() > 0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
       Scalar res = func.predux(
@@ -398,8 +398,8 @@ class redux_evaluator : public internal::evaluator<XprType_> {
   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
-    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime
-    // from the evaluator
+    // TODO: we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at
+    // runtime from the evaluator
     Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
@@ -432,7 +432,7 @@ class redux_evaluator : public internal::evaluator<XprType_> {
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
  *
  * The template parameter \a BinaryOp is the type of the functor \a func which must be
- * an associative operator. Both current C++98 and C++11 functor styles are handled.
+ * an associative operator.
  *
  * \warning the matrix must be not empty, otherwise an assertion is triggered.
  *
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 30ec277d06e..4493441d036 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -43,7 +43,7 @@ struct traits<Ref<PlainObjectType_, Options_, StrideType_> >
       OuterStrideMatch = IsVectorAtCompileTime || int(OuterStrideAtCompileTime) == int(Dynamic) ||
                          int(OuterStrideAtCompileTime) == int(Derived::OuterStrideAtCompileTime),
       // NOTE, this indirection of evaluator<Derived>::Alignment is needed
-      // to workaround a very strange bug in MSVC related to the instantiation
+      // to work around an MSVC bug related to the instantiation
       // of has_*ary_operator in evaluator<CwiseNullaryOp>.
       // This line is surprisingly very sensitive. For instance, simply adding parenthesis
       // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
@@ -265,7 +265,7 @@ class Ref : public RefBase<Ref<PlainObjectType, Options, StrideType> > {
  private:
   typedef internal::traits<Ref> Traits;
   template <typename Derived>
-  EIGEN_DEVICE_FUNC inline Ref(
+  EIGEN_DEVICE_FUNC constexpr inline Ref(
       const PlainObjectBase<Derived>& expr,
       std::enable_if_t<bool(Traits::template match<Derived>::MatchAtCompileTime), Derived>* = 0);
 
@@ -275,17 +275,17 @@ class Ref : public RefBase<Ref<PlainObjectType, Options, StrideType> > {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename Derived>
-  EIGEN_DEVICE_FUNC inline Ref(
+  EIGEN_DEVICE_FUNC constexpr inline Ref(
       PlainObjectBase<Derived>& expr,
       std::enable_if_t<bool(Traits::template match<Derived>::MatchAtCompileTime), Derived>* = 0) {
     EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
     // Construction must pass since we will not create temporary storage in the non-const case.
     const bool success = Base::construct(expr.derived());
-    EIGEN_UNUSED_VARIABLE(success)
+    EIGEN_UNUSED_VARIABLE(success);
     eigen_assert(success);
   }
   template <typename Derived>
-  EIGEN_DEVICE_FUNC inline Ref(
+  EIGEN_DEVICE_FUNC constexpr inline Ref(
       const DenseBase<Derived>& expr,
       std::enable_if_t<bool(Traits::template match<Derived>::MatchAtCompileTime), Derived>* = 0)
 #else
@@ -299,7 +299,7 @@ class Ref : public RefBase<Ref<PlainObjectType, Options, StrideType> > {
     EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase, THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
     // Construction must pass since we will not create temporary storage in the non-const case.
     const bool success = Base::construct(expr.const_cast_derived());
-    EIGEN_UNUSED_VARIABLE(success)
+    EIGEN_UNUSED_VARIABLE(success);
     eigen_assert(success);
   }
 
@@ -327,8 +327,9 @@ class Ref<const TPlainObjectType, Options, StrideType>
   EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
 
   template <typename Derived>
-  EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
-                               std::enable_if_t<bool(Traits::template match<Derived>::ScalarTypeMatch), Derived>* = 0) {
+  EIGEN_DEVICE_FUNC constexpr inline Ref(
+      const DenseBase<Derived>& expr,
+      std::enable_if_t<bool(Traits::template match<Derived>::ScalarTypeMatch), Derived>* = 0) {
     //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << ","
     //      << match_helper<Derived>::InnerStrideMatch << "\n"; std::cout << int(StrideType::OuterStrideAtCompileTime)
     //      << " - " << int(Derived::OuterStrideAtCompileTime) << "\n"; std::cout <<
@@ -338,11 +339,11 @@ class Ref<const TPlainObjectType, Options, StrideType>
     construct(expr.derived(), typename Traits::template match<Derived>::type());
   }
 
-  EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
+  EIGEN_DEVICE_FUNC constexpr inline Ref(const Ref& other) : Base(other) {
     // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
   }
 
-  EIGEN_DEVICE_FUNC inline Ref(Ref&& other) {
+  EIGEN_DEVICE_FUNC constexpr inline Ref(Ref&& other) {
     if (other.data() == other.m_object.data()) {
       m_object = std::move(other.m_object);
       Base::construct(m_object);
@@ -351,7 +352,7 @@ class Ref<const TPlainObjectType, Options, StrideType>
   }
 
   template <typename OtherRef>
-  EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
+  EIGEN_DEVICE_FUNC constexpr inline Ref(const RefBase<OtherRef>& other) {
     EIGEN_STATIC_ASSERT(Traits::template match<OtherRef>::type::value || may_map_m_object_successfully,
                         STORAGE_LAYOUT_DOES_NOT_MATCH);
     construct(other.derived(), typename Traits::template match<OtherRef>::type());
@@ -370,7 +371,7 @@ class Ref<const TPlainObjectType, Options, StrideType>
   EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type) {
     internal::call_assignment_no_alias(m_object, expr, internal::assign_op<Scalar, Scalar>());
     const bool success = Base::construct(m_object);
-    EIGEN_ONLY_USED_FOR_DEBUG(success)
+    EIGEN_ONLY_USED_FOR_DEBUG(success);
     eigen_assert(success);
   }
 
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 34150452278..9bdc725c796 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -30,7 +30,7 @@ struct traits<Replicate<MatrixType, RowFactor, ColFactor> > : traits<MatrixType>
     ColsAtCompileTime = ColFactor == Dynamic || int(MatrixType::ColsAtCompileTime) == Dynamic
                             ? Dynamic
                             : ColFactor * MatrixType::ColsAtCompileTime,
-    // FIXME we don't propagate the max sizes !!!
+    // FIXME: propagate MaxRowsAtCompileTime and MaxColsAtCompileTime.
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
     IsRowMajor = MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1   ? 1
@@ -38,7 +38,7 @@ struct traits<Replicate<MatrixType, RowFactor, ColFactor> > : traits<MatrixType>
                  : (MatrixType::Flags & RowMajorBit)                      ? 1
                                                                           : 0,
 
-    // FIXME enable DirectAccess with negative strides?
+    // FIXME: consider enabling DirectAccess with negative strides.
     Flags = IsRowMajor ? RowMajorBit : 0
   };
 };
@@ -71,7 +71,7 @@ class Replicate : public internal::dense_xpr_base<Replicate<MatrixType, RowFacto
   typedef internal::remove_all_t<MatrixType> NestedExpression;
 
   template <typename OriginalMatrixType>
-  EIGEN_DEVICE_FUNC inline explicit Replicate(const OriginalMatrixType& matrix)
+  EIGEN_DEVICE_FUNC constexpr inline explicit Replicate(const OriginalMatrixType& matrix)
       : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor) {
     EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<MatrixType>, OriginalMatrixType>::value),
                         THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -79,7 +79,7 @@ class Replicate : public internal::dense_xpr_base<Replicate<MatrixType, RowFacto
   }
 
   template <typename OriginalMatrixType>
-  EIGEN_DEVICE_FUNC inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+  EIGEN_DEVICE_FUNC constexpr inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
       : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor) {
     EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<MatrixType>, OriginalMatrixType>::value),
                         THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -88,7 +88,7 @@ class Replicate : public internal::dense_xpr_base<Replicate<MatrixType, RowFacto
   EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
   EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
-  EIGEN_DEVICE_FUNC const MatrixTypeNested_& nestedExpression() const { return m_matrix; }
+  EIGEN_DEVICE_FUNC constexpr const MatrixTypeNested_& nestedExpression() const { return m_matrix; }
 
  protected:
   MatrixTypeNested m_matrix;
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index 22acdc0bc52..c3b4d56ea52 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -107,7 +107,7 @@ class Reshaped : public ReshapedImpl<XprType, Rows, Cols, Order, typename intern
 
   /** Fixed-size constructor
    */
-  EIGEN_DEVICE_FUNC inline Reshaped(XprType& xpr) : Impl(xpr) {
+  EIGEN_DEVICE_FUNC constexpr inline Reshaped(XprType& xpr) : Impl(xpr) {
     EIGEN_STATIC_ASSERT(RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic,
                         THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
     eigen_assert(Rows * Cols == xpr.rows() * xpr.cols());
@@ -115,7 +115,7 @@ class Reshaped : public ReshapedImpl<XprType, Rows, Cols, Order, typename intern
 
   /** Dynamic-size constructor
    */
-  EIGEN_DEVICE_FUNC inline Reshaped(XprType& xpr, Index reshapeRows, Index reshapeCols)
+  EIGEN_DEVICE_FUNC constexpr inline Reshaped(XprType& xpr, Index reshapeRows, Index reshapeCols)
       : Impl(xpr, reshapeRows, reshapeCols) {
     eigen_assert((RowsAtCompileTime == Dynamic || RowsAtCompileTime == reshapeRows) &&
                  (ColsAtCompileTime == Dynamic || ColsAtCompileTime == reshapeCols));
@@ -136,8 +136,8 @@ class ReshapedImpl<XprType, Rows, Cols, Order, Dense>
  public:
   typedef Impl Base;
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl)
-  EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
-  EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
+  EIGEN_DEVICE_FUNC constexpr inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
+  EIGEN_DEVICE_FUNC constexpr inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
       : Impl(xpr, reshapeRows, reshapeCols) {}
 };
 
@@ -161,15 +161,15 @@ class ReshapedImpl_dense<XprType, Rows, Cols, Order, false>
 
   /** Fixed-size constructor
    */
-  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr) : m_xpr(xpr), m_rows(Rows), m_cols(Cols) {}
+  EIGEN_DEVICE_FUNC constexpr inline ReshapedImpl_dense(XprType& xpr) : m_xpr(xpr), m_rows(Rows), m_cols(Cols) {}
 
   /** Dynamic-size constructor
    */
-  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+  EIGEN_DEVICE_FUNC constexpr inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
       : m_xpr(xpr), m_rows(nRows), m_cols(nCols) {}
 
-  EIGEN_DEVICE_FUNC Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
 
 #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \sa MapBase::data() */
@@ -179,10 +179,10 @@ class ReshapedImpl_dense<XprType, Rows, Cols, Order, false>
 #endif
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC const internal::remove_all_t<XprType>& nestedExpression() const { return m_xpr; }
+  EIGEN_DEVICE_FUNC constexpr const internal::remove_all_t<XprType>& nestedExpression() const { return m_xpr; }
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC std::remove_reference_t<XprType>& nestedExpression() { return m_xpr; }
+  EIGEN_DEVICE_FUNC constexpr std::remove_reference_t<XprType>& nestedExpression() { return m_xpr; }
 
  protected:
   MatrixTypeNested m_xpr;
@@ -203,16 +203,16 @@ class ReshapedImpl_dense<XprType, Rows, Cols, Order, true> : public MapBase<Resh
 
   /** Fixed-size constructor
    */
-  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr) : Base(xpr.data()), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC constexpr inline ReshapedImpl_dense(XprType& xpr) : Base(xpr.data()), m_xpr(xpr) {}
 
   /** Dynamic-size constructor
    */
-  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+  EIGEN_DEVICE_FUNC constexpr inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
       : Base(xpr.data(), nRows, nCols), m_xpr(xpr) {}
 
-  EIGEN_DEVICE_FUNC const internal::remove_all_t<XprTypeNested>& nestedExpression() const { return m_xpr; }
+  EIGEN_DEVICE_FUNC constexpr const internal::remove_all_t<XprTypeNested>& nestedExpression() const { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; }
+  EIGEN_DEVICE_FUNC constexpr XprType& nestedExpression() { return m_xpr; }
 
   /** \sa MapBase::innerStride() */
   EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return m_xpr.innerStride(); }
@@ -265,7 +265,7 @@ struct evaluator<Reshaped<ArgType, Rows, Cols, Order> >
     Alignment = evaluator<ArgType>::Alignment
   };
   typedef reshaped_evaluator<ArgType, Rows, Cols, Order, HasDirectAccess> reshaped_evaluator_type;
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr) {
+  EIGEN_DEVICE_FUNC constexpr explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 };
@@ -283,7 +283,8 @@ struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ fals
     Alignment = 0
   };
 
-  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) {
+  EIGEN_DEVICE_FUNC constexpr explicit reshaped_evaluator(const XprType& xpr)
+      : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
@@ -292,7 +293,7 @@ struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ fals
 
   typedef std::pair<Index, Index> RowCol;
 
-  EIGEN_DEVICE_FUNC inline RowCol index_remap(Index rowId, Index colId) const {
+  EIGEN_DEVICE_FUNC constexpr inline RowCol index_remap(Index rowId, Index colId) const {
     if (Order == ColMajor) {
       const Index nth_elem_idx = colId * m_xpr.rows() + rowId;
       return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(), nth_elem_idx / m_xpr.nestedExpression().rows());
@@ -302,74 +303,38 @@ struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ fals
     }
   }
 
-  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) {
+  EIGEN_DEVICE_FUNC constexpr inline Scalar& coeffRef(Index rowId, Index colId) {
     EIGEN_STATIC_ASSERT_LVALUE(XprType)
     const RowCol row_col = index_remap(rowId, colId);
     return m_argImpl.coeffRef(row_col.first, row_col.second);
   }
 
-  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
+  EIGEN_DEVICE_FUNC constexpr inline const Scalar& coeffRef(Index rowId, Index colId) const {
     const RowCol row_col = index_remap(rowId, colId);
     return m_argImpl.coeffRef(row_col.first, row_col.second);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const {
     const RowCol row_col = index_remap(rowId, colId);
     return m_argImpl.coeff(row_col.first, row_col.second);
   }
 
-  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC constexpr inline Scalar& coeffRef(Index index) {
     EIGEN_STATIC_ASSERT_LVALUE(XprType)
     const RowCol row_col = index_remap(Rows == 1 ? 0 : index, Rows == 1 ? index : 0);
     return m_argImpl.coeffRef(row_col.first, row_col.second);
   }
 
-  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr inline const Scalar& coeffRef(Index index) const {
     const RowCol row_col = index_remap(Rows == 1 ? 0 : index, Rows == 1 ? index : 0);
     return m_argImpl.coeffRef(row_col.first, row_col.second);
   }
 
-  EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC constexpr inline const CoeffReturnType coeff(Index index) const {
     const RowCol row_col = index_remap(Rows == 1 ? 0 : index, Rows == 1 ? index : 0);
     return m_argImpl.coeff(row_col.first, row_col.second);
   }
-#if 0
-  EIGEN_DEVICE_FUNC
-  template<int LoadMode>
-  inline PacketScalar packet(Index rowId, Index colId) const
-  {
-    const RowCol row_col = index_remap(rowId, colId);
-    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
-
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC
-  inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-  {
-    const RowCol row_col = index_remap(rowId, colId);
-    m_argImpl.const_cast_derived().template writePacket<Unaligned>
-            (row_col.first, row_col.second, val);
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC
-  inline PacketScalar packet(Index index) const
-  {
-    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
-                                        RowsAtCompileTime == 1 ? index : 0);
-    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
-  }
 
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC
-  inline void writePacket(Index index, const PacketScalar& val)
-  {
-    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
-                                        RowsAtCompileTime == 1 ? index : 0);
-    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second, val);
-  }
-#endif
  protected:
   evaluator<ArgType> m_argImpl;
   const XprType& m_xpr;
@@ -382,10 +347,8 @@ struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ true
   typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
   typedef typename XprType::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC constexpr explicit reshaped_evaluator(const XprType& xpr)
       : mapbase_evaluator<XprType, typename XprType::PlainObject>(xpr) {
-    // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta
-    // lifetime
     eigen_assert(((std::uintptr_t(xpr.data()) % plain_enum_max(1, evaluator<XprType>::Alignment)) == 0) &&
                  "data is not aligned");
   }
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 892c193bd31..410b77d2e2d 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -23,7 +23,7 @@ struct traits<ReturnByValue<Derived> > : public traits<typename traits<Derived>:
   enum {
     // We're disabling the DirectAccess because e.g. the constructor of
     // the Block-with-DirectAccess expression requires to have a coeffRef method.
-    // Also, we don't want to have to implement the stride stuff.
+    // Also, this avoids having to implement stride support.
     Flags = (traits<typename traits<Derived>::ReturnType>::Flags | EvalBeforeNestingBit) & ~DirectAccessBit
   };
 };
@@ -32,7 +32,7 @@ struct traits<ReturnByValue<Derived> > : public traits<typename traits<Derived>:
  * So the only way that nesting it in an expression can work, is by evaluating it into a plain matrix.
  * So internal::nested always gives the plain return matrix type.
  *
- * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
+ * FIXME: this specialization may be redundant with EvalBeforeNestingBit.
  * Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
  */
 template <typename Derived, int n, typename PlainObject>
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index d11ba16708b..a4af8d11598 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -83,7 +83,7 @@ class Reverse : public internal::dense_xpr_base<Reverse<MatrixType, Direction> >
   typedef internal::reverse_packet_cond<PacketScalar, ReversePacket> reverse_packet;
 
  public:
-  EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) {}
+  EIGEN_DEVICE_FUNC constexpr explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) {}
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
@@ -92,7 +92,7 @@ class Reverse : public internal::dense_xpr_base<Reverse<MatrixType, Direction> >
 
   EIGEN_DEVICE_FUNC inline Index innerStride() const { return -m_matrix.innerStride(); }
 
-  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename MatrixType::Nested>& nestedExpression() const {
+  EIGEN_DEVICE_FUNC constexpr const internal::remove_all_t<typename MatrixType::Nested>& nestedExpression() const {
     return m_matrix;
   }
 
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index 0fa5f1e178e..6ad290f23a6 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -15,7 +15,7 @@
 
 namespace Eigen {
 
-/** \class Select
+/** \typedef Select
  * \ingroup Core_Module
  *
  * \brief Expression of a coefficient wise version of the C++ ternary operator ?:
@@ -24,73 +24,16 @@ namespace Eigen {
  * \tparam ThenMatrixType the type of the \em then expression
  * \tparam ElseMatrixType the type of the \em else expression
  *
- * This class represents an expression of a coefficient wise version of the C++ ternary operator ?:.
+ * This type represents an expression of a coefficient wise version of the C++ ternary operator ?:.
  * It is the return type of DenseBase::select() and most of the time this is the only way it is used.
  *
  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const
  */
-
-namespace internal {
-template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > : traits<ThenMatrixType> {
-  typedef typename traits<ThenMatrixType>::Scalar Scalar;
-  typedef Dense StorageKind;
-  typedef typename traits<ThenMatrixType>::XprKind XprKind;
-  typedef typename ConditionMatrixType::Nested ConditionMatrixNested;
-  typedef typename ThenMatrixType::Nested ThenMatrixNested;
-  typedef typename ElseMatrixType::Nested ElseMatrixNested;
-  enum {
-    RowsAtCompileTime = ConditionMatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
-  };
-};
-}  // namespace internal
-
 template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select : public internal::dense_xpr_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
-               internal::no_assignment_operator {
- public:
-  typedef typename internal::dense_xpr_base<Select>::type Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(Select)
-
-  inline EIGEN_DEVICE_FUNC Select(const ConditionMatrixType& a_conditionMatrix, const ThenMatrixType& a_thenMatrix,
-                                  const ElseMatrixType& a_elseMatrix)
-      : m_condition(a_conditionMatrix), m_then(a_thenMatrix), m_else(a_elseMatrix) {
-    eigen_assert(m_condition.rows() == m_then.rows() && m_condition.rows() == m_else.rows());
-    eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
-  }
-
-  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_condition.rows(); }
-  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_condition.cols(); }
-
-  inline EIGEN_DEVICE_FUNC const Scalar coeff(Index i, Index j) const {
-    if (m_condition.coeff(i, j))
-      return m_then.coeff(i, j);
-    else
-      return m_else.coeff(i, j);
-  }
-
-  inline EIGEN_DEVICE_FUNC const Scalar coeff(Index i) const {
-    if (m_condition.coeff(i))
-      return m_then.coeff(i);
-    else
-      return m_else.coeff(i);
-  }
-
-  inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const { return m_condition; }
-
-  inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const { return m_then; }
-
-  inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const { return m_else; }
-
- protected:
-  typename ConditionMatrixType::Nested m_condition;
-  typename ThenMatrixType::Nested m_then;
-  typename ElseMatrixType::Nested m_else;
-};
+using Select = CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ThenMatrixType>::Scalar,
+                                                                 typename DenseBase<ElseMatrixType>::Scalar,
+                                                                 typename DenseBase<ConditionMatrixType>::Scalar>,
+                              ThenMatrixType, ElseMatrixType, ConditionMatrixType>;
 
 /** \returns a matrix where each coefficient (i,j) is equal to \a thenMatrix(i,j)
  * if \c *this(i,j) != Scalar(0), and \a elseMatrix(i,j) otherwise.
@@ -98,57 +41,50 @@ class Select : public internal::dense_xpr_base<Select<ConditionMatrixType, ThenM
  * Example: \include MatrixBase_select.cpp
  * Output: \verbinclude MatrixBase_select.out
  *
- * \sa DenseBase::bitwiseSelect(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&)
+ * \sa typedef Select
  */
 template <typename Derived>
 template <typename ThenDerived, typename ElseDerived>
-inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
+inline EIGEN_DEVICE_FUNC constexpr CwiseTernaryOp<
     internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar, typename DenseBase<ElseDerived>::Scalar,
                                        typename DenseBase<Derived>::Scalar>,
     ThenDerived, ElseDerived, Derived>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix, const DenseBase<ElseDerived>& elseMatrix) const {
-  using Op = internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
-                                                typename DenseBase<ElseDerived>::Scalar, Scalar>;
-  return CwiseTernaryOp<Op, ThenDerived, ElseDerived, Derived>(thenMatrix.derived(), elseMatrix.derived(), derived(),
-                                                               Op());
+  return Select<Derived, ThenDerived, ElseDerived>(thenMatrix.derived(), elseMatrix.derived(), derived());
 }
 /** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
  * the \em else expression being a scalar value.
  *
- * \sa DenseBase::booleanSelect(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+ * \sa typedef Select
  */
 template <typename Derived>
 template <typename ThenDerived>
-inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
+inline EIGEN_DEVICE_FUNC constexpr CwiseTernaryOp<
     internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar, typename DenseBase<ThenDerived>::Scalar,
                                        typename DenseBase<Derived>::Scalar>,
     ThenDerived, typename DenseBase<ThenDerived>::ConstantReturnType, Derived>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                            const typename DenseBase<ThenDerived>::Scalar& elseScalar) const {
   using ElseConstantType = typename DenseBase<ThenDerived>::ConstantReturnType;
-  using Op = internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
-                                                typename DenseBase<ThenDerived>::Scalar, Scalar>;
-  return CwiseTernaryOp<Op, ThenDerived, ElseConstantType, Derived>(
-      thenMatrix.derived(), ElseConstantType(rows(), cols(), elseScalar), derived(), Op());
+  return Select<Derived, ThenDerived, ElseConstantType>(thenMatrix.derived(),
+                                                        ElseConstantType(rows(), cols(), elseScalar), derived());
 }
 /** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
  * the \em then expression being a scalar value.
  *
- * \sa DenseBase::booleanSelect(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+ * \sa typedef Select
  */
 template <typename Derived>
 template <typename ElseDerived>
-inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
+inline EIGEN_DEVICE_FUNC constexpr CwiseTernaryOp<
     internal::scalar_boolean_select_op<typename DenseBase<ElseDerived>::Scalar, typename DenseBase<ElseDerived>::Scalar,
                                        typename DenseBase<Derived>::Scalar>,
     typename DenseBase<ElseDerived>::ConstantReturnType, ElseDerived, Derived>
 DenseBase<Derived>::select(const typename DenseBase<ElseDerived>::Scalar& thenScalar,
                            const DenseBase<ElseDerived>& elseMatrix) const {
   using ThenConstantType = typename DenseBase<ElseDerived>::ConstantReturnType;
-  using Op = internal::scalar_boolean_select_op<typename DenseBase<ElseDerived>::Scalar,
-                                                typename DenseBase<ElseDerived>::Scalar, Scalar>;
-  return CwiseTernaryOp<Op, ThenConstantType, ElseDerived, Derived>(ThenConstantType(rows(), cols(), thenScalar),
-                                                                    elseMatrix.derived(), derived(), Op());
+  return Select<Derived, ThenConstantType, ElseDerived>(ThenConstantType(rows(), cols(), thenScalar),
+                                                        elseMatrix.derived(), derived());
 }
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 16f0e7513d0..59731b85667 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -114,6 +114,12 @@ class SelfAdjointView : public TriangularBase<SelfAdjointView<MatrixType_, UpLo>
     return Product<OtherDerived, SelfAdjointView>(lhs.derived(), rhs);
   }
 
+  EIGEN_DEVICE_FUNC const
+      SelfAdjointView<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(MatrixType, Scalar, product), UpLo>
+      operator*(const Scalar& s) const {
+    return (nestedExpression() * s).template selfadjointView<UpLo>();
+  }
+
   friend EIGEN_DEVICE_FUNC const
       SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar, MatrixType, product), UpLo>
       operator*(const Scalar& s, const SelfAdjointView& mat) {
@@ -217,10 +223,32 @@ class SelfAdjointView : public TriangularBase<SelfAdjointView<MatrixType_, UpLo>
     return typename MatrixType::ConstDiagonalReturnType(m_matrix);
   }
 
+  /** \returns the matrix 1-norm (maximum absolute column sum) of the implicit
+   * full self-adjoint matrix, reading only the stored triangle. For Hermitian
+   * (complex) scalars the unstored entries are conjugates of stored ones, and
+   * since |conj(x)| = |x| the result matches the induced 1-norm of the full matrix.
+   */
+  EIGEN_DEVICE_FUNC typename NumTraits<Scalar>::Real l1Norm() const {
+    typedef typename NumTraits<Scalar>::Real RealScalar_;
+    RealScalar_ norm = RealScalar_(0);
+    const Index n = m_matrix.rows();
+    for (Index col = 0; col < n; ++col) {
+      RealScalar_ abs_col_sum;
+      if (UpLo == Lower)
+        abs_col_sum =
+            m_matrix.col(col).tail(n - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+      else
+        abs_col_sum =
+            m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(n - col).template lpNorm<1>();
+      if (abs_col_sum > norm) norm = abs_col_sum;
+    }
+    return norm;
+  }
+
   /////////// Cholesky module ///////////
 
-  const LLT<PlainObject, UpLo> llt() const;
-  const LDLT<PlainObject, UpLo> ldlt() const;
+  LLT<PlainObject, UpLo> llt() const;
+  LDLT<PlainObject, UpLo> ldlt() const;
 
   /////////// Eigenvalue module ///////////
 
@@ -236,14 +264,6 @@ class SelfAdjointView : public TriangularBase<SelfAdjointView<MatrixType_, UpLo>
   MatrixTypeNested m_matrix;
 };
 
-// template<typename OtherDerived, typename MatrixType, unsigned int UpLo>
-// internal::selfadjoint_matrix_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >
-// operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView<MatrixType,UpLo>& rhs)
-// {
-//   return internal::matrix_selfadjoint_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo>
-//   >(lhs.derived(),rhs);
-// }
-
 // selfadjoint to dense matrix
 
 namespace internal {
@@ -288,6 +308,14 @@ class triangular_dense_assignment_kernel<UpLo, SelfAdjoint, SetOpposite, DstEval
     m_functor.assignCoeff(m_dst.coeffRef(col, row), numext::conj(tmp));
   }
 
+  // Override to ensure the SelfAdjoint assignCoeff (which mirrors conjugates) is called,
+  // not the base class version (which is a plain copy).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner) {
+    Index row = Base::rowIndexByOuterInner(outer, inner);
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignCoeff(row, col);
+  }
+
   EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id) { Base::assignCoeff(id, id); }
 
   EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index) { eigen_internal_assert(false && "should never be called"); }
@@ -302,7 +330,7 @@ class triangular_dense_assignment_kernel<UpLo, SelfAdjoint, SetOpposite, DstEval
 /** This is the const version of MatrixBase::selfadjointView() */
 template <typename Derived>
 template <unsigned int UpLo>
-EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC constexpr typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const {
   return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }
@@ -319,7 +347,7 @@ MatrixBase<Derived>::selfadjointView() const {
  */
 template <typename Derived>
 template <unsigned int UpLo>
-EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC constexpr typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() {
   return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
 }
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index f73ceb4007c..a887ad2ce4d 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -15,19 +15,33 @@
 
 namespace Eigen {
 
-// TODO generalize the scalar type of 'other'
+template <typename Derived>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other) {
+  using ConstantExpr = typename internal::plain_constant_type<Derived, Scalar>::type;
+  using Op = internal::mul_assign_op<Scalar>;
+  internal::call_assignment(derived(), ConstantExpr(rows(), cols(), other), Op());
+  return derived();
+}
+
+template <typename Derived>
+template <bool Enable, typename>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const RealScalar& other) {
+  realView() *= other;
+  return derived();
+}
 
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other) {
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
-                            internal::mul_assign_op<Scalar, Scalar>());
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) {
+  using ConstantExpr = typename internal::plain_constant_type<Derived, Scalar>::type;
+  using Op = internal::div_assign_op<Scalar>;
+  internal::call_assignment(derived(), ConstantExpr(rows(), cols(), other), Op());
   return derived();
 }
 
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) {
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
-                            internal::div_assign_op<Scalar, Scalar>());
+template <bool Enable, typename>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const RealScalar& other) {
+  realView() /= other;
   return derived();
 }
 
diff --git a/Eigen/src/Core/SkewSymmetricMatrix3.h b/Eigen/src/Core/SkewSymmetricMatrix3.h
index 3545afc76cd..72b65e17aa8 100644
--- a/Eigen/src/Core/SkewSymmetricMatrix3.h
+++ b/Eigen/src/Core/SkewSymmetricMatrix3.h
@@ -62,7 +62,7 @@ class SkewSymmetricBase : public EigenBase<Derived> {
   /**
    * Constructs a dense matrix from \c *this. Note, this directly returns a dense matrix type,
    * not an expression.
-   * \returns A dense matrix, with its entries set from the the derived object. */
+   * \returns A dense matrix, with its entries set from the derived object. */
   EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); }
 
   /** Determinant vanishes */
@@ -159,6 +159,61 @@ class SkewSymmetricBase : public EigenBase<Derived> {
       const SkewSymmetricBase<OtherDerived>& other) const {
     return (vector() - other.vector()).asSkewSymmetric();
   }
+
+  // Return type of dense +/- skew. Scalar follows ScalarBinaryOpTraits, so
+  // mixed-scalar cases go through the same promotion machinery as the rest of
+  // Eigen (and are rejected at compile time unless a user-provided
+  // specialization makes them valid). Shape is always 3x3.
+  template <typename OtherDerived, typename BinaryOp>
+  using DenseSkewBinaryReturnType = Matrix<
+      typename ScalarBinaryOpTraits<typename internal::traits<OtherDerived>::Scalar, Scalar, BinaryOp>::ReturnType, 3,
+      3>;
+
+  template <typename OtherDerived>
+  using DenseSkewSumReturnType =
+      DenseSkewBinaryReturnType<OtherDerived,
+                                internal::scalar_sum_op<typename internal::traits<OtherDerived>::Scalar, Scalar>>;
+
+  template <typename OtherDerived>
+  using DenseSkewDifferenceReturnType = DenseSkewBinaryReturnType<
+      OtherDerived, internal::scalar_difference_op<typename internal::traits<OtherDerived>::Scalar, Scalar>>;
+
+  /** \returns the sum of a dense matrix \a lhs and the skew symmetric matrix \a rhs as a dense matrix.
+   *
+   * The LHS must be 3x3 at compile time (or Dynamic, which is checked at runtime by the conversion).
+   * Only the skew side is materialized via \c toDenseMatrix(); the LHS remains a lazy expression
+   * until the enclosing assignment. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE DenseSkewSumReturnType<OtherDerived> operator+(
+      const MatrixBase<OtherDerived>& lhs, const SkewSymmetricBase& rhs) {
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(OtherDerived, DenseMatrixType);
+    return lhs.derived() + rhs.toDenseMatrix();
+  }
+
+  /** \returns the sum of the skew symmetric matrix \a lhs and a dense matrix \a rhs as a dense matrix.
+   *
+   * Sum is commutative, so this forwards to the \c dense+skew overload. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE DenseSkewSumReturnType<OtherDerived> operator+(
+      const SkewSymmetricBase& lhs, const MatrixBase<OtherDerived>& rhs) {
+    return rhs + lhs;
+  }
+
+  /** \returns the difference of a dense matrix \a lhs and the skew symmetric matrix \a rhs as a dense matrix. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE DenseSkewDifferenceReturnType<OtherDerived> operator-(
+      const MatrixBase<OtherDerived>& lhs, const SkewSymmetricBase& rhs) {
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(OtherDerived, DenseMatrixType);
+    return lhs.derived() - rhs.toDenseMatrix();
+  }
+
+  /** \returns the difference of the skew symmetric matrix \a lhs and a dense matrix \a rhs as a dense matrix. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE DenseSkewDifferenceReturnType<OtherDerived> operator-(
+      const SkewSymmetricBase& lhs, const MatrixBase<OtherDerived>& rhs) {
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(OtherDerived, DenseMatrixType);
+    return lhs.toDenseMatrix() - rhs.derived();
+  }
 };
 
 /** \class SkewSymmetricMatrix3
@@ -308,7 +363,7 @@ class SkewSymmetricWrapper : public SkewSymmetricBase<SkewSymmetricWrapper<SkewS
  * \sa class SkewSymmetricWrapper, class SkewSymmetricMatrix3, vector(), isSkewSymmetric()
  **/
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline const SkewSymmetricWrapper<const Derived> MatrixBase<Derived>::asSkewSymmetric() const {
+EIGEN_DEVICE_FUNC constexpr const SkewSymmetricWrapper<const Derived> MatrixBase<Derived>::asSkewSymmetric() const {
   return SkewSymmetricWrapper<const Derived>(derived());
 }
 
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index aa51410001a..030b934600e 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -69,8 +69,8 @@ class Solve : public SolveImpl<Decomposition, RhsType, typename internal::traits
   EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
   EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
-  EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
-  EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
+  EIGEN_DEVICE_FUNC constexpr const Decomposition &dec() const { return m_dec; }
+  EIGEN_DEVICE_FUNC constexpr const RhsType &rhs() const { return m_rhs; }
 
  protected:
   const Decomposition &m_dec;
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 9d318742272..684f7a55eb0 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -53,10 +53,11 @@ struct triangular_solver_selector<Lhs, Rhs, Side, Mode, NoUnrolling, 1> {
   typedef typename Lhs::Scalar LhsScalar;
   typedef typename Rhs::Scalar RhsScalar;
   typedef blas_traits<Lhs> LhsProductTraits;
-  typedef typename LhsProductTraits::ExtractType ActualLhsType;
+  typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
+  typedef remove_all_t<ActualLhsType> ActualLhsTypeCleaned;
   typedef Map<Matrix<RhsScalar, Dynamic, 1>, Aligned> MappedRhs;
   static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) {
-    ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
+    add_const_on_value_type_t<ActualLhsType> actualLhs = LhsProductTraits::extract(lhs);
 
     // FIXME find a way to allow an inner stride if packet_traits<Scalar>::size==1
 
@@ -67,10 +68,11 @@ struct triangular_solver_selector<Lhs, Rhs, Side, Mode, NoUnrolling, 1> {
     if (!useRhsDirectly) MappedRhs(actualRhs, rhs.size()) = rhs;
 
     triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
-                            (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>::run(actualLhs.cols(),
-                                                                                        actualLhs.data(),
-                                                                                        actualLhs.outerStride(),
-                                                                                        actualRhs);
+                            (int(ActualLhsTypeCleaned::Flags) & RowMajorBit) ? RowMajor
+                                                                             : ColMajor>::run(actualLhs.cols(),
+                                                                                              actualLhs.data(),
+                                                                                              actualLhs.outerStride(),
+                                                                                              actualRhs);
 
     if (!useRhsDirectly) rhs = MappedRhs(actualRhs, rhs.size());
   }
@@ -181,11 +183,15 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::solveInPlace
   if (derived().cols() == 0) return;
 
   enum {
-    copy = (internal::traits<OtherDerived>::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime &&
-           OtherDerived::SizeAtCompileTime != 1
+    OtherFlags = internal::traits<OtherDerived>::Flags,
+    IsRowMajorVector =
+        (OtherFlags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime != 1,
+    copy = IsRowMajorVector || ((OtherFlags & DirectAccessBit) == 0)
   };
-  typedef std::conditional_t<copy, typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>
-      OtherCopy;
+  typedef std::conditional_t<IsRowMajorVector, typename internal::plain_matrix_type_column_major<OtherDerived>::type,
+                             typename internal::plain_matrix_type<OtherDerived>::type>
+      OtherPlainObject;
+  typedef std::conditional_t<copy, OtherPlainObject, OtherDerived&> OtherCopy;
   OtherCopy otherCopy(other);
 
   internal::triangular_solver_selector<MatrixType, std::remove_reference_t<OtherCopy>, Side, Mode>::run(
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 5a6dfd425df..af91bb8fb03 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -102,16 +102,14 @@ class SolverBase : public EigenBase<Derived> {
   };
 
   /** Default constructor */
-  SolverBase() {}
-
-  ~SolverBase() {}
+  SolverBase() = default;
 
   using Base::derived;
 
   /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
    */
   template <typename Rhs>
-  inline const Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const {
+  inline Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const {
     internal::solve_assertion<internal::remove_all_t<Derived>>::template run<false>(derived(), b);
     return Solve<Derived, Rhs>(derived(), b.derived());
   }
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 711ee3fb474..11abb77b0d0 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -40,8 +40,7 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
     scale = maxCoeff;
   }
 
-  // TODO if the maxCoeff is much much smaller than the current scale,
-  // then we can neglect this sub vector
+  // TODO: skip sub-vector when maxCoeff << current scale.
   if (scale > Scalar(0))  // if scale==0, then bl is 0
     ssq += (bl * invScale).squaredNorm();
 }
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
index a24d4c236cc..afa2ecba0e7 100644
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -28,7 +28,7 @@ class indexed_based_stl_iterator_base {
   typedef indexed_based_stl_iterator_base<typename traits::non_const_iterator> non_const_iterator;
   typedef indexed_based_stl_iterator_base<typename traits::const_iterator> const_iterator;
   typedef std::conditional_t<internal::is_const<XprType>::value, non_const_iterator, const_iterator> other_iterator;
-  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+
   friend class indexed_based_stl_iterator_base<typename traits::const_iterator>;
   friend class indexed_based_stl_iterator_base<typename traits::non_const_iterator>;
 
@@ -174,7 +174,7 @@ class indexed_based_stl_reverse_iterator_base {
   typedef indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator> non_const_iterator;
   typedef indexed_based_stl_reverse_iterator_base<typename traits::const_iterator> const_iterator;
   typedef std::conditional_t<internal::is_const<XprType>::value, non_const_iterator, const_iterator> other_iterator;
-  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+
   friend class indexed_based_stl_reverse_iterator_base<typename traits::const_iterator>;
   friend class indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator>;
 
@@ -318,7 +318,7 @@ class pointer_based_stl_iterator {
   typedef pointer_based_stl_iterator<std::remove_const_t<XprType>> non_const_iterator;
   typedef pointer_based_stl_iterator<std::add_const_t<XprType>> const_iterator;
   typedef std::conditional_t<internal::is_const<XprType>::value, non_const_iterator, const_iterator> other_iterator;
-  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+
   friend class pointer_based_stl_iterator<std::add_const_t<XprType>>;
   friend class pointer_based_stl_iterator<std::remove_const_t<XprType>>;
 
@@ -335,10 +335,9 @@ class pointer_based_stl_iterator {
   typedef std::conditional_t<bool(is_lvalue), value_type*, const value_type*> pointer;
   typedef std::conditional_t<bool(is_lvalue), value_type&, const value_type&> reference;
 
-  pointer_based_stl_iterator() noexcept : m_ptr(0) {}
-  pointer_based_stl_iterator(XprType& xpr, Index index) noexcept : m_incr(xpr.innerStride()) {
-    m_ptr = xpr.data() + index * m_incr.value();
-  }
+  pointer_based_stl_iterator() noexcept : m_ptr(0), m_incr(XprType::InnerStrideAtCompileTime) {}
+  pointer_based_stl_iterator(XprType& xpr, Index index) noexcept
+      : m_ptr(xpr.data() + index * xpr.innerStride()), m_incr(xpr.innerStride()) {}
 
   pointer_based_stl_iterator(const non_const_iterator& other) noexcept : m_ptr(other.m_ptr), m_incr(other.m_incr) {}
 
@@ -450,7 +449,7 @@ class generic_randaccess_stl_iterator
   using Base::m_index;
   using Base::mp_xpr;
 
-  // TODO currently const Transpose/Reshape expressions never returns const references,
+  // TODO: currently const Transpose/Reshape expressions never return const references,
   // so lets return by value too.
   // typedef std::conditional_t<bool(has_direct_access), const value_type&, const value_type> read_only_ref_t;
   typedef const value_type read_only_ref_t;
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 692f0a1cab3..3d71b6d9223 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -58,20 +58,19 @@ class Stride {
   enum { InnerStrideAtCompileTime = InnerStrideAtCompileTime_, OuterStrideAtCompileTime = OuterStrideAtCompileTime_ };
 
   /** Default constructor, for use when strides are fixed at compile time */
-  EIGEN_DEVICE_FUNC Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) {
-    // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
-    // FIXME: for Eigen 4 we should also unify this API with fix<>
+  EIGEN_DEVICE_FUNC constexpr Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) {
     eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
   }
 
   /** Constructor allowing to pass the strides at runtime */
-  EIGEN_DEVICE_FUNC Stride(Index outerStride, Index innerStride) : m_outer(outerStride), m_inner(innerStride) {}
+  EIGEN_DEVICE_FUNC constexpr Stride(Index outerStride, Index innerStride)
+      : m_outer(outerStride), m_inner(innerStride) {}
 
   /** Copy constructor */
-  EIGEN_DEVICE_FUNC Stride(const Stride& other) : m_outer(other.outer()), m_inner(other.inner()) {}
+  EIGEN_DEVICE_FUNC constexpr Stride(const Stride& other) : m_outer(other.outer()), m_inner(other.inner()) {}
 
   /** Copy assignment operator */
-  EIGEN_DEVICE_FUNC Stride& operator=(const Stride& other) {
+  EIGEN_DEVICE_FUNC constexpr Stride& operator=(const Stride& other) {
     m_outer.setValue(other.outer());
     m_inner.setValue(other.inner());
     return *this;
@@ -94,8 +93,8 @@ class InnerStride : public Stride<0, Value> {
   typedef Stride<0, Value> Base;
 
  public:
-  EIGEN_DEVICE_FUNC InnerStride() : Base() {}
-  EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {}  // FIXME making this explicit could break valid code
+  EIGEN_DEVICE_FUNC constexpr InnerStride() : Base() {}
+  EIGEN_DEVICE_FUNC constexpr InnerStride(Index v) : Base(0, v) {}  // FIXME making this explicit could break valid code
 };
 
 /** \brief Convenience specialization of Stride to specify only an outer stride
@@ -105,8 +104,8 @@ class OuterStride : public Stride<Value, 0> {
   typedef Stride<Value, 0> Base;
 
  public:
-  EIGEN_DEVICE_FUNC OuterStride() : Base() {}
-  EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v, 0) {}  // FIXME making this explicit could break valid code
+  EIGEN_DEVICE_FUNC constexpr OuterStride() : Base() {}
+  EIGEN_DEVICE_FUNC constexpr OuterStride(Index v) : Base(v, 0) {}  // FIXME making this explicit could break valid code
 };
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/StructuredBindings.h b/Eigen/src/Core/StructuredBindings.h
new file mode 100644
index 00000000000..073f10e20be
--- /dev/null
+++ b/Eigen/src/Core/StructuredBindings.h
@@ -0,0 +1,155 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Pavel Guzenfeld
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STRUCTURED_BINDINGS_H
+#define EIGEN_STRUCTURED_BINDINGS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17
+
+#include <tuple>
+#include <type_traits>
+
+// Structured bindings support for fixed-size Eigen vectors and matrices.
+//
+// Enables:
+//   Eigen::Vector3d v(1, 2, 3);
+//   auto [x, y, z] = v;
+//
+//   Eigen::Array3i a(4, 5, 6);
+//   auto& [a0, a1, a2] = a;
+//
+// Decomposition follows linear storage order: column-major by default, so
+// Matrix2d decomposes as (0,0), (1,0), (0,1), (1,1). Only fixed-size Matrix and
+// Array participate (2D row-major types are rejected by get<>; row and column
+// vectors are both accepted); Map, Ref, and fixed-size Block intentionally do not.
+
+namespace std {
+
+// std::tuple_size for fixed-size Matrix.
+//
+// Deliberately NOT SFINAE-gated on (Rows, Cols) because base-class-specifier
+// substitution is not a SFINAE context (a malformed base via enable_if_t
+// produces a non-SFINAE hard error rather than letting the primary template
+// stay incomplete). The static_assert below produces a friendly diagnostic
+// if generic code probes tuple_size<MatrixXd>.
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct tuple_size<Eigen::Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>>
+    : std::integral_constant<size_t, static_cast<size_t>((Rows_ > 0 && Cols_ > 0) ? Rows_* Cols_ : 0)> {
+  static_assert(Rows_ != Eigen::Dynamic && Cols_ != Eigen::Dynamic,
+                "Structured bindings require fixed-size Eigen types (e.g. Vector3d, not VectorXd).");
+};
+
+// std::tuple_element for fixed-size Matrix.
+// Note: uses Idx_ instead of I to avoid conflict with Eigen's test framework macro.
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct tuple_element<Idx_, Eigen::Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
+  static_assert(Rows_ != Eigen::Dynamic && Cols_ != Eigen::Dynamic,
+                "Structured bindings require fixed-size Eigen types (e.g. Vector3d, not VectorXd).");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  using type = Scalar_;
+};
+
+// std::tuple_size for fixed-size Array. See note on Matrix specialization above.
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct tuple_size<Eigen::Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>>
+    : std::integral_constant<size_t, static_cast<size_t>((Rows_ > 0 && Cols_ > 0) ? Rows_* Cols_ : 0)> {
+  static_assert(Rows_ != Eigen::Dynamic && Cols_ != Eigen::Dynamic,
+                "Structured bindings require fixed-size Eigen types (e.g. Array3d, not ArrayXd).");
+};
+
+// std::tuple_element for fixed-size Array.
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct tuple_element<Idx_, Eigen::Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
+  static_assert(Rows_ != Eigen::Dynamic && Cols_ != Eigen::Dynamic,
+                "Structured bindings require fixed-size Eigen types (e.g. Array3d, not ArrayXd).");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  using type = Scalar_;
+};
+
+}  // namespace std
+
+namespace Eigen {
+
+// Until the decomposition order for genuinely 2D RowMajor storage is agreed
+// upon, reject row-major matrices at the get<I> level. coeffRef(Index) is
+// linear in storage order, so permitting both orientations for 2D types would
+// silently flip the binding order between Matrix<T,R,C> and Matrix<T,R,C,RowMajor>.
+// Vector types (Rows==1 or Cols==1) are unaffected because storage order
+// does not change element order for a 1×N or N×1 shape — and Eigen forces
+// row-major on 1×N regardless, so we must allow it for row vectors.
+#define EIGEN_STRUCTURED_BINDINGS_ASSERT_COL_MAJOR(ROWS, COLS, OPTIONS)                                       \
+  static_assert((ROWS) == 1 || (COLS) == 1 || ((OPTIONS) & Eigen::RowMajorBit) == 0,                          \
+                "Structured bindings on 2D RowMajor Eigen types are not supported: coeffRef(Index) follows "  \
+                "storage order, so decomposition order would silently flip versus the column-major default. " \
+                "Use a column-major type, or transpose first. Row/column vectors are unaffected.")
+
+// get<Idx_> free functions for Matrix.
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar_& get(
+    Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& m) noexcept {
+  static_assert(Rows_ != Dynamic && Cols_ != Dynamic, "Structured bindings require fixed-size Eigen types.");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  EIGEN_STRUCTURED_BINDINGS_ASSERT_COL_MAJOR(Rows_, Cols_, Options_);
+  return m.coeffRef(static_cast<Index>(Idx_));
+}
+
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar_& get(
+    const Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& m) noexcept {
+  static_assert(Rows_ != Dynamic && Cols_ != Dynamic, "Structured bindings require fixed-size Eigen types.");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  EIGEN_STRUCTURED_BINDINGS_ASSERT_COL_MAJOR(Rows_, Cols_, Options_);
+  return m.coeffRef(static_cast<Index>(Idx_));
+}
+
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar_&& get(
+    Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>&& m) noexcept {
+  static_assert(Rows_ != Dynamic && Cols_ != Dynamic, "Structured bindings require fixed-size Eigen types.");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  EIGEN_STRUCTURED_BINDINGS_ASSERT_COL_MAJOR(Rows_, Cols_, Options_);
+  return std::move(m.coeffRef(static_cast<Index>(Idx_)));
+}
+
+// get<Idx_> free functions for Array.
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar_& get(
+    Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& a) noexcept {
+  static_assert(Rows_ != Dynamic && Cols_ != Dynamic, "Structured bindings require fixed-size Eigen types.");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  EIGEN_STRUCTURED_BINDINGS_ASSERT_COL_MAJOR(Rows_, Cols_, Options_);
+  return a.coeffRef(static_cast<Index>(Idx_));
+}
+
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar_& get(
+    const Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& a) noexcept {
+  static_assert(Rows_ != Dynamic && Cols_ != Dynamic, "Structured bindings require fixed-size Eigen types.");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  EIGEN_STRUCTURED_BINDINGS_ASSERT_COL_MAJOR(Rows_, Cols_, Options_);
+  return a.coeffRef(static_cast<Index>(Idx_));
+}
+
+template <size_t Idx_, typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar_&& get(
+    Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>&& a) noexcept {
+  static_assert(Rows_ != Dynamic && Cols_ != Dynamic, "Structured bindings require fixed-size Eigen types.");
+  static_assert(Idx_ < static_cast<size_t>(Rows_ * Cols_), "Index out of range.");
+  EIGEN_STRUCTURED_BINDINGS_ASSERT_COL_MAJOR(Rows_, Cols_, Options_);
+  return std::move(a.coeffRef(static_cast<Index>(Idx_)));
+}
+
+}  // namespace Eigen
+
+#endif  // C++17
+
+#endif  // EIGEN_STRUCTURED_BINDINGS_H
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index dd825e90752..91d06adace3 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -36,9 +36,10 @@ class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
   typedef typename Base::DstXprType DstXprType;
   typedef swap_assign_op<Scalar> Functor;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE generic_dense_assignment_kernel(DstEvaluatorTypeT &dst,
-                                                                        const SrcEvaluatorTypeT &src,
-                                                                        const Functor &func, DstXprType &dstExpr)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE generic_dense_assignment_kernel(DstEvaluatorTypeT &dst,
+                                                                                  const SrcEvaluatorTypeT &src,
+                                                                                  const Functor &func,
+                                                                                  DstXprType &dstExpr)
       : Base(dst, src, func, dstExpr) {}
 
   template <int StoreMode, int LoadMode, typename PacketType>
@@ -57,8 +58,6 @@ class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
     m_dst.template writePacket<StoreMode>(index, tmp);
   }
 
-  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
-  // mean no CRTP (Gael)
   template <int StoreMode, int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) {
     Index row = Base::rowIndexByOuterInner(outer, inner);
@@ -82,8 +81,6 @@ class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
     m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
   }
 
-  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
-  // mean no CRTP (Gael)
   template <int StoreMode, int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
     Index row = Base::rowIndexByOuterInner(outer, inner);
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 0676a252afc..2077c925dfe 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -61,20 +61,21 @@ class Transpose : public TransposeImpl<MatrixType, typename internal::traits<Mat
   EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
   typedef internal::remove_all_t<MatrixType> NestedExpression;
 
-  EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
+  EIGEN_DEVICE_FUNC constexpr explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.rows(); }
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const internal::remove_all_t<MatrixTypeNested>& nestedExpression()
+      const {
     return m_matrix;
   }
 
   /** \returns the nested expression */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::remove_reference_t<MatrixTypeNested>& nestedExpression() {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE std::remove_reference_t<MatrixTypeNested>& nestedExpression() {
     return m_matrix;
   }
 
@@ -114,17 +115,17 @@ class TransposeImpl<MatrixType, Dense> : public internal::TransposeImpl_base<Mat
   EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index innerStride() const { return derived().nestedExpression().innerStride(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outerStride() const { return derived().nestedExpression().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Index innerStride() const {
+    return derived().nestedExpression().innerStride();
+  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Index outerStride() const {
+    return derived().nestedExpression().outerStride();
+  }
 
   typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr ScalarWithConstIfNotLvalue* data() {
-    return derived().nestedExpression().data();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar* data() const {
-    return derived().nestedExpression().data();
-  }
+  EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return derived().nestedExpression().data(); }
 
   // FIXME: shall we keep the const version of coeffRef?
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const {
@@ -194,7 +195,7 @@ DenseBase<Derived>::transpose() const {
  *
  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType MatrixBase<Derived>::adjoint() const {
+EIGEN_DEVICE_FUNC constexpr const typename MatrixBase<Derived>::AdjointReturnType MatrixBase<Derived>::adjoint() const {
   return AdjointReturnType(this->transpose());
 }
 
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 27ad78ecafc..2b0f56bea80 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -89,6 +89,11 @@ class TriangularBase : public EigenBase<Derived> {
     return coeffRef(row, col);
   }
 
+#ifdef EIGEN_MULTIDIMENSIONAL_SUBSCRIPT
+  EIGEN_DEVICE_FUNC inline Scalar operator[](Index row, Index col) const { return operator()(row, col); }
+  EIGEN_DEVICE_FUNC inline Scalar& operator[](Index row, Index col) { return operator()(row, col); }
+#endif
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
   EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast<Derived*>(this); }
@@ -239,7 +244,7 @@ class TriangularView
   }
 
   template <typename Other>
-  EIGEN_DEVICE_FUNC inline const Solve<TriangularView, Other> solve(const MatrixBase<Other>& other) const {
+  EIGEN_DEVICE_FUNC inline Solve<TriangularView, Other> solve(const MatrixBase<Other>& other) const {
     return Solve<TriangularView, Other>(*this, other.derived());
   }
 
@@ -408,6 +413,21 @@ class TriangularViewImpl<MatrixType_, Mode_, Dense> : public TriangularBase<Tria
     return Product<OtherDerived, TriangularViewType>(lhs.derived(), rhs.derived());
   }
 
+  // Scaling a unit triangular view would break its implicit unit diagonal, so only non-unit modes participate.
+  template <unsigned int M = Mode, std::enable_if_t<(M & UnitDiag) == 0, int> = 0>
+  EIGEN_DEVICE_FUNC const
+      TriangularView<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(MatrixType, Scalar, product), Mode>
+      operator*(const Scalar& s) const {
+    return (derived().nestedExpression() * s).template triangularView<Mode>();
+  }
+
+  template <unsigned int M = Mode, std::enable_if_t<(M & UnitDiag) == 0, int> = 0>
+  friend EIGEN_DEVICE_FUNC const
+      TriangularView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar, MatrixType, product), Mode>
+      operator*(const Scalar& s, const TriangularViewImpl& mat) {
+    return (s * mat.derived().nestedExpression()).template triangularView<Mode>();
+  }
+
   /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
    *
    * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
@@ -563,7 +583,7 @@ EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived>&
  */
 template <typename Derived>
 template <unsigned int Mode>
-EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
+EIGEN_DEVICE_FUNC constexpr typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() {
   return typename TriangularViewReturnType<Mode>::Type(derived());
 }
@@ -571,7 +591,7 @@ MatrixBase<Derived>::triangularView() {
 /** This is the const version of MatrixBase::triangularView() */
 template <typename Derived>
 template <unsigned int Mode>
-EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
+EIGEN_DEVICE_FUNC constexpr typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const {
   return typename ConstTriangularViewReturnType<Mode>::Type(derived());
 }
@@ -628,7 +648,7 @@ bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const {
 
 namespace internal {
 
-// TODO currently a triangular expression has the form TriangularView<.,.>
+// TODO: currently a triangular expression has the form TriangularView<.,.>
 //      in the future triangular-ness should be defined by the expression traits
 //      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make
 //      it work)
@@ -812,28 +832,51 @@ struct triangular_assignment_loop<Kernel, Mode, 0, SetOpposite> {
 template <typename Kernel, unsigned int Mode, bool SetOpposite>
 struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite> {
   typedef typename Kernel::Scalar Scalar;
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename Kernel::AssignmentTraits AssignmentTraits;
+
+  enum {
+    IsRowMajor = (int(DstEvaluatorType::Flags) & RowMajorBit) != 0,
+    // In col-major: inner=row, outer=col. Upper means row<col i.e. inner<outer -> active before diagonal.
+    // In row-major: inner=col, outer=row. Upper means row<col i.e. inner>outer -> active after diagonal.
+    // So ActiveBeforeDiag = (Upper XOR IsRowMajor).
+    ActiveBeforeDiag = (bool(Mode & Upper) != bool(IsRowMajor))
+  };
+
+  // Compile-time outer/inner to row/col mapping. These constant-fold away entirely:
+  // ColMajor: row(outer,i) -> i, col(outer,i) -> outer
+  // RowMajor: row(outer,i) -> outer, col(outer,i) -> i
+  static constexpr Index row(Index outer, Index inner) { return IsRowMajor ? outer : inner; }
+  static constexpr Index col(Index outer, Index inner) { return IsRowMajor ? inner : outer; }
+
+  // Iterates in outer/inner order matching the storage layout for cache friendliness.
+  // Unlike the old code (which always iterated outer=col, inner=row), this gives
+  // contiguous memory access for both ColMajor and RowMajor storage.
+  // Simple scalar loops allow GCC to recognize memcpy/memset idioms and Clang to auto-vectorize.
+  // Uses a single running inner index 'i' per outer slice (not separate loop variables) so the
+  // compiler can track the continuous progression and optimize register allocation.
   EIGEN_DEVICE_FUNC static inline void run(Kernel& kernel) {
-    for (Index j = 0; j < kernel.cols(); ++j) {
-      Index maxi = numext::mini(j, kernel.rows());
+    const Index outerSize = IsRowMajor ? kernel.rows() : kernel.cols();
+    const Index innerSize = IsRowMajor ? kernel.cols() : kernel.rows();
+
+    for (Index outer = 0; outer < outerSize; ++outer) {
+      const Index maxi = numext::mini(outer, innerSize);
       Index i = 0;
-      if (((Mode & Lower) && SetOpposite) || (Mode & Upper)) {
-        for (; i < maxi; ++i)
-          if (Mode & Upper)
-            kernel.assignCoeff(i, j);
-          else
-            kernel.assignOppositeCoeff(i, j);
-      } else
+
+      if (ActiveBeforeDiag) {
+        for (; i < maxi; ++i) kernel.assignCoeff(row(outer, i), col(outer, i));
+      } else if (SetOpposite) {
+        for (; i < maxi; ++i) kernel.assignOppositeCoeff(row(outer, i), col(outer, i));
+      } else {
         i = maxi;
+      }
 
-      if (i < kernel.rows())  // then i==j
-        kernel.assignDiagonalCoeff(i++);
+      if (i < innerSize) kernel.assignDiagonalCoeff(i++);
 
-      if (((Mode & Upper) && SetOpposite) || (Mode & Lower)) {
-        for (; i < kernel.rows(); ++i)
-          if (Mode & Lower)
-            kernel.assignCoeff(i, j);
-          else
-            kernel.assignOppositeCoeff(i, j);
+      if (!ActiveBeforeDiag) {
+        for (; i < innerSize; ++i) kernel.assignCoeff(row(outer, i), col(outer, i));
+      } else if (SetOpposite) {
+        for (; i < innerSize; ++i) kernel.assignOppositeCoeff(row(outer, i), col(outer, i));
       }
     }
   }
diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h
index 5ac13eb8e41..1277e26f7fc 100644
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@@ -68,13 +68,13 @@ class VectorBlock : public Block<VectorType, internal::traits<VectorType>::Flags
 
   /** Dynamic-size constructor
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE VectorBlock(VectorType& vector, Index start, Index size)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE VectorBlock(VectorType& vector, Index start, Index size)
       : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) {
   }
 
   /** Fixed-size constructor
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE VectorBlock(VectorType& vector, Index start)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE VectorBlock(VectorType& vector, Index start)
       : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start) {}
 };
 
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 9ccbf7d7685..9e34d8c99a7 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -146,6 +146,22 @@ struct member_redux {
   const BinaryOp& binaryFunc() const { return m_functor; }
   const BinaryOp m_functor;
 };
+
+template <typename Scalar>
+struct scalar_replace_zero_with_one_op {  // unary functor: x -> (x is exactly zero ? 1 : x)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& x) const {
+    return numext::is_exactly_zero(x) ? Scalar(1) : x;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return pselect(pcmp_eq(x, pzero(x)), pset1<Packet>(Scalar(1)), x);  // packet counterpart of operator()
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_replace_zero_with_one_op<Scalar>> {
+  enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasCmp };  // packetOp uses pcmp_eq, hence gated on HasCmp
+};
+
 }  // namespace internal
 
 /** \class VectorwiseOp
@@ -624,18 +640,28 @@ class VectorwiseOp {
     return m_matrix / extendedTo(other.derived());
   }
 
+  using Normalized_NonzeroNormType =
+      CwiseUnaryOp<internal::scalar_replace_zero_with_one_op<Scalar>, const NormReturnType>;
+  using NormalizedReturnType = CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeNestedCleaned,
+                                             const typename OppositeExtendedType<Normalized_NonzeroNormType>::Type>;
+
   /** \returns an expression where each column (or row) of the referenced matrix are normalized.
    * The referenced matrix is \b not modified.
+   *
+   * \warning Columns (or rows) whose norm is exactly 0 are divided by 1 instead, so they remain unchanged in the
+   *          resulting expression.
+   *
    * \sa MatrixBase::normalized(), normalize()
    */
-  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeNestedCleaned,
-                                  const typename OppositeExtendedType<NormReturnType>::Type>
-  normalized() const {
-    return m_matrix.cwiseQuotient(extendedToOpposite(this->norm()));
+  EIGEN_DEVICE_FUNC NormalizedReturnType normalized() const {
+    return m_matrix.cwiseQuotient(extendedToOpposite(Normalized_NonzeroNormType(this->norm())));
   }
 
   /** Normalize in-place each row or columns of the referenced matrix.
-   * \sa MatrixBase::normalize(), normalized()
+   *
+   * \warning Columns (or rows) whose norm is exactly 0 are left unchanged.
+   *
+   * \sa MatrixBase::normalize(), normalized()
    */
   EIGEN_DEVICE_FUNC void normalize() { m_matrix = this->normalized(); }
 
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index e1d2ca52707..e2bbf96ce1c 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -46,8 +46,8 @@ struct visitor_impl<Visitor, Derived, UnrollCount, Vectorize, false, ShortCircui
 
   static constexpr bool CanVectorize(int K) {
     constexpr int InnerSizeAtCompileTime = RowMajor ? ColsAtCompileTime : RowsAtCompileTime;
-    if (InnerSizeAtCompileTime < PacketSize) return false;
-    return Vectorize && (InnerSizeAtCompileTime - (K % InnerSizeAtCompileTime) >= PacketSize);
+    return Vectorize && InnerSizeAtCompileTime >= PacketSize &&
+           (InnerSizeAtCompileTime - (K % (InnerSizeAtCompileTime > 0 ? InnerSizeAtCompileTime : 1)) >= PacketSize);
   }
 
   template <int K = 0, bool Empty = (K == UnrollCount), std::enable_if_t<Empty, bool> = true>
@@ -317,6 +317,13 @@ class visitor_evaluator {
   const XprType& m_xpr;
 };
 
+template <typename T, typename = void>
+struct visitor_has_linear_access : std::false_type {};  // fallback: functor_traits<T> has no LinearAccess member
+// Detected case: the second argument must be exactly `void` to match the primary's default, hence the void cast.
+template <typename T>
+struct visitor_has_linear_access<T, decltype(static_cast<void>(functor_traits<T>::LinearAccess))>
+    : std::integral_constant<bool, static_cast<bool>(functor_traits<T>::LinearAccess)> {};
+
 template <typename Derived, typename Visitor, bool ShortCircuitEvaulation>
 struct visit_impl {
   using Evaluator = visitor_evaluator<Derived>;
@@ -329,8 +336,7 @@ struct visit_impl {
   static constexpr int InnerSizeAtCompileTime = IsRowMajor ? ColsAtCompileTime : RowsAtCompileTime;
   static constexpr int OuterSizeAtCompileTime = IsRowMajor ? RowsAtCompileTime : ColsAtCompileTime;
 
-  static constexpr bool LinearAccess =
-      Evaluator::LinearAccess && static_cast<bool>(functor_traits<Visitor>::LinearAccess);
+  static constexpr bool LinearAccess = Evaluator::LinearAccess && visitor_has_linear_access<Visitor>::value;
   static constexpr bool Vectorize = Evaluator::PacketAccess && static_cast<bool>(functor_traits<Visitor>::PacketAccess);
 
   static constexpr int PacketSize = packet_traits<Scalar>::size;
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index a4a87c4fc67..28d7cd661a3 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -79,8 +79,8 @@ EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) {
-  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
-                                                            0x80000000, 0x00000000, 0x80000000));
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32,
+                                                            0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32));
   return Packet4cf(_mm256_xor_ps(a.v, mask));
 }
 
@@ -141,10 +141,13 @@ EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
 
 template <>
 EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) {
-  // FIXME The following might be optimized using _mm256_movedup_pd
-  Packet2cf a = ploaddup<Packet2cf>(from);
-  Packet2cf b = ploaddup<Packet2cf>(from + 1);
-  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+  // vbroadcastf128 + vpermilpd, 2 uops: broadcast the 16 bytes holding two
+  // complex<float> into both 128-bit lanes, then duplicate each complex so
+  // the result is {c0, c0, c1, c1}. The load has no alignment requirement;
+  // we cast the source pointer through void* rather than through double*
+  // because alignof(std::complex<float>) == 4 < alignof(double).
+  __m256 bcast = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(static_cast<const void*>(from)));
+  return Packet4cf(_mm256_castpd_ps(_mm256_permute_pd(_mm256_castps_pd(bcast), 3 << 2)));
 }
 
 template <>
@@ -245,6 +248,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
     HasNegate = 1,
     HasSqrt = 1,
     HasLog = 1,
+    HasExp = 1,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -282,7 +286,8 @@ EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) {
-  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+  const __m256d mask =
+      _mm256_castsi256_pd(_mm256_set_epi32(SIGN_MASK_I32, 0x0, 0x0, 0x0, SIGN_MASK_I32, 0x0, 0x0, 0x0));
   return Packet2cd(_mm256_xor_pd(a.v, mask));
 }
 
@@ -430,29 +435,20 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cd, 2>& kernel) {
   kernel.packet[0].v = tmp;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
-  return psqrt_complex<Packet2cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
-  return psqrt_complex<Packet4cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cd plog<Packet2cd>(const Packet2cd& a) {
-  return plog_complex<Packet2cd>(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS_NO_EXP(Packet2cd)
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet4cf)
 
 template <>
-EIGEN_STRONG_INLINE Packet4cf plog<Packet4cf>(const Packet4cf& a) {
-  return plog_complex<Packet4cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4cf pexp<Packet4cf>(const Packet4cf& a) {
-  return pexp_complex<Packet4cf>(a);
+EIGEN_STRONG_INLINE Packet2cd pexp<Packet2cd>(const Packet2cd& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return pexp_complex<Packet2cd>(a);
+#else
+  // Without AVX2, pexp_complex<Packet2cd> requires psincos_double<Packet4d> which needs
+  // 256-bit integer operations (Packet4l) not available on AVX-only targets.
+  // Process as two independent Packet1cd using the SSE implementation instead.
+  return Packet2cd(_mm256_insertf128_pd(_mm256_castpd128_pd256(pexp(Packet1cd(_mm256_castpd256_pd128(a.v))).v),
+                                        pexp(Packet1cd(_mm256_extractf128_pd(a.v, 1))).v, 1));
+#endif
 }
 
 #ifdef EIGEN_VECTORIZE_FMA
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 5b7285f99bc..357f3142e68 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -24,17 +24,43 @@ namespace internal {
 EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet8f)
 
 EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(sinh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(cosh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(asinh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(acosh, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet4d)
-EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log10, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d)
 #ifdef EIGEN_VECTORIZE_AVX2
 EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
 EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tan, Packet4d)
+#else
+// Without AVX2, psincos_double<Packet4d> requires 256-bit integer operations (Packet4l)
+// that are not available. Process as two Packet2d halves using the SSE implementation.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d psin<Packet4d>(const Packet4d& x) {
+  const __m128d upper = psin(_mm256_extractf128_pd(x, 1));  // sine of the upper 128-bit half
+  return _mm256_insertf128_pd(_mm256_castpd128_pd256(psin(_mm256_castpd256_pd128(x))), upper, 1);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d pcos<Packet4d>(const Packet4d& x) {
+  const __m128d upper = pcos(_mm256_extractf128_pd(x, 1));  // cosine of the upper 128-bit half
+  return _mm256_insertf128_pd(_mm256_castpd128_pd256(pcos(_mm256_castpd256_pd128(x))), upper, 1);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d ptan<Packet4d>(const Packet4d& x) {
+  const __m128d upper = ptan(_mm256_extractf128_pd(x, 1));  // tangent of the upper 128-bit half
+  return _mm256_insertf128_pd(_mm256_castpd128_pd256(ptan(_mm256_castpd256_pd128(x))), upper, 1);
+}
 #endif
 EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4d)
 EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4d)
+EIGEN_GENERIC_PACKET_FUNCTION(expm1, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log1p, Packet4d)
 
 // Notice that for newer processors, it is counterproductive to use Newton
 // iteration for square root. In particular, Skylake and Zen2 processors
@@ -95,32 +121,10 @@ EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& expone
   return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp2)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, preciprocal)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
-BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_BF16(Packet8f, Packet8bf)
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp2)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, preciprocal)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
-F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_F16(Packet8f, Packet8h)
 #endif
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index eb5da53d032..bd35d7d202c 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -46,6 +46,8 @@ typedef eigen_packet_wrapper<__m256i, 3> Packet4l;
 typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
 #endif
 
+#define SIGN_MASK_I64 static_cast<int64_t>(0x8000000000000000ULL)
+
 template <>
 struct is_arithmetic<__m256> {
   enum { value = true };
@@ -110,14 +112,21 @@ struct packet_traits<float> : default_packet_traits {
     HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
     HasATanh = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
     HasLog = 1,
+    HasLog10 = 1,
+    HasExp = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
-    HasExp = 1,
+    HasPow = 1,
     HasNdtri = 1,
     HasBessel = 1,
     HasSqrt = 1,
@@ -126,7 +135,6 @@ struct packet_traits<float> : default_packet_traits {
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
-    HasBlend = 1
   };
 };
 template <>
@@ -143,18 +151,26 @@ struct packet_traits<double> : default_packet_traits {
 #ifdef EIGEN_VECTORIZE_AVX2
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
 #endif
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasLog = 1,
     HasErf = 1,
     HasErfc = 1,
+    HasLog = 1,
+    HasLog10 = 1,
     HasExp = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasPow = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasCbrt = 1,
     HasATan = 1,
     HasATanh = 1,
-    HasBlend = 1
   };
 };
 
@@ -177,7 +193,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasCos = EIGEN_FAST_MATH,
     HasNegate = 1,
     HasAbs = 1,
-    HasAbs2 = 0,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
@@ -190,7 +205,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasBlend = 0,
     HasBessel = 1,
     HasNdtri = 1
   };
@@ -216,7 +230,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
     HasCos = EIGEN_FAST_MATH,
     HasNegate = 1,
     HasAbs = 1,
-    HasAbs2 = 0,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
@@ -229,7 +242,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasBlend = 0,
     HasBessel = 1,
     HasNdtri = 1
   };
@@ -252,7 +264,6 @@ struct packet_traits<uint32_t> : default_packet_traits {
 
     HasDiv = 0,
     HasNegate = 0,
-    HasSqrt = 0,
 
     HasCmp = 1,
     HasMin = 1,
@@ -279,13 +290,9 @@ struct packet_traits<uint64_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
 
-    // HasMin = 0,
-    // HasMax = 0,
     HasDiv = 0,
-    HasBlend = 0,
     HasTranspose = 0,
     HasNegate = 0,
-    HasSqrt = 0,
     HasCmp = 1,
     HasShift = 1
   };
@@ -880,12 +887,12 @@ EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8u
 
 template <>
 EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
-  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(SIGN_MASK_I32));
   return _mm256_xor_ps(a, mask);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) {
-  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(SIGN_MASK_I64));
   return _mm256_xor_pd(a, mask);
 }
 template <>
@@ -1769,12 +1776,6 @@ template <>
 EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) {
   __m256d tmp = _mm256_shuffle_pd(a, a, 5);
   return _mm256_permute2f128_pd(tmp, tmp, 1);
-#if 0
-  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
-  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
-  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
-    return _mm256_permute_pd(swap_halves,5);
-#endif
 }
 template <>
 EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a) {
@@ -1937,15 +1938,15 @@ EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Pack
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
+EIGEN_STRONG_INLINE Packet4f predux_half<Packet8f>(const Packet8f& a) {
   return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
 }
 template <>
-EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a) {
+EIGEN_STRONG_INLINE Packet4i predux_half<Packet8i>(const Packet8i& a) {
   return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
 }
 template <>
-EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
+EIGEN_STRONG_INLINE Packet4ui predux_half<Packet8ui>(const Packet8ui& a) {
   return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
 }
 
@@ -2068,31 +2069,6 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
   kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
 }
 
-EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) {
-  return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1],
-                           0 - ifPacket.select[0]);
-}
-
-EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) {
-  return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5],
-                          0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2],
-                          0 - ifPacket.select[1], 0 - ifPacket.select[0]);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
-                                    const Packet8f& elsePacket) {
-  const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
-  return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
-                                    const Packet4d& elsePacket) {
-  const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
-  return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
-}
-
 // Packet math for Eigen::half
 #ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
@@ -2161,7 +2137,7 @@ EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
-  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<short>(0x8000u));
   return _mm_andnot_si128(sign_mask, a);
 }
 
@@ -2316,7 +2292,7 @@ EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
-  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  Packet8h sign_mask = _mm_set1_epi16(static_cast<short>(0x8000u));
   return _mm_xor_si128(a, sign_mask);
 }
 
@@ -2595,7 +2571,7 @@ EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
-  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<short>(0x8000u));
   return _mm_andnot_si128(sign_mask, a);
 }
 
@@ -2688,7 +2664,7 @@ EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
-  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  Packet8bf sign_mask = _mm_set1_epi16(static_cast<short>(0x8000u));
   return _mm_xor_si128(a, sign_mask);
 }
 
@@ -2829,7 +2805,7 @@ inline __m128i segment_mask_4x8(Index begin, Index count) {
   mask <<= CHAR_BIT * count;
   mask--;
   mask <<= CHAR_BIT * begin;
-#if defined(_WIN32) && !defined(_WIN64)
+#if !EIGEN_ARCH_x86_64
   return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
 #else
   return _mm_cvtsi64_si128(mask);
@@ -2845,7 +2821,7 @@ inline __m128i segment_mask_8x8(Index begin, Index count) {
   mask <<= (CHAR_BIT / 2) * count;
   mask--;
   mask <<= CHAR_BIT * begin;
-#if defined(_WIN32) && !defined(_WIN64)
+#if !EIGEN_ARCH_x86_64
   return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
 #else
   return _mm_cvtsi64_si128(mask);
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 5b73ffe8607..9feb38f81af 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -56,6 +56,40 @@ struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int
 #endif
 #endif
 
+// Joins two __m128 vectors into one __m256: `lo` fills the low 128 bits, `hi` the high 128 bits.
+EIGEN_STRONG_INLINE __m256 _eigen_mm256_set_m128(__m128 hi, __m128 lo) {
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 810)
+  return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
+#else
+  return _mm256_set_m128(hi, lo);
+#endif
+}
+
+// Joins two __m128d vectors into one __m256d: `lo` fills the low 128 bits, `hi` the high 128 bits.
+EIGEN_STRONG_INLINE __m256d _eigen_mm256_set_m128d(__m128d hi, __m128d lo) {
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 810)
+  return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
+#else
+  return _mm256_set_m128d(hi, lo);
+#endif
+}
+
+// Joins two 128-bit integer vectors into one __m256i: `lo` becomes bits 0-127 and `hi` bits
+// 128-255, i.e. the same argument order as _mm256_set_m128i.
+EIGEN_STRONG_INLINE __m256i _eigen_mm256_set_m128i(__m128i hi, __m128i lo) {
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 810)
+  // Hand-built path for the toolchains selected above (presumably ones that do not declare
+  // _mm256_set_m128i -- confirm against the supported compiler matrix).
+  // _mm256_castsi128_si256 and _mm256_insertf128_si256 are plain AVX intrinsics, so a single
+  // code path serves AVX and AVX2 builds alike and avoids a store/reload through the stack.
+  const __m256i low_half = _mm256_castsi128_si256(lo);
+  return _mm256_insertf128_si256(low_half, hi, 1);
+#else
+  // The compiler provides the intrinsic; use it directly.
+  return _mm256_set_m128i(hi, lo);
+#endif
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a, const Packet8f& b) {
   __m256 nonzero_a = _mm256_cmp_ps(a, pzero(a), _CMP_NEQ_UQ);
@@ -109,7 +143,7 @@ EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
-  return _mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a));
+  return _eigen_mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a));
 }
 
 template <>
@@ -124,7 +158,7 @@ EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet8f pcast<Packet4d, Packet8f>(const Packet4d& a, const Packet4d& b) {
-  return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
+  return _eigen_mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
 }
 
 template <>
@@ -240,8 +274,8 @@ EIGEN_STRONG_INLINE Packet4d pcast<Packet4l, Packet4d>(const Packet4l& a) {
 #if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL)
   return _mm256_cvtepi64_pd(a);
 #else
-  EIGEN_ALIGN16 int64_t aux[4];
-  pstore(aux, a);
+  int64_t aux[4];
+  pstoreu(aux, a);
   return _mm256_set_pd(static_cast<double>(aux[3]), static_cast<double>(aux[2]), static_cast<double>(aux[1]),
                        static_cast<double>(aux[0]));
 #endif
@@ -249,7 +283,7 @@ EIGEN_STRONG_INLINE Packet4d pcast<Packet4l, Packet4d>(const Packet4l& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet4d pcast<Packet2l, Packet4d>(const Packet2l& a, const Packet2l& b) {
-  return _mm256_set_m128d((pcast<Packet2l, Packet2d>(b)), (pcast<Packet2l, Packet2d>(a)));
+  return _eigen_mm256_set_m128d((pcast<Packet2l, Packet2d>(b)), (pcast<Packet2l, Packet2d>(a)));
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index b70c7fefedb..b8483a80427 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -82,8 +82,8 @@ EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) {
 template <>
 EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) {
   const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
-      0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
-      0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000));
+      0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32,
+      0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32, 0x00000000, SIGN_MASK_I32));
   return Packet8cf(pxor(a.v, mask));
 }
 
@@ -184,7 +184,7 @@ EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
+EIGEN_STRONG_INLINE Packet4cf predux_half<Packet8cf>(const Packet8cf& a) {
   __m256 lane0 = extract256<0>(a.v);
   __m256 lane1 = extract256<1>(a.v);
   __m256 res = _mm256_add_ps(lane0, lane1);
@@ -226,6 +226,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
     HasNegate = 1,
     HasSqrt = 1,
     HasLog = 1,
+    HasExp = 1,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -262,8 +263,9 @@ EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) {
-  const __m512d mask = _mm512_castsi512_pd(_mm512_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0,
-                                                            0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+  const __m512d mask =
+      _mm512_castsi512_pd(_mm512_set_epi32(SIGN_MASK_I32, 0x0, 0x0, 0x0, SIGN_MASK_I32, 0x0, 0x0, 0x0, SIGN_MASK_I32,
+                                           0x0, 0x0, 0x0, SIGN_MASK_I32, 0x0, 0x0, 0x0));
   return Packet4cd(pxor(a.v, mask));
 }
 
@@ -441,30 +443,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cd, 4>& kernel) {
   kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0, 2, 0, 2>::mask)));  // [a0 b0 c0 d0]
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
-  return psqrt_complex<Packet4cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
-  return psqrt_complex<Packet8cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4cd plog<Packet4cd>(const Packet4cd& a) {
-  return plog_complex<Packet4cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8cf plog<Packet8cf>(const Packet8cf& a) {
-  return plog_complex<Packet8cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8cf pexp<Packet8cf>(const Packet8cf& a) {
-  return pexp_complex<Packet8cf>(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet4cd)
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet8cf)
 
 }  // end namespace internal
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index e06b83c91c4..50e26ec6d5c 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -35,6 +35,8 @@
 namespace Eigen {
 namespace internal {
 
+#if EIGEN_USE_AVX512_GEMM_KERNELS
+
 template <typename Scalar, bool is_unit_inc>
 class gemm_class {
   using vec = typename packet_traits<Scalar>::type;
@@ -79,10 +81,10 @@ class gemm_class {
   Index m;
   const Index n, k, ldc;
   const Index inc;
-  const Scalar *alpha;
+  const Scalar* alpha;
 
   const Scalar *a, *b;
-  Scalar *c;
+  Scalar* c;
 
   const bool is_alpha1;
   const bool is_beta0;
@@ -90,26 +92,26 @@ class gemm_class {
   const Index a_stride, b_stride;
   const Index a_off, b_off;
 
-  EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
-    _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
+  EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar* a_addr) {
+    _mm_prefetch((char*)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
   }
 
-  EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar *b_addr) {
-    _mm_prefetch((char *)(b_prefetch_size + b_addr - b_shift), _MM_HINT_T0);
+  EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar* b_addr) {
+    _mm_prefetch((char*)(b_prefetch_size + b_addr - b_shift), _MM_HINT_T0);
   }
 
-  EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar *x_addr) { _mm_prefetch((char *)(x_addr - a_shift), _MM_HINT_T2); }
+  EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar* x_addr) { _mm_prefetch((char*)(x_addr - a_shift), _MM_HINT_T2); }
 
-  EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar *c_addr) {
+  EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar* c_addr) {
 #if defined(__PRFCHW__) && __PRFCHW__ == 1
-    _m_prefetchw((void *)c_addr);
+    _m_prefetchw((void*)c_addr);
 #else
-    _mm_prefetch((char *)c_addr, _MM_HINT_T0);
+    _mm_prefetch((char*)c_addr, _MM_HINT_T0);
 #endif
   }
 
   template <int nelems>
-  EIGEN_ALWAYS_INLINE void a_load(vec &a_reg, const Scalar *a_addr) {
+  EIGEN_ALWAYS_INLINE void a_load(vec& a_reg, const Scalar* a_addr) {
     switch (nelems * sizeof(*a_addr) * 8) {
       default:
       case 512 * 3:
@@ -122,13 +124,13 @@ class gemm_class {
         a_reg = ploadu<vec>(a_addr);
         break;
       case 256 * 1:
-        a_reg = preinterpret<vec>(_mm512_broadcast_f64x4(ploadu<Packet4d>(reinterpret_cast<const double *>(a_addr))));
+        a_reg = preinterpret<vec>(_mm512_broadcast_f64x4(ploadu<Packet4d>(reinterpret_cast<const double*>(a_addr))));
         break;
       case 128 * 1:
-        a_reg = preinterpret<vec>(_mm512_broadcast_f32x4(ploadu<Packet4f>(reinterpret_cast<const float *>(a_addr))));
+        a_reg = preinterpret<vec>(_mm512_broadcast_f32x4(ploadu<Packet4f>(reinterpret_cast<const float*>(a_addr))));
         break;
       case 64 * 1:
-        a_reg = preinterpret<vec>(pload1<Packet8d>(reinterpret_cast<const double *>(a_addr)));
+        a_reg = preinterpret<vec>(pload1<Packet8d>(reinterpret_cast<const double*>(a_addr)));
         break;
       case 32 * 1:
         a_reg = pload1<vec>(a_addr);
@@ -136,10 +138,10 @@ class gemm_class {
     }
   }
 
-  EIGEN_ALWAYS_INLINE void b_load(vec &b_reg, const Scalar *b_addr) { b_reg = pload1<vec>(b_addr); }
+  EIGEN_ALWAYS_INLINE void b_load(vec& b_reg, const Scalar* b_addr) { b_reg = pload1<vec>(b_addr); }
 
   template <int nelems>
-  EIGEN_ALWAYS_INLINE void c_store(Scalar *mem, vec &src) {
+  EIGEN_ALWAYS_INLINE void c_store(Scalar* mem, vec& src) {
     if (is_unit_inc) {
       switch (nelems * sizeof(*mem) * 8) {
         default:
@@ -194,7 +196,7 @@ class gemm_class {
   }
 
   template <int nelems>
-  EIGEN_ALWAYS_INLINE void vaddm(vec &dst, const Scalar *mem, vec &src, vec &reg) {
+  EIGEN_ALWAYS_INLINE void vaddm(vec& dst, const Scalar* mem, vec& src, vec& reg) {
     if (is_unit_inc) {
       switch (nelems * sizeof(*mem) * 8) {
         default:
@@ -261,7 +263,7 @@ class gemm_class {
     }
   }
 
-  EIGEN_STRONG_INLINE void vfmadd(vec &dst, const vec &src1, const vec &src2) {
+  EIGEN_STRONG_INLINE void vfmadd(vec& dst, const vec& src1, const vec& src2) {
     dst = pmadd(src1, src2, dst);
 
 #if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
@@ -271,7 +273,7 @@ class gemm_class {
   }
 
   template <int nelems>
-  EIGEN_ALWAYS_INLINE void vfmaddm(vec &dst, const Scalar *mem, vec &src, vec &scale, vec &reg) {
+  EIGEN_ALWAYS_INLINE void vfmaddm(vec& dst, const Scalar* mem, vec& src, vec& scale, vec& reg) {
     if (is_unit_inc) {
       switch (nelems * sizeof(*mem) * 8) {
         default:
@@ -348,16 +350,16 @@ class gemm_class {
   }
 
   template <int j, int endX, int i, int endY, int nelems>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar *ao) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar* ao) {
     EIGEN_UNUSED_VARIABLE(ao);
   }
 
   template <int j, int endX, int i, int endY, int nelems>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar *ao) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar* ao) {
     if (j < endX) {
       if (i < endY) {
-        auto &a_reg = zmm[a_regs[i + (j % 2) * 3]];
-        const Scalar *a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift;
+        auto& a_reg = zmm[a_regs[i + (j % 2) * 3]];
+        const Scalar* a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift;
         a_load<nelems>(a_reg, a_addr);
 
         a_loads<j, endX, i + 1, endY, nelems>(ao);
@@ -368,8 +370,8 @@ class gemm_class {
   }
 
   template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar *co1,
-                                                                                         const Scalar *co2) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar* co1,
+                                                                                         const Scalar* co2) {
     EIGEN_UNUSED_VARIABLE(co1);
     EIGEN_UNUSED_VARIABLE(co2);
   }
@@ -389,13 +391,13 @@ class gemm_class {
    */
 
   template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar *&co1, Scalar *&co2) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar*& co1, Scalar*& co2) {
     if (un < max_b_unroll) {
       if (b_unroll >= un + 1) {
         if (un == 4 && i == 0) co2 = co1 + 4 * ldc;
 
         if (i < um_vecs) {
-          Scalar *co = (un + 1 <= 4) ? co1 : co2;
+          Scalar* co = (un + 1 <= 4) ? co1 : co2;
           auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
           prefetch_c(co + co_off);
 
@@ -412,16 +414,16 @@ class gemm_class {
 
   // load_c
   template <int i, int um_vecs, int idx, int nelems>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar* cox, vec& alpha_reg) {
     EIGEN_UNUSED_VARIABLE(cox);
     EIGEN_UNUSED_VARIABLE(alpha_reg);
   }
 
   template <int i, int um_vecs, int idx, int nelems>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar* cox, vec& alpha_reg) {
     if (i < um_vecs) {
-      auto &c_reg = zmm[c_regs[i + idx * 3]];
-      auto &c_load_reg = zmm[c_load_regs[i % 3]];
+      auto& c_reg = zmm[c_regs[i + idx * 3]];
+      auto& c_load_reg = zmm[c_load_regs[i % 3]];
       auto c_mem = cox;
       if (is_unit_inc)
         c_mem += i * nelems_in_cache_line;
@@ -441,14 +443,14 @@ class gemm_class {
 
   // store_c
   template <int i, int um_vecs, int idx, int nelems>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar *cox) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar* cox) {
     EIGEN_UNUSED_VARIABLE(cox);
   }
 
   template <int i, int um_vecs, int idx, int nelems>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar *cox) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar* cox) {
     if (i < um_vecs) {
-      auto &c_reg = zmm[c_regs[i + idx * 3]];
+      auto& c_reg = zmm[c_regs[i + idx * 3]];
       auto c_mem = cox;
       if (is_unit_inc)
         c_mem += i * nelems_in_cache_line;
@@ -493,20 +495,20 @@ class gemm_class {
    */
 
   template <int pow, int a_unroll, int idx>
-  EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
+  EIGEN_ALWAYS_INLINE void c_update_1count(Scalar*& cox) {
     if (pow >= 4) cox += ldc;
 
     const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
-    auto &alpha_reg = zmm[alpha_load_reg];
+    auto& alpha_reg = zmm[alpha_load_reg];
 
     scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
     write_c<0, um_vecs, idx, a_unroll>(cox);
   }
 
   template <int pow, int a_unroll>
-  EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar *&co1, Scalar *&co2) {
+  EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar*& co1, Scalar*& co2) {
     constexpr int idx = pow / 2;
-    Scalar *&cox = idx == 0 ? co1 : co2;
+    Scalar*& cox = idx == 0 ? co1 : co2;
 
     constexpr int max_count = (pow + 1) / 2;
     static_assert(max_count <= 4, "Unsupported max_count.");
@@ -518,8 +520,8 @@ class gemm_class {
   }
 
   template <int max_b_unroll, int a_unroll, int b_unroll>
-  EIGEN_ALWAYS_INLINE void c_update(Scalar *&co1, Scalar *&co2) {
-    auto &alpha_reg = zmm[alpha_load_reg];
+  EIGEN_ALWAYS_INLINE void c_update(Scalar*& co1, Scalar*& co2) {
+    auto& alpha_reg = zmm[alpha_load_reg];
 
     co2 = co1 + ldc;
     if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
@@ -540,8 +542,8 @@ class gemm_class {
 
   // compute
   template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
-                                                               int &fetchB_idx, vec &b_reg) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar* ao, const Scalar* bo, int& fetchA_idx,
+                                                               int& fetchB_idx, vec& b_reg) {
     EIGEN_UNUSED_VARIABLE(ao);
     EIGEN_UNUSED_VARIABLE(bo);
     EIGEN_UNUSED_VARIABLE(fetchA_idx);
@@ -550,11 +552,11 @@ class gemm_class {
   }
 
   template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
-                                                                int &fetchB_idx, vec &b_reg) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar* ao, const Scalar* bo, int& fetchA_idx,
+                                                                int& fetchB_idx, vec& b_reg) {
     if (um < um_vecs) {
-      auto &c_reg = zmm[c_regs[um + idx * 3]];
-      auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
+      auto& c_reg = zmm[c_regs[um + idx * 3]];
+      auto& a_reg = zmm[a_regs[um + (uk % 2) * 3]];
 
       vfmadd(c_reg, a_reg, b_reg);
 
@@ -576,25 +578,25 @@ class gemm_class {
 
   // load_a
   template <int um, int um_vecs, int uk, int nelems, bool ktail>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar *ao) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar* ao) {
     EIGEN_UNUSED_VARIABLE(ao);
   }
 
   template <int um, int um_vecs, int uk, int nelems, bool ktail>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar *ao) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar* ao) {
     if (um < um_vecs) {
-      auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
-      const Scalar *a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift;
+      auto& a_reg = zmm[a_regs[um + (uk % 2) * 3]];
+      const Scalar* a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift;
       a_load<nelems>(a_reg, a_addr);
 
       load_a<um + 1, um_vecs, uk, nelems, ktail>(ao);
     }
   }
   template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
-                                                                                 const Scalar *const &ao,
-                                                                                 const Scalar *const &bo, Scalar *&co2,
-                                                                                 int &fetchA_idx, int &fetchB_idx) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar*& aa,
+                                                                                 const Scalar* const& ao,
+                                                                                 const Scalar* const& bo, Scalar*& co2,
+                                                                                 int& fetchA_idx, int& fetchB_idx) {
     EIGEN_UNUSED_VARIABLE(aa);
     EIGEN_UNUSED_VARIABLE(ao);
     EIGEN_UNUSED_VARIABLE(bo);
@@ -604,14 +606,14 @@ class gemm_class {
   }
 
   template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
-  EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
-                                                                                  const Scalar *const &ao,
-                                                                                  const Scalar *const &bo, Scalar *&co2,
-                                                                                  int &fetchA_idx, int &fetchB_idx) {
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar*& aa,
+                                                                                  const Scalar* const& ao,
+                                                                                  const Scalar* const& bo, Scalar*& co2,
+                                                                                  int& fetchA_idx, int& fetchB_idx) {
     const int idx = (pow / 2) + count;
 
     if (count < (pow + 1) / 2) {
-      auto &b_reg = zmm[b_regs[idx % 2]];
+      auto& b_reg = zmm[b_regs[idx % 2]];
 
       if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
       if (fetch_x && uk == 3 && idx == 4) aa += 8;
@@ -619,7 +621,7 @@ class gemm_class {
       if (b_unroll >= pow) {
         compute<0, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
 
-        const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift;
+        const Scalar* b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift;
         b_load(b_reg, b_addr);
       }
 
@@ -641,8 +643,8 @@ class gemm_class {
 
   template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch,
             bool no_a_preload = false>
-  EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
-                                           Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
+  EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar*& aa, const Scalar* const& ao, const Scalar* const& bo,
+                                           Scalar*& co2, int& fetchA_idx, int& fetchB_idx) {
     const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
 
     if (max_b_unroll >= 1)
@@ -699,7 +701,7 @@ class gemm_class {
 
   template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch,
             bool no_a_preload = false>
-  EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
+  EIGEN_ALWAYS_INLINE void innerkernel(const Scalar*& aa, const Scalar*& ao, const Scalar*& bo, Scalar*& co2) {
     int fetchA_idx = 0;
     int fetchB_idx = 0;
 
@@ -729,7 +731,7 @@ class gemm_class {
   }
 
   template <int a_unroll, int b_unroll, int max_b_unroll>
-  EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+  EIGEN_ALWAYS_INLINE void kloop(const Scalar*& aa, const Scalar*& ao, const Scalar*& bo, Scalar*& co1, Scalar*& co2) {
     const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
     if (!use_less_a_regs && k > 1)
       a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
@@ -793,7 +795,7 @@ class gemm_class {
   }
 
   template <int a_unroll, int b_unroll, int max_b_unroll>
-  EIGEN_ALWAYS_INLINE void nloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+  EIGEN_ALWAYS_INLINE void nloop(const Scalar*& aa, const Scalar*& ao, const Scalar*& bo, Scalar*& co1, Scalar*& co2) {
     // Set A matrix pointer.
     ao = a + a_off * a_unroll;
 
@@ -810,9 +812,9 @@ class gemm_class {
   }
 
   template <int a_unroll, int max_a_unroll, int max_b_unroll>
-  EIGEN_ALWAYS_INLINE void mloop(const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+  EIGEN_ALWAYS_INLINE void mloop(const Scalar*& ao, const Scalar*& bo, Scalar*& co1, Scalar*& co2) {
     // Set prefetch A pointers.
-    const Scalar *aa = a + a_unroll * a_stride;
+    const Scalar* aa = a + a_unroll * a_stride;
 
     // Set C matrix pointers.
     co1 = c;
@@ -830,10 +832,6 @@ class gemm_class {
 
     // n-remainders.
     if (n & 4 && max_b_unroll > 4) nloop<a_unroll, 4, max_b_unroll>(aa, ao, bo, co1, co2);
-#if 0
-        if (n & 2 && max_b_unroll > 2) nloop<a_unroll, 2, max_b_unroll>(aa, ao, bo, co1, co2);
-        if (n & 1 && max_b_unroll > 1) nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
-#else
     // Copy kernels don't support tails of n = 2 for single/double precision.
     // Loop over ones.
     int n_rem = 2 * ((n & 2) != 0) + 1 * ((n & 1) != 0);
@@ -841,7 +839,6 @@ class gemm_class {
       nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
       n_rem--;
     }
-#endif
 
     // Advance A matrix pointer.
     a = ao + a_unroll * (a_stride - k - a_off);
@@ -854,10 +851,10 @@ class gemm_class {
     a -= -a_shift;
     b -= -b_shift;
 
-    const Scalar *ao = nullptr;
-    const Scalar *bo = nullptr;
-    Scalar *co1 = nullptr;
-    Scalar *co2 = nullptr;
+    const Scalar* ao = nullptr;
+    const Scalar* bo = nullptr;
+    Scalar* co1 = nullptr;
+    Scalar* co2 = nullptr;
 
     // Main m-loop.
     for (; m >= max_a_unroll; m -= max_a_unroll) mloop<max_a_unroll, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
@@ -881,8 +878,8 @@ class gemm_class {
     }
   }
 
-  gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar *alpha_, const Scalar *a_,
-             const Scalar *b_, Scalar *c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_,
+  gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar* alpha_, const Scalar* a_,
+             const Scalar* b_, Scalar* c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_,
              Index a_off_, Index b_off_)
       : m(m_),
         n(n_),
@@ -927,6 +924,15 @@ class gemm_class {
   }
 };
 
+template <typename Scalar, bool is_unit_inc>  // Namespace-scope definitions (no initializer) of gemm_class's register-index arrays.
+const int gemm_class<Scalar, is_unit_inc>::a_regs[];  // NOTE(review): presumably gives storage to an in-class-initialized static member for pre-C++17 ODR-use — confirm against the class body.
+
+template <typename Scalar, bool is_unit_inc>
+const int gemm_class<Scalar, is_unit_inc>::b_regs[];  // Same pattern as a_regs.
+
+template <typename Scalar, bool is_unit_inc>
+const int gemm_class<Scalar, is_unit_inc>::c_regs[];  // Same pattern as a_regs.
+
+
 // Compute kernel with max unroll support of:
 //   Single precision:
 //     max_a_unroll: 48, 32, 16, 8, 4, 2, 1
@@ -935,8 +941,8 @@ class gemm_class {
 //     max_a_unroll: 24, 16, 8, 4, 2, 1
 //     max_b_unroll: 8, 4, 2, 1
 template <typename Scalar, int max_a_unroll, int max_b_unroll, bool is_alpha1, bool is_beta0, bool is_unit_inc>
-EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar *alpha, const Scalar *a, const Scalar *b,
-                                        Scalar *c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1,
+EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar* alpha, const Scalar* a, const Scalar* b,
+                                        Scalar* c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1,
                                         Index a_off = 0, Index b_off = 0) {
   if (a_stride == -1) a_stride = k;
   if (b_stride == -1) b_stride = k;
@@ -947,7 +953,6 @@ EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar *alpha
 }
 
 // Template specializations of GEBP kernels with nr = 8.
-#if EIGEN_USE_AVX512_GEMM_KERNELS
 template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
 class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
     : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
@@ -971,13 +976,13 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMod
   typedef typename packet_traits<Scalar>::type Packet;
   typedef typename DataMapper::LinearMapper LinearMapper;
   enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const;
 };
 
 template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode>::operator()(
-    Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset) {
+    Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) const {
   constexpr int nr = 8;
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1001,10 +1006,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Con
       const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
       const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
       Index k = 0;
-      if ((PacketSize % 8) == 0)  // TODO enable vectorized transposition for PacketSize==4
-      {
+      EIGEN_IF_CONSTEXPR((PacketSize % 8) == 0 || PacketSize == 4) {
         for (; k < peeled_k; k += PacketSize) {
-          PacketBlock<Packet, (PacketSize % 8) == 0 ? 8 : PacketSize> kernel;
+          PacketBlock<Packet, 8> kernel;
 
           kernel.packet[0] = dm0.template loadPacket<Packet>(k);
           kernel.packet[1] = dm1.template loadPacket<Packet>(k);
@@ -1015,16 +1019,43 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Con
           kernel.packet[6] = dm6.template loadPacket<Packet>(k);
           kernel.packet[7] = dm7.template loadPacket<Packet>(k);
 
-          ptranspose(kernel);
+          EIGEN_IF_CONSTEXPR(PacketSize == 4) {
+            // For PacketSize==4 we cannot ptranspose 8 packets directly; compose two
+            // 4-packet transposes (cols 0-3 and 4-7) and interleave the halves so
+            // the 8 stores produce 4 rows of 8 packed elements.
+            PacketBlock<Packet, 4> tmp_lo;
+            tmp_lo.packet[0] = kernel.packet[0];
+            tmp_lo.packet[1] = kernel.packet[1];
+            tmp_lo.packet[2] = kernel.packet[2];
+            tmp_lo.packet[3] = kernel.packet[3];
+            ptranspose(tmp_lo);
+            PacketBlock<Packet, 4> tmp_hi;
+            tmp_hi.packet[0] = kernel.packet[4];
+            tmp_hi.packet[1] = kernel.packet[5];
+            tmp_hi.packet[2] = kernel.packet[6];
+            tmp_hi.packet[3] = kernel.packet[7];
+            ptranspose(tmp_hi);
+            kernel.packet[0] = tmp_lo.packet[0];
+            kernel.packet[1] = tmp_hi.packet[0];
+            kernel.packet[2] = tmp_lo.packet[1];
+            kernel.packet[3] = tmp_hi.packet[1];
+            kernel.packet[4] = tmp_lo.packet[2];
+            kernel.packet[5] = tmp_hi.packet[2];
+            kernel.packet[6] = tmp_lo.packet[3];
+            kernel.packet[7] = tmp_hi.packet[3];
+          }
+          else {
+            ptranspose(kernel);
+          }
 
           pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
-          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
-          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
-          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
-          pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel.packet[4 % PacketSize]));
-          pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel.packet[5 % PacketSize]));
-          pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel.packet[6 % PacketSize]));
-          pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel.packet[7 % PacketSize]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3]));
+          pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel.packet[4]));
+          pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel.packet[5]));
+          pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel.packet[6]));
+          pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel.packet[7]));
           count += 8 * PacketSize;
         }
       }
@@ -1054,19 +1085,35 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Con
       const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
 
       Index k = 0;
-      if ((PacketSize % 4) == 0)  // TODO enable vectorized transposition for PacketSize==2 ??
-      {
+      EIGEN_IF_CONSTEXPR((PacketSize % 4) == 0 || PacketSize == 2) {
         for (; k < peeled_k; k += PacketSize) {
-          PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
+          PacketBlock<Packet, 4> kernel;
           kernel.packet[0] = dm0.template loadPacket<Packet>(k);
-          kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
-          kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
-          kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
-          ptranspose(kernel);
+          kernel.packet[1] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3] = dm3.template loadPacket<Packet>(k);
+          EIGEN_IF_CONSTEXPR(PacketSize == 2) {
+            // See the matching note in GeneralBlockPanelKernel.h.
+            PacketBlock<Packet, 2> tmp01;
+            tmp01.packet[0] = kernel.packet[0];
+            tmp01.packet[1] = kernel.packet[1];
+            ptranspose(tmp01);
+            PacketBlock<Packet, 2> tmp23;
+            tmp23.packet[0] = kernel.packet[2];
+            tmp23.packet[1] = kernel.packet[3];
+            ptranspose(tmp23);
+            kernel.packet[0] = tmp01.packet[0];
+            kernel.packet[1] = tmp23.packet[0];
+            kernel.packet[2] = tmp01.packet[1];
+            kernel.packet[3] = tmp23.packet[1];
+          }
+          else {
+            ptranspose(kernel);
+          }
           pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
-          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
-          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
-          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3]));
           count += 4 * PacketSize;
         }
       }
@@ -1105,8 +1152,8 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, RowMajor, Conjugate, PanelMod
     HalfPacketSize = unpacket_traits<HalfPacket>::size,
     QuarterPacketSize = unpacket_traits<QuarterPacket>::size
   };
-  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) {
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
     constexpr int nr = 8;
     EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
     EIGEN_UNUSED_VARIABLE(stride);
@@ -1204,33 +1251,32 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, RowMajor, Conjugate, PanelMod
 
 template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs> {
-  EIGEN_ALWAYS_INLINE void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows,
+  EIGEN_ALWAYS_INLINE void operator()(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows,  // Declaration only; the body is defined out of line after this struct.
                                       Index depth, Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1,
-                                      Index offsetA = 0, Index offsetB = 0);
+                                      Index offsetA = 0, Index offsetB = 0) const;  // Now const-qualified; the out-of-line definition must carry the same qualifier.
 };
 
 template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_ALWAYS_INLINE void gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs>::operator()(
-    const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols,
-    Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+    const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
+    Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) const {
   if (res.incr() == 1) {
     if (alpha == 1) {
-      gemm_kern_avx512<Scalar, mr, 8, true, false, true>(rows, cols, depth, &alpha, blockA, blockB,
-                                                         (Scalar *)res.data(), res.stride(), res.incr(), strideA,
-                                                         strideB, offsetA, offsetB);
+      gemm_kern_avx512<Scalar, mr, 8, true, false, true>(rows, cols, depth, &alpha, blockA, blockB, (Scalar*)res.data(),
+                                                         res.stride(), res.incr(), strideA, strideB, offsetA, offsetB);
     } else {
       gemm_kern_avx512<Scalar, mr, 8, false, false, true>(rows, cols, depth, &alpha, blockA, blockB,
-                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                          (Scalar*)res.data(), res.stride(), res.incr(), strideA,
                                                           strideB, offsetA, offsetB);
     }
   } else {
     if (alpha == 1) {
       gemm_kern_avx512<Scalar, mr, 8, true, false, false>(rows, cols, depth, &alpha, blockA, blockB,
-                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                          (Scalar*)res.data(), res.stride(), res.incr(), strideA,
                                                           strideB, offsetA, offsetB);
     } else {
       gemm_kern_avx512<Scalar, mr, 8, false, false, false>(rows, cols, depth, &alpha, blockA, blockB,
-                                                           (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                           (Scalar*)res.data(), res.stride(), res.incr(), strideA,
                                                            strideB, offsetA, offsetB);
     }
   }
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 04499a0c2a2..24990194377 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -106,32 +106,10 @@ EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
 }
 #endif
 
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp2)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, preciprocal)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
-BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_BF16(Packet16f, Packet16bf)
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp2)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, preciprocal)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
-F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_F16(Packet16f, Packet16h)
 #endif  // EIGEN_VECTORIZE_AVX512FP16
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 932b0568d51..7e997984290 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -44,6 +44,54 @@ typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
 typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
 typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
 
+EIGEN_STRONG_INLINE Packet16i _eigen_mm512_loadu_epi32(const int* from) {  // Unaligned 512-bit integer load; picks the intrinsic spelling by compiler version.
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010)
+  return _mm512_loadu_si512(reinterpret_cast<const void*>(from));  // NOTE(review): if EIGEN_COMP_CLANG is 0 on non-clang compilers, the first disjunct always holds there and this branch is always taken under EIGEN_COMP_GNUC — confirm the intended version gate.
+#else
+  return _mm512_loadu_epi32(from);  // Typed epi32 spelling, used only when the check above is false.
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet16i _eigen_mm512_loadu_epi64(const int64_t* from) {  // Unaligned 512-bit load from int64_t storage; note the declared return type is Packet16i.
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010)
+  return _mm512_loadu_si512(reinterpret_cast<const void*>(from));  // Untyped si512 load used when the version check holds.
+#else
+  return _mm512_loadu_epi64(from);  // Typed epi64 spelling otherwise.
+#endif
+}
+
+EIGEN_STRONG_INLINE void _eigen_mm512_storeu_epi32(void* to, const Packet16i& from) {  // Unaligned 512-bit integer store to `to`.
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010)
+  _mm512_storeu_si512(to, from);  // Untyped si512 store used when the version check holds.
+#else
+  _mm512_storeu_epi32(to, from);  // Typed epi32 spelling otherwise.
+#endif
+}
+
+EIGEN_STRONG_INLINE void _eigen_mm512_storeu_epi64(void* to, const Packet16i& from) {  // Unaligned 512-bit integer store; the parameter type is Packet16i even for the epi64 variant.
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010)
+  _mm512_storeu_si512(to, from);  // Untyped si512 store used when the version check holds.
+#else
+  _mm512_storeu_epi64(to, from);  // Typed epi64 spelling otherwise.
+#endif
+}
+
+EIGEN_STRONG_INLINE void _eigen_mm256_storeu_epi32(void* to, const __m256i& from) {  // Unaligned 256-bit integer store to `to`.
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010)
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);  // si256 store takes a __m256i*, hence the cast from void*.
+#else
+  _mm256_storeu_epi32(to, from);  // Typed epi32 spelling otherwise.
+#endif
+}
+
+EIGEN_STRONG_INLINE void _eigen_mm_storeu_epi32(void* to, const __m128i& from) {  // Unaligned 128-bit integer store to `to`.
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010)
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);  // si128 store takes a __m128i*, hence the cast from void*.
+#else
+  _mm_storeu_epi32(to, from);  // Typed epi32 spelling otherwise.
+#endif
+}
+
 template <>
 struct is_arithmetic<__m512> {
   enum { value = true };
@@ -84,7 +132,6 @@ struct packet_traits<half> : default_packet_traits {
     HasDiv = 1,
     HasNegate = 1,
     HasAbs = 1,
-    HasAbs2 = 0,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
@@ -101,7 +148,6 @@ struct packet_traits<half> : default_packet_traits {
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasBlend = 0
   };
 };
 #endif
@@ -119,22 +165,28 @@ struct packet_traits<float> : default_packet_traits {
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
-    HasBlend = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
     HasATanh = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasCbrt = 1,
     HasLog = 1,
+    HasLog10 = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
     HasNdtri = 1,
     HasBessel = 1,
     HasExp = 1,
+    HasPow = 1,
     HasReciprocal = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
@@ -151,14 +203,22 @@ struct packet_traits<double> : default_packet_traits {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
-    HasBlend = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasCbrt = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
     HasLog = 1,
+    HasLog10 = 1,
     HasExp = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasPow = 1,
     HasATan = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
@@ -173,7 +233,7 @@ template <>
 struct packet_traits<int> : default_packet_traits {
   typedef Packet16i type;
   typedef Packet8i half;
-  enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, HasDiv = 1, size = 16 };
 };
 
 template <>
@@ -441,15 +501,15 @@ EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
   //       The intel docs give it a relatively high latency as well, so we're probably
   //       better off with using _mm512_set_epi32 directly anyways.
   const __m512i mask =
-      _mm512_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
-                       0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
+      _mm512_set_epi32(SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32,
+                       SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32,
+                       SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32);
   return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
-  const __m512i mask =
-      _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
-                       0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
+  const __m512i mask = _mm512_set_epi64(SIGN_MASK_I64, SIGN_MASK_I64, SIGN_MASK_I64, SIGN_MASK_I64, SIGN_MASK_I64,  // SIGN_MASK_I64 replaces the literal 0x8000000000000000ULL; presumably the same bit pattern — it is defined outside this hunk.
+                                        SIGN_MASK_I64, SIGN_MASK_I64, SIGN_MASK_I64);  // Same value in all eight 64-bit lanes; the XOR below toggles those bits in a.
   return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
 }
 template <>
@@ -768,22 +828,22 @@ EIGEN_STRONG_INLINE Packet8l pcmp_lt(const Packet8l& a, const Packet8l& b) {
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, int64_t(-1)));  // Lanes selected by mask become all-ones (int64_t(-1)); the rest keep the zero source.
 }
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, int64_t(-1)));  // Lanes selected by mask become all-ones (int64_t(-1)); the rest keep the zero source.
 }
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, int64_t(-1)));  // Lanes selected by mask become all-ones (int64_t(-1)); the rest keep the zero source.
 }
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, int64_t(-1)));  // All-ones where the _CMP_NGE_UQ (unordered "not >=") predicate holds, zero elsewhere.
 }
 }
 
 template <>
@@ -1031,11 +1091,11 @@ EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi32(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _eigen_mm512_loadu_epi32(from);  // Goes through the file-local wrapper instead of calling _mm512_loadu_epi32 directly.
 }
 template <>
 EIGEN_STRONG_INLINE Packet8l ploadu<Packet8l>(const int64_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi64(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _eigen_mm512_loadu_epi64(from);  // Wrapper is declared to return Packet16i; this relies on a conversion to Packet8l whose mechanism is not shown in this hunk.
 }
 
 template <>
@@ -1156,11 +1216,11 @@ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi32(to, from);
+  EIGEN_DEBUG_UNALIGNED_STORE _eigen_mm512_storeu_epi32(to, from);  // Goes through the file-local wrapper instead of calling _mm512_storeu_epi32 directly.
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet8l& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi64(to, from);
+  EIGEN_DEBUG_UNALIGNED_STORE _eigen_mm512_storeu_epi64(to, from);  // Wrapper takes const Packet16i&; passing a Packet8l relies on a conversion not shown in this hunk.
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
@@ -1357,12 +1417,12 @@ EIGEN_STRONG_INLINE Packet8l preverse(const Packet8l& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
-  // _mm512_abs_ps intrinsic not found, so hack around it
+  // _mm512_abs_ps intrinsic not found, so clear each lane's sign bit by ANDing with 0x7fffffff.
   return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
-  // _mm512_abs_ps intrinsic not found, so hack around it
+  // _mm512_abs_pd intrinsic not found, so clear each lane's sign bit by ANDing with 0x7fffffffffffffff.
   return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff)));
 }
 template <>
@@ -1495,7 +1555,7 @@ EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d&
 #endif
 
 template <>
-EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
+EIGEN_STRONG_INLINE Packet8f predux_half<Packet16f>(const Packet16f& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
   __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
@@ -1511,13 +1571,13 @@ EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
+EIGEN_STRONG_INLINE Packet4d predux_half<Packet8d>(const Packet8d& a) {  // Renamed from predux_half_dowto4; adds the two 256-bit halves of a element-wise.
   __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
   __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
   return _mm256_add_pd(lane0, lane1);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
+EIGEN_STRONG_INLINE Packet8i predux_half<Packet16i>(const Packet16i& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   __m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
   __m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
@@ -1534,7 +1594,7 @@ EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
+EIGEN_STRONG_INLINE Packet4l predux_half<Packet8l>(const Packet8l& a) {
   __m256i lane0 = _mm512_extracti64x4_epi64(a, 0);
   __m256i lane1 = _mm512_extracti64x4_epi64(a, 1);
   return _mm256_add_epi64(lane0, lane1);
@@ -2055,27 +2115,6 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
   PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1);
 }
 
-template <size_t N>
-EIGEN_STRONG_INLINE int avx512_blend_mask(const Selector<N>& ifPacket) {
-  alignas(__m128i) uint8_t aux[sizeof(__m128i)];
-  for (size_t i = 0; i < N; i++) aux[i] = static_cast<uint8_t>(ifPacket.select[i]);
-  __m128i paux = _mm_sub_epi8(_mm_setzero_si128(), _mm_load_si128(reinterpret_cast<const __m128i*>(aux)));
-  return _mm_movemask_epi8(paux);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket,
-                                     const Packet16f& elsePacket) {
-  __mmask16 m = avx512_blend_mask(ifPacket);
-  return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket,
-                                    const Packet8d& elsePacket) {
-  __mmask8 m = avx512_blend_mask(ifPacket);
-  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
-}
-
 // Packet math for Eigen::half
 #ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
@@ -2149,7 +2188,7 @@ EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
-  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<short>(0x8000u));
   return _mm256_andnot_si256(sign_mask, a);
 }
 
@@ -2246,7 +2285,7 @@ EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
-  Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  Packet16h sign_mask = _mm256_set1_epi16(static_cast<short>(0x8000u));
   return _mm256_xor_si256(a, sign_mask);
 }
 
@@ -2303,7 +2342,7 @@ EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+EIGEN_STRONG_INLINE Packet8h predux_half<Packet16h>(const Packet16h& a) {
   Packet8h lane0 = _mm256_extractf128_si256(a, 0);
   Packet8h lane1 = _mm256_extractf128_si256(a, 1);
   return padd<Packet8h>(lane0, lane1);
@@ -2530,7 +2569,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
-    HasBlend = 0,
     HasInsert = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
@@ -2738,7 +2776,7 @@ EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, const Packet1
 
 template <>
 EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) {
-  Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  Packet16bf sign_mask = _mm256_set1_epi16(static_cast<short>(0x8000u));
   return _mm256_xor_si256(a, sign_mask);
 }
 
@@ -2749,7 +2787,7 @@ EIGEN_STRONG_INLINE Packet16bf pconj(const Packet16bf& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
-  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<short>(0x8000u));
   return _mm256_andnot_si256(sign_mask, a);
 }
 
@@ -2809,7 +2847,7 @@ EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
+EIGEN_STRONG_INLINE Packet8bf predux_half<Packet16bf>(const Packet16bf& a) {
   Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
   Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
   return padd<Packet8bf>(lane0, lane1);
@@ -3017,19 +3055,19 @@ EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out,
 template <>
 EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
   EIGEN_DEBUG_UNALIGNED_STORE
-  _mm512_storeu_epi32(out, x);
+  _eigen_mm512_storeu_epi32(out, x);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
   EIGEN_DEBUG_UNALIGNED_STORE
-  _mm256_storeu_epi32(out, x);
+  _eigen_mm256_storeu_epi32(out, x);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
   EIGEN_DEBUG_UNALIGNED_STORE
-  _mm_storeu_epi32(out, x);
+  _eigen_mm_storeu_epi32(out, x);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index a040bbeade6..551a94a5cc6 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -42,7 +42,6 @@ struct packet_traits<half> : default_packet_traits {
     HasDiv = 1,
     HasNegate = 1,
     HasAbs = 1,
-    HasAbs2 = 0,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
@@ -60,7 +59,6 @@ struct packet_traits<half> : default_packet_traits {
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = 0,  // EIGEN_FAST_MATH,
-    HasBlend = 0
   };
 };
 
@@ -737,9 +735,9 @@ EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
   return half(_mm_reduce_add_ph(a));
 }
 
-// predux_half_dowto4
+// predux_half
 template <>
-EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+EIGEN_STRONG_INLINE Packet16h predux_half<Packet32h>(const Packet32h& a) {
   const __m512i bits = _mm512_castph_si512(a);
   Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits));
   Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1));
@@ -747,7 +745,7 @@ EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a)
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+EIGEN_STRONG_INLINE Packet8h predux_half<Packet16h>(const Packet16h& a) {
   Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a)));
   Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1));
   return padd(lo, hi);
@@ -880,19 +878,17 @@ EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
-  return _mm512_castsi512_ph(
-      _mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<short>(0x8000u))));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) {
-  return _mm256_castsi256_ph(
-      _mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+  return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<short>(0x8000u))));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) {
-  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<short>(0x8000u))));
 }
 
 // pconj
diff --git a/Eigen/src/Core/arch/AVX512/Reductions.h b/Eigen/src/Core/arch/AVX512/Reductions.h
index f7b4c25a10a..f59b78e273f 100644
--- a/Eigen/src/Core/arch/AVX512/Reductions.h
+++ b/Eigen/src/Core/arch/AVX512/Reductions.h
@@ -55,7 +55,7 @@ EIGEN_STRONG_INLINE int64_t predux(const Packet8l& a) {
 // MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
 //    alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
 //    int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
-// produces garbage: 4294967295.  It seems to happen whenever the output is supposed to be negative.
+// produces garbage: 4294967295.  This occurs when the result should be negative.
 // Fall back to a manual approach:
 template <>
 EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) {
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index c763b5fe324..fffedb67ed6 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -44,6 +44,8 @@
 namespace Eigen {
 namespace internal {
 
+#if (EIGEN_USE_AVX512_TRSM_KERNELS)
+
 #define EIGEN_AVX_MAX_NUM_ACC (int64_t(24))
 #define EIGEN_AVX_MAX_NUM_ROW (int64_t(8))  // Denoted L in code.
 #define EIGEN_AVX_MAX_K_UNROL (int64_t(4))
@@ -58,7 +60,8 @@ typedef Packet4d vecHalfDouble;
 // Note: this depends on macros and typedefs above.
 #include "TrsmUnrolls.inc"
 
-#if (EIGEN_USE_AVX512_TRSM_KERNELS) && (EIGEN_COMP_CLANG != 0)
+#if (EIGEN_COMP_CLANG != 0)
+
 /**
  * For smaller problem sizes, and certain compilers, using the optimized kernels trsmKernelL/R directly
  * is faster than the packed versions in TriangularSolverMatrix.h.
@@ -118,7 +121,7 @@ int64_t avx512_trsm_cutoff(int64_t L2Size, int64_t N, double L2Cap) {
  * Used by gemmKernel for the case A/B row-major and C col-major.
  */
 template <typename Scalar, typename vec, int64_t unrollM, int64_t unrollN, bool remM, bool remN>
-EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, Scalar *C_arr,
+EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS>& zmm, Scalar* C_arr,
                                      int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
   EIGEN_UNUSED_VARIABLE(remN_);
   EIGEN_UNUSED_VARIABLE(remM_);
@@ -218,13 +221,13 @@ EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_
  * handleKRem: Handle arbitrary K? This is not needed for trsm.
  */
 template <typename Scalar, bool isARowMajor, bool isCRowMajor, bool isAdd, bool handleKRem>
-void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB,
+void gemmKernel(Scalar* A_arr, Scalar* B_arr, Scalar* C_arr, int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB,
                 int64_t LDC) {
   using urolls = unrolls::gemm<Scalar, isAdd>;
   constexpr int64_t U3 = urolls::PacketSize * 3;
   constexpr int64_t U2 = urolls::PacketSize * 2;
   constexpr int64_t U1 = urolls::PacketSize * 1;
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  using vec = std::conditional_t<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>;
   int64_t N_ = (N / U3) * U3;
   int64_t M_ = (M / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
   int64_t K_ = (K / EIGEN_AVX_MAX_K_UNROL) * EIGEN_AVX_MAX_K_UNROL;
@@ -261,8 +264,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       }
     }
     if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<3, 4>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -291,8 +294,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 4;
     }
     if (M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<3, 2>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -321,8 +324,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 2;
     }
     if (M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<3, 1>(zmm);
       {
@@ -384,8 +387,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       }
     }
     if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<2, 4>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -414,8 +417,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 4;
     }
     if (M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<2, 2>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -444,8 +447,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 2;
     }
     if (M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<2, 1>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -505,8 +508,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       }
     }
     if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<1, 4>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -535,8 +538,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 4;
     }
     if (M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<1, 2>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -565,8 +568,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 2;
     }
     if (M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<1, 1>(zmm);
       {
@@ -600,8 +603,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
     constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1;
     int64_t i = 0;
     for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -629,8 +632,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       }
     }
     if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<1, 4>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -659,8 +662,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 4;
     }
     if (M - i >= 2) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<1, 2>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -689,8 +692,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
       i += 2;
     }
     if (M - i > 0) {
-      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
-      Scalar *B_t = &B_arr[0 * LDB + j];
+      Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar* B_t = &B_arr[0 * LDB + j];
       PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
       urolls::template setzero<1, 1>(zmm);
       for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -729,7 +732,7 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
  * The B matrix (RHS) is assumed to be row-major
  */
 template <typename Scalar, typename vec, int64_t unrollM, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
-EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K, int64_t LDA, int64_t LDB) {
+EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar* A_arr, Scalar* B_arr, int64_t K, int64_t LDA, int64_t LDB) {
   static_assert(unrollM <= EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW");
   using urolls = unrolls::trsm<Scalar>;
   constexpr int64_t U3 = urolls::PacketSize * 3;
@@ -779,10 +782,10 @@ EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K,
  * The B matrix (RHS) is assumed to be row-major
  */
 template <typename Scalar, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
-void triSolveKernelLxK(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t K, int64_t LDA, int64_t LDB) {
+void triSolveKernelLxK(Scalar* A_arr, Scalar* B_arr, int64_t M, int64_t K, int64_t LDA, int64_t LDB) {
   // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
   // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller.
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  using vec = std::conditional_t<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>;
   if (M == 8)
     triSolveKernel<Scalar, vec, 8, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
   else if (M == 7)
@@ -810,11 +813,11 @@ void triSolveKernelLxK(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t K, int64
  *
  */
 template <typename Scalar, bool toTemp = true, bool remM = false>
-EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K, Scalar *B_temp, int64_t LDB_,
+EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar* B_arr, int64_t LDB, int64_t K, Scalar* B_temp, int64_t LDB_,
                                          int64_t remM_ = 0) {
   EIGEN_UNUSED_VARIABLE(remM_);
   using urolls = unrolls::transB<Scalar>;
-  using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
+  using vecHalf = std::conditional_t<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>;
   PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> ymm;
   constexpr int64_t U3 = urolls::PacketSize * 3;
   constexpr int64_t U2 = urolls::PacketSize * 2;
@@ -897,7 +900,7 @@ EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K,
  */
 template <typename Scalar, bool isARowMajor = true, bool isBRowMajor = true, bool isFWDSolve = true,
           bool isUnitDiag = false>
-void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t LDA, int64_t LDB) {
+void triSolve(Scalar* A_arr, Scalar* B_arr, int64_t M, int64_t numRHS, int64_t LDA, int64_t LDB) {
   constexpr int64_t psize = packet_traits<Scalar>::size;
   /**
    * The values for kB, numM were determined experimentally.
@@ -916,7 +919,7 @@ void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t L
   constexpr int64_t numM = 8 * EIGEN_AVX_MAX_NUM_ROW;
 
   int64_t sizeBTemp = 0;
-  Scalar *B_temp = NULL;
+  Scalar* B_temp = NULL;
   EIGEN_IF_CONSTEXPR(!isBRowMajor) {
     /**
      * If B is col-major, we copy it to a fixed-size temporary array of size at most ~numM*kB and
@@ -926,7 +929,7 @@ void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t L
     sizeBTemp = (((std::min(kB, numRHS) + psize - 1) / psize + 4) * psize) * numM;
   }
 
-  EIGEN_IF_CONSTEXPR(!isBRowMajor) B_temp = (Scalar *)handmade_aligned_malloc(sizeof(Scalar) * sizeBTemp, 64);
+  EIGEN_IF_CONSTEXPR(!isBRowMajor) B_temp = (Scalar*)handmade_aligned_malloc(sizeof(Scalar) * sizeBTemp, 64);
 
   for (int64_t k = 0; k < numRHS; k += kB) {
     int64_t bK = numRHS - k > kB ? kB : numRHS - k;
@@ -1060,7 +1063,6 @@ void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t L
 }
 
 // Template specializations of trsmKernelL/R for float/double and inner strides of 1.
-#if (EIGEN_USE_AVX512_TRSM_KERNELS)
 #if (EIGEN_USE_AVX512_TRSM_R_KERNELS)
 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
           bool Specialized>
@@ -1068,19 +1070,19 @@ struct trsmKernelR;
 
 template <typename Index, int Mode, int TriStorageOrder>
 struct trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, true> {
-  static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+  static void kernel(Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
                      Index otherStride);
 };
 
 template <typename Index, int Mode, int TriStorageOrder>
 struct trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, true> {
-  static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+  static void kernel(Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
                      Index otherStride);
 };
 
 template <typename Index, int Mode, int TriStorageOrder>
 EIGEN_DONT_INLINE void trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
-    Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+    Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
 #ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1091,12 +1093,12 @@ EIGEN_DONT_INLINE void trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1
   }
 #endif
   triSolve<float, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
-      const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
+      const_cast<float*>(_tri), _other, size, otherSize, triStride, otherStride);
 }
 
 template <typename Index, int Mode, int TriStorageOrder>
 EIGEN_DONT_INLINE void trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
-    Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+    Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
 #ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1107,7 +1109,7 @@ EIGEN_DONT_INLINE void trsmKernelR<double, Index, Mode, false, TriStorageOrder,
   }
 #endif
   triSolve<double, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
-      const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
+      const_cast<double*>(_tri), _other, size, otherSize, triStride, otherStride);
 }
 #endif  // (EIGEN_USE_AVX512_TRSM_R_KERNELS)
 
@@ -1119,19 +1121,19 @@ struct trsmKernelL;
 
 template <typename Index, int Mode, int TriStorageOrder>
 struct trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, true> {
-  static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+  static void kernel(Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
                      Index otherStride);
 };
 
 template <typename Index, int Mode, int TriStorageOrder>
 struct trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, true> {
-  static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+  static void kernel(Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
                      Index otherStride);
 };
 
 template <typename Index, int Mode, int TriStorageOrder>
 EIGEN_DONT_INLINE void trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
-    Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+    Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
 #ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1142,12 +1144,12 @@ EIGEN_DONT_INLINE void trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1
   }
 #endif
   triSolve<float, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
-      const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
+      const_cast<float*>(_tri), _other, size, otherSize, triStride, otherStride);
 }
 
 template <typename Index, int Mode, int TriStorageOrder>
 EIGEN_DONT_INLINE void trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
-    Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+    Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
     Index otherStride) {
   EIGEN_UNUSED_VARIABLE(otherIncr);
 #ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1158,10 +1160,12 @@ EIGEN_DONT_INLINE void trsmKernelL<double, Index, Mode, false, TriStorageOrder,
   }
 #endif
   triSolve<double, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
-      const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
+      const_cast<double*>(_tri), _other, size, otherSize, triStride, otherStride);
 }
 #endif  // EIGEN_USE_AVX512_TRSM_L_KERNELS
+
 #endif  // EIGEN_USE_AVX512_TRSM_KERNELS
+
 }  // namespace internal
 }  // namespace Eigen
 #endif  // EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
diff --git a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
index 3a5f68eba2f..3ca8b2d24ec 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
+++ b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
@@ -129,8 +129,8 @@ EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet8d, 8> &kernel) {
 template <typename Scalar>
 class trans {
  public:
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
-  using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
+  using vec = std::conditional_t<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>;
+  using vecHalf = std::conditional_t<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
@@ -281,8 +281,8 @@ class trans {
 template <typename Scalar>
 class transB {
  public:
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
-  using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
+  using vec = std::conditional_t<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>;
+  using vecHalf = std::conditional_t<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
@@ -585,7 +585,7 @@ class transB {
 template <typename Scalar>
 class trsm {
  public:
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  using vec = std::conditional_t<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
@@ -864,7 +864,7 @@ class trsm {
 template <typename Scalar, bool isAdd>
 class gemm {
  public:
-  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  using vec = std::conditional_t<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>;
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index fc55fd86149..6c429099f3d 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -52,9 +52,17 @@ struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfl
 template <>
 struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
 
+EIGEN_STRONG_INLINE __mmask16 _eigen_mm512_cmpneq_ps_mask(__m512 a, __m512 b) {  // Mask of lanes where a != b.
+#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 810)
+  return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ);  // Fallback; presumably the shorthand is missing on these compilers.
+#else
+  return _mm512_cmpneq_ps_mask(a, b);
+#endif
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16b pcast<Packet16f, Packet16b>(const Packet16f& a) {
-  __mmask16 mask = _mm512_cmpneq_ps_mask(a, pzero(a));
+  __mmask16 mask = _eigen_mm512_cmpneq_ps_mask(a, pzero(a));
   return _mm512_maskz_cvtepi32_epi8(mask, _mm512_set1_epi32(1));
 }
 
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index d6df59af6a8..12d73ed5e9c 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -109,9 +109,6 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
     HasSqrt = 1,
     HasLog = 1,
     HasExp = 1,
-#ifdef EIGEN_VECTORIZE_VSX
-    HasBlend = 1,
-#endif
     HasSetLinear = 0
   };
 };
@@ -364,31 +361,7 @@ EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
   return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
 }
 
-#ifdef EIGEN_VECTORIZE_VSX
-template <>
-EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
-                                     const Packet2cf& elsePacket) {
-  Packet2cf result;
-  result.v = reinterpret_cast<Packet4f>(
-      pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
-  return result;
-}
-#endif
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
-  return psqrt_complex<Packet2cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
-  return plog_complex<Packet2cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
-  return pexp_complex<Packet2cf>(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet2cf)
 
 //---------- double ----------
 #ifdef EIGEN_VECTORIZE_VSX
@@ -635,15 +608,7 @@ EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
   return Packet1cd(vec_and(eq, eq_swapped));
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
-  return psqrt_complex<Packet1cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
-  return plog_complex<Packet1cd>(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS_NO_EXP(Packet1cd)
 
 #endif  // __VSX__
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 94c5dd2737d..4815974b098 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -82,7 +82,6 @@ EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data
   }
 }
 
-// Defaults to float32, since Eigen still supports C++03 we can't use default template arguments
 template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
   if (NegativeAccumulate) {
@@ -491,7 +490,7 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
   const Packet pAlpha = pset1<Packet>(alpha);
   const Packet pMask = bmask<Packet>(remaining_rows);
 
-  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+  typedef std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
 
   Index col = 0;
 #ifdef GEMM_MULTIPLE_COLS
@@ -870,7 +869,7 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS
   const Scalar* blockA = (Scalar*)blockAc;
   const Scalar* blockB = (Scalar*)blockBc;
 
-  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+  typedef std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
 
   Index col = 0;
 #ifdef GEMM_MULTIPLE_COLS
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
index 90c0d39202f..b79be586b53 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
@@ -380,7 +380,8 @@ EIGEN_STRONG_INLINE void gemv_col(Index rows, Index cols, const LhsMapper& alhs,
   conj_helper<LhsPacket, RhsPacket, false, false> pcj;
 
   const Index lhsStride = lhs.stride();
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
@@ -1866,7 +1867,7 @@ EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock<Scala
   if (GEMV_GETN_COMPLEX(N) > iter1) {                      \
     if (GEMV_IS_COMPLEX_FLOAT) {                           \
       GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2);        \
-      EIGEN_UNUSED_VARIABLE(a##iter3)                      \
+      EIGEN_UNUSED_VARIABLE(a##iter3);                     \
     } else {                                               \
       GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1);   \
       GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1);   \
@@ -2058,7 +2059,8 @@ EIGEN_STRONG_INLINE void gemv_complex_col(Index rows, Index cols, const LhsMappe
   conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
 
   const Index lhsStride = lhs.stride();
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = PTraits::ResPacketSize,
@@ -2390,7 +2392,8 @@ EIGEN_STRONG_INLINE void gemv_row(Index rows, Index cols, const LhsMapper& alhs,
   const Index n2 = rows - 1;
 #endif
 
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
@@ -2733,7 +2736,8 @@ EIGEN_STRONG_INLINE void gemv_complex_row(Index rows, Index cols, const LhsMappe
   const Index n2 = rows - 1;
 #endif
 
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = PTraits::ResPacketSize,
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index d7bd9bee440..8d5a3515635 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -41,7 +41,7 @@ typedef __vector signed char Packet16c;
 typedef __vector unsigned char Packet16uc;
 typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
 
-// We don't want to write the same code all the time, but we need to reuse the constants
+// We want to avoid repeating the same code, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
 #define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
 
@@ -178,13 +178,18 @@ struct packet_traits<float> : default_packet_traits {
     HasAbs = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
     HasATanh = 1,
     HasLog = 1,
     HasExp = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
 #ifdef EIGEN_VECTORIZE_VSX
+    HasCmp = 1,
+    HasPow = 1,
     HasSqrt = 1,
     HasCbrt = 1,
 #if !EIGEN_COMP_CLANG
@@ -202,7 +207,6 @@ struct packet_traits<float> : default_packet_traits {
     HasErf = 0,
 #endif
     HasNegate = 1,
-    HasBlend = 1
   };
 };
 template <>
@@ -239,7 +243,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
     HasTanh = 0,
     HasErf = 0,
     HasNegate = 1,
-    HasBlend = 1
   };
 };
 
@@ -261,7 +264,6 @@ struct packet_traits<int> : default_packet_traits {
 #else
     HasDiv = 0,
 #endif
-    HasBlend = 1,
     HasCmp = 1
   };
 };
@@ -279,7 +281,6 @@ struct packet_traits<short int> : default_packet_traits {
     HasSub = 1,
     HasMul = 1,
     HasDiv = 0,
-    HasBlend = 1,
     HasCmp = 1
   };
 };
@@ -297,7 +298,6 @@ struct packet_traits<unsigned short int> : default_packet_traits {
     HasSub = 1,
     HasMul = 1,
     HasDiv = 0,
-    HasBlend = 1,
     HasCmp = 1
   };
 };
@@ -315,7 +315,6 @@ struct packet_traits<signed char> : default_packet_traits {
     HasSub = 1,
     HasMul = 1,
     HasDiv = 0,
-    HasBlend = 1,
     HasCmp = 1
   };
 };
@@ -333,7 +332,6 @@ struct packet_traits<unsigned char> : default_packet_traits {
     HasSub = 1,
     HasMul = 1,
     HasDiv = 0,
-    HasBlend = 1,
     HasCmp = 1
   };
 };
@@ -1165,7 +1163,7 @@ EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b
 #endif
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
+// This overload is required for integer packet types.
 template <>
 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
   return vec_madd(a, b, c);
@@ -3051,74 +3049,6 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
   kernel.packet[15] = vec_mergel(step3[7], step3[15]);
 }
 
-template <typename Packet>
-EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
-  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
-  Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
-                                    const Packet4i& elsePacket) {
-  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
-                                    const Packet4f& elsePacket) {
-  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
-                                    const Packet8s& elsePacket) {
-  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
-  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
-  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
-  return result;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
-                                     const Packet8us& elsePacket) {
-  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
-  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
-                                     const Packet8bf& elsePacket) {
-  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
-                                     const Packet16c& elsePacket) {
-  Packet16uc select = {ifPacket.select[0],  ifPacket.select[1],  ifPacket.select[2],  ifPacket.select[3],
-                       ifPacket.select[4],  ifPacket.select[5],  ifPacket.select[6],  ifPacket.select[7],
-                       ifPacket.select[8],  ifPacket.select[9],  ifPacket.select[10], ifPacket.select[11],
-                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
-
-  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
-                                      const Packet16uc& elsePacket) {
-  Packet16uc select = {ifPacket.select[0],  ifPacket.select[1],  ifPacket.select[2],  ifPacket.select[3],
-                       ifPacket.select[4],  ifPacket.select[5],  ifPacket.select[6],  ifPacket.select[7],
-                       ifPacket.select[8],  ifPacket.select[9],  ifPacket.select[10], ifPacket.select[11],
-                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
-
-  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
 //---------- double ----------
 #ifdef EIGEN_VECTORIZE_VSX
 typedef __vector double Packet2d;
@@ -3169,13 +3099,17 @@ struct packet_traits<double> : default_packet_traits {
     HasAbs = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
     HasATanh = 1,
     HasATan = 0,
-    HasLog = 0,
+    HasCmp = 1,
+    HasLog = 1,
     HasExp = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
     HasSqrt = 1,
     HasCbrt = 1,
 #if !EIGEN_COMP_CLANG
@@ -3184,7 +3118,6 @@ struct packet_traits<double> : default_packet_traits {
     HasRsqrt = 0,
 #endif
     HasNegate = 1,
-    HasBlend = 1
   };
 };
 
@@ -3341,7 +3274,7 @@ EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b
   return vec_div(a, b);
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
+// This overload is required for integer packet types.
 template <>
 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
   return vec_madd(a, b, c);
@@ -3710,14 +3643,6 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
   kernel.packet[1] = t1;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
-                                    const Packet2d& elsePacket) {
-  Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
-  Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
 #endif  // __VSX__
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AltiVec/TypeCasting.h b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
index 439339ee501..7be66421f4a 100644
--- a/Eigen/src/Core/arch/AltiVec/TypeCasting.h
+++ b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@gmail.com>
 // Copyright (C) 2023 Chip Kerchner (chip.kerchner@ibm.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index f2e55f34588..313c506e328 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -38,6 +38,45 @@ limitations under the License.
     return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));                                              \
   }
 
+#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_BF16(PACKET_F, PACKET_BF16) \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pcos)                      \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, psin)                      \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, psinh)                     \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pcosh)                     \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pasinh)                    \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pacosh)                    \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pexp)                      \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pexp2)                     \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pexpm1)                    \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, plog)                      \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, plog1p)                    \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, plog2)                     \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, plog10)                    \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, preciprocal)               \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, prsqrt)                    \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pcbrt)                     \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, psqrt)                     \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, ptanh)
+
+// BF16 wrappers for unsupported/SpecialFunctions.
+#define EIGEN_INSTANTIATE_SPECIAL_FUNCS_BF16(PACKET_F, PACKET_BF16) \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, perf)                 \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pndtri)
+
+#define EIGEN_INSTANTIATE_BESSEL_FUNCS_BF16(PACKET_F, PACKET_BF16) \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_i0)          \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_i0e)         \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_i1)          \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_i1e)         \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_j0)          \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_j1)          \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_k0)          \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_k0e)         \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_k1)          \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_k1e)         \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_y0)          \
+  BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, pbessel_y1)
+
 // Only use HIP GPU bf16 in kernels
 #if defined(EIGEN_HAS_HIP_BF16) && defined(EIGEN_GPU_COMPILE_PHASE)
 #define EIGEN_USE_HIP_BF16
@@ -176,6 +215,8 @@ struct numeric_limits_bfloat16_impl {
   static EIGEN_CONSTEXPR Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
 };
 
+// Redundant out-of-class definitions are required pre-C++17 but deprecated since.
+#if EIGEN_COMP_CXXVER < 17
 template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_specialized;
 template <typename T>
@@ -225,6 +266,7 @@ template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::traps;
 template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::tinyness_before;
+#endif
 }  // end namespace bfloat16_impl
 }  // end namespace Eigen
 
@@ -252,7 +294,7 @@ namespace bfloat16_impl {
 // of the functions, while the latter can only deal with one of them.
 #if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for bfloat16 floats
 
-#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 // We need to provide emulated *host-side* BF16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
@@ -622,6 +664,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
   return bfloat16(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { return bfloat16(::sqrtf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cbrt(const bfloat16& a) { return bfloat16(::cbrtf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::powf(float(a), float(b)));
 }
@@ -793,6 +836,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 nextafter(const bfloat16& from, c
   return numext::bit_cast<bfloat16>(from_bits);
 }
 
+// Specialize multiply-add to match packet operations and reduce conversions to/from float.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 madd<Eigen::bfloat16>(const Eigen::bfloat16& x,
+                                                                            const Eigen::bfloat16& y,
+                                                                            const Eigen::bfloat16& z) {
+  return Eigen::bfloat16(static_cast<float>(x) * static_cast<float>(y) + static_cast<float>(z));
+}
+
 }  // namespace numext
 }  // namespace Eigen
 
@@ -807,16 +858,8 @@ struct hash<Eigen::bfloat16> {
 }  // namespace std
 #endif
 
-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::bfloat16.
+// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
 // Note that the following are __device__ - only functions.
 #if defined(EIGEN_HIPCC)
 
diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h
index fd7923e16a2..137e7fb56fd 100644
--- a/Eigen/src/Core/arch/Default/ConjHelper.h
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -17,6 +17,9 @@
     EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const { \
       return padd(c, this->pmul(x, y));                                                                             \
     }                                                                                                               \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmsub(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const { \
+      return psub(this->pmul(x, y), c);                                                                             \
+    }                                                                                                               \
     EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const {                        \
       return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));                                               \
     }                                                                                                               \
@@ -27,6 +30,9 @@
     EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const { \
       return padd(c, this->pmul(x, y));                                                                             \
     }                                                                                                               \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmsub(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const { \
+      return psub(this->pmul(x, y), c);                                                                             \
+    }                                                                                                               \
     EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const {                        \
       return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));                                               \
     }                                                                                                               \
@@ -76,6 +82,11 @@ struct conj_helper {
     return this->pmul(x, y) + c;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmsub(const LhsType& x, const RhsType& y,
+                                                         const ResultType& c) const {
+    return this->pmul(x, y) - c;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const {
     return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y);
   }
@@ -104,6 +115,10 @@ struct conj_helper<Packet, Packet, ConjLhs, ConjRhs> {
     return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmsub(const Packet& x, const Packet& y, const Packet& c) const {
+    return Eigen::internal::pmsub(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
     return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y));
   }
@@ -116,6 +131,9 @@ struct conj_helper<Packet, Packet, true, true> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
     return Eigen::internal::pmadd(pconj(x), pconj(y), c);
   }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmsub(const Packet& x, const Packet& y, const Packet& c) const {
+    return Eigen::internal::pmsub(pconj(x), pconj(y), c);
+  }
   // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
     return pconj(Eigen::internal::pmul(x, y));
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathComplex.h b/Eigen/src/Core/arch/Default/GenericPacketMathComplex.h
new file mode 100644
index 00000000000..8ceaf967b61
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathComplex.h
@@ -0,0 +1,283 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_COMPLEX_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_COMPLEX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+//----------------------------------------------------------------------
+// Complex Arithmetic and Functions
+//----------------------------------------------------------------------
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+  typedef typename unpacket_traits<RealPacket>::type RealScalar;
+  // In the following we annotate the code for the case where the inputs
+  // are a pair of length-2 SIMD vectors representing a single pair of complex
+  // numbers x = a + i*b, y = c + i*d.
+  const RealPacket one = pset1<RealPacket>(RealScalar(1));
+  const RealPacket abs_y = pabs(y.v);
+  const RealPacket abs_y_flip = pcplxflip(Packet(abs_y)).v;
+
+  const RealPacket mask = pcmp_lt(abs_y, abs_y_flip);  // |c| < |d|
+  RealPacket y_scaled = pselect(mask, pdiv(abs_y, abs_y_flip), one);
+  y_scaled = por(y_scaled, pandnot(y.v, abs_y));    // copy signs in case |c| == |d|
+  RealPacket denom = pmul(y.v, y_scaled);
+  denom = padd(denom, pcplxflip(Packet(denom)).v);  // c * c' + d * d'
+  Packet num = pmul(x, pconj(Packet(y_scaled)));    // a * c' + b * d', -a * d + b * c
+  return Packet(pdiv(num.v, denom));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pmul_complex(const Packet& x, const Packet& y) {
+  // In the following we annotate the code for the case where the inputs
+  // are a pair of length-2 SIMD vectors representing a single pair of complex
+  // numbers x = a + i*b, y = c + i*d.
+  Packet x_re = pdupreal(x);                  // a, a
+  Packet x_im = pdupimag(x);                  // b, b
+  Packet tmp_re = Packet(pmul(x_re.v, y.v));  // a*c, a*d
+  Packet tmp_im = Packet(pmul(x_im.v, y.v));  // b*c, b*d
+  tmp_im = pcplxflip(pconj(tmp_im));          // -b*d, b*c
+  return padd(tmp_im, tmp_re);                // a*c - b*d, a*d + b*c
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  // Real part
+  RealPacket x_flip = pcplxflip(x).v;  // b, a
+  Packet x_norm = phypot_complex(x);   // sqrt(a^2 + b^2), sqrt(a^2 + b^2)
+  RealPacket xlogr = plog(x_norm.v);   // log(sqrt(a^2 + b^2)), log(sqrt(a^2 + b^2))
+
+  // Imag part
+  RealPacket ximg = patan2(x.v, x_flip);  // atan2(a, b), atan2(b, a)
+
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  RealPacket x_abs = pabs(x.v);
+  RealPacket is_x_pos_inf = pcmp_eq(x_abs, cst_pos_inf);
+  RealPacket is_y_pos_inf = pcplxflip(Packet(is_x_pos_inf)).v;
+  RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf);
+  RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr);
+
+  return Packet(pselect(peven_mask(xreal), xreal, ximg));  // log(sqrt(a^2 + b^2)), atan2(b, a)
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  const RealPacket even_mask = peven_mask(a.v);
+  const RealPacket odd_mask = pcplxflip(Packet(even_mask)).v;
+
+  // Let a = x + iy.
+  // exp(a) = exp(x) * cis(y), plus some special edge-case handling.
+
+  // exp(x):
+  RealPacket x = pand(a.v, even_mask);
+  x = por(x, pcplxflip(Packet(x)).v);
+  RealPacket expx = pexp(x);  // exp(x);
+
+  // cis(y):
+  RealPacket y = pand(odd_mask, a.v);
+  y = por(y, pcplxflip(Packet(y)).v);
+  RealPacket cisy = psincos_selector<RealPacket>(y);
+  cisy = pcplxflip(Packet(cisy)).v;  // cos(y) + i * sin(y)
+
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  const RealPacket cst_neg_inf = pset1<RealPacket>(-NumTraits<RealScalar>::infinity());
+
+  // If x is -inf, we know that cossin(y) is bounded,
+  //   so the result is (0, +/-0), where the sign of the imaginary part comes
+  //   from the sign of cossin(y).
+  RealPacket cisy_sign = por(pandnot(cisy, pabs(cisy)), pset1<RealPacket>(RealScalar(1)));
+  cisy = pselect(pcmp_eq(x, cst_neg_inf), cisy_sign, cisy);
+
+  // If x is inf, and cos(y) has unknown sign (y is inf or NaN), the result
+  // is (+/-inf, NaN), where the signs are undetermined (take the sign of y).
+  RealPacket y_sign = por(pandnot(y, pabs(y)), pset1<RealPacket>(RealScalar(1)));
+  cisy = pselect(pand(pcmp_eq(x, cst_pos_inf), pisnan(cisy)), pand(y_sign, even_mask), cisy);
+
+  // If exp(x) is +inf and y is finite, replace cisy with copysign(1, cisy) to
+  // prevent inf * 0 = NaN. The vectorized sincos may compute exact zero
+  // for near-zero values like cos(pi/2), and inf * +-1 = +-inf is correct.
+  // The y=0 case is handled separately below.
+  RealPacket cisy_sign_one = por(pand(cisy, pset1<RealPacket>(RealScalar(-0.0))), pset1<RealPacket>(RealScalar(1)));
+  RealPacket expx_inf_y_finite = pand(pcmp_eq(expx, cst_pos_inf), pcmp_lt(pabs(y), cst_pos_inf));
+  cisy = pselect(expx_inf_y_finite, cisy_sign_one, cisy);
+
+  Packet result = Packet(pmul(expx, cisy));
+
+  // If y is +/- 0, the input is real, so take the real result for consistency.
+  result = pselect(Packet(pcmp_eq(y, pzero(y))), Packet(por(pand(expx, even_mask), pand(y, odd_mask))), result);
+
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  // Computes the principal sqrt of the complex numbers in the input.
+  //
+  // For example, for packets containing 2 complex numbers stored in interleaved format
+  //    a = [a0, a1] = [x0, y0, x1, y1],
+  // where x0 = real(a0), y0 = imag(a0) etc., this function returns
+  //    b = [b0, b1] = [u0, v0, u1, v1],
+  // such that b0^2 = a0, b1^2 = a1.
+  //
+  // To derive the formula for the complex square roots, let's consider the equation for
+  // a single complex square root of the number x + i*y. We want to find real numbers
+  // u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*y.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = 0.5 * (y / u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
+  //
+  //  To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
+  //     l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2).
+
+  // In the following, without loss of generality, we have annotated the code, assuming
+  // that the input is a packet of 2 complex numbers.
+  //
+  // Step 1. Compute l = [l0, l0, l1, l1], where
+  //    l0 = sqrt(x0^2 + y0^2),  l1 = sqrt(x1^2 + y1^2)
+  // To avoid over- and underflow, we use the stable formula for each hypotenuse
+  //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
+  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
+
+  RealPacket a_abs = pabs(a.v);                        // [|x0|, |y0|, |x1|, |y1|]
+  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;  // [|y0|, |x0|, |y1|, |x1|]
+  RealPacket a_max = pmax(a_abs, a_abs_flip);
+  RealPacket a_min = pmin(a_abs, a_abs_flip);
+  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
+  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
+  RealPacket r = pdiv(a_min, a_max);
+  const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
+  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
+  // Set l to a_max if a_min is zero.
+  l = pselect(a_min_zero_mask, a_max, l);
+
+  // Step 2. Compute [rho0, *, rho1, *], where
+  // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 =  sqrt(0.5 * (l1 + |x1|))
+  // We don't care about the imaginary parts computed here. They will be overwritten later.
+  const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
+  Packet rho;
+  rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
+
+  // Step 3. Compute [rho0, eta0, rho1, eta1], where
+  // eta0 = (y0 / rho0) / 2, and eta1 = (y1 / rho1) / 2.
+  // Set eta = 0 if the input is 0 + i0.
+  RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
+  RealPacket real_mask = peven_mask(a.v);
+  Packet positive_real_result;
+  // Compute result for inputs with positive real part.
+  positive_real_result.v = pselect(real_mask, rho.v, eta);
+
+  // Step 4. Compute solution for inputs with negative real part:
+  //         [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
+  const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
+  RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
+  Packet negative_real_result;
+  // Notice that rho is positive, so taking its absolute value is a noop.
+  negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
+
+  // Step 5. Select solution branch based on the sign of the real parts.
+  Packet negative_real_mask;
+  negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
+  negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
+  Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
+
+  // Step 6. Handle special cases for infinities:
+  // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
+  // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
+  // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
+  // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  Packet is_inf;
+  is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
+  Packet is_real_inf;
+  is_real_inf.v = pand(is_inf.v, real_mask);
+  is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
+  // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
+  Packet real_inf_result;
+  real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
+  real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
+  // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
+  Packet is_imag_inf;
+  is_imag_inf.v = pandnot(is_inf.v, real_mask);
+  is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
+  Packet imag_inf_result;
+  imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
+  // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan
+  Packet result_is_nan = pisnan(result);
+  result = por(result_is_nan, result);
+
+  return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
+}
+
+// \internal \returns the norm of a complex number z = x + i*y, defined as sqrt(x^2 + y^2).
+// Implemented using the hypot(a,b) algorithm from https://doi.org/10.48550/arXiv.1904.09481
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  const RealPacket cst_zero_rp = pset1<RealPacket>(static_cast<RealScalar>(0.0));
+  const RealPacket cst_minus_one_rp = pset1<RealPacket>(static_cast<RealScalar>(-1.0));
+  const RealPacket cst_two_rp = pset1<RealPacket>(static_cast<RealScalar>(2.0));
+  const RealPacket evenmask = peven_mask(a.v);
+
+  RealPacket a_abs = pabs(a.v);
+  RealPacket a_flip = pcplxflip(Packet(a_abs)).v;       // |b|, |a|
+  RealPacket a_all = pselect(evenmask, a_abs, a_flip);  // |a|, |a|
+  RealPacket b_all = pselect(evenmask, a_flip, a_abs);  // |b|, |b|
+
+  RealPacket a2 = pmul(a.v, a.v);                    // |a^2, b^2|
+  RealPacket a2_flip = pcplxflip(Packet(a2)).v;      // |b^2, a^2|
+  RealPacket h = psqrt(padd(a2, a2_flip));           // |sqrt(a^2 + b^2), sqrt(a^2 + b^2)|
+  RealPacket h_sq = pmul(h, h);                      // |a^2 + b^2, a^2 + b^2|
+  RealPacket a_sq = pselect(evenmask, a2, a2_flip);  // |a^2, a^2|
+  RealPacket m_h_sq = pmul(h_sq, cst_minus_one_rp);
+  RealPacket m_a_sq = pmul(a_sq, cst_minus_one_rp);
+  RealPacket x = psub(psub(pmadd(h, h, m_h_sq), pmadd(b_all, b_all, psub(a_sq, h_sq))), pmadd(a_all, a_all, m_a_sq));
+  h = psub(h, pdiv(x, pmul(cst_two_rp, h)));  // |h - x/(2*h), h - x/(2*h)|
+
+  // handle zero-case
+  RealPacket iszero = pcmp_eq(por(a_abs, a_flip), cst_zero_rp);
+
+  h = pandnot(h, iszero);  // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
+  return Packet(h);        // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_COMPLEX_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathDoubleWord.h b/Eigen/src/Core/arch/Default/GenericPacketMathDoubleWord.h
new file mode 100644
index 00000000000..5cb677fea66
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathDoubleWord.h
@@ -0,0 +1,208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_DOUBLE_WORD_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_DOUBLE_WORD_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+// This function splits x into the nearest integer n and fractional part r,
+// such that x = n + r holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
+  n = pround(x);
+  r = psub(x, n);
+}
+
+// This function computes the sum {s_hi, s_lo}, such that x + y = s_hi + s_lo
+// holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+  s_hi = padd(x, y);
+  const Packet t = psub(s_hi, x);
+  s_lo = psub(y, t);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+// This function implements the extended precision product of
+// a pair of floating point numbers. Given {x, y}, it computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
+  p_hi = pmul(x, y);
+  p_lo = pmsub(x, y, p_hi);
+}
+
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  return pmsub(x, y, xy);
+}
+
+#else
+
+// This function implements the Veltkamp splitting. Given a floating point
+// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
+// exactly and that half of the significand of x fits in x_hi.
+// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
+  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
+  Packet rho = psub(x, gamma);
+  x_hi = padd(rho, gamma);
+  x_lo = psub(x, x_hi);
+}
+
+// This function implements Dekker's algorithm for products x * y.
+// Given floating point numbers {x, y} computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+}
+
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+  return p_lo;
+}
+
+#endif  // EIGEN_VECTORIZE_FMA
+
+// This function implements Dekker's algorithm for the addition
+// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {s_hi, s_lo} such that
+// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
+// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                  const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
+  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
+  Packet r_hi_1, r_lo_1;
+  fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
+  Packet r_hi_2, r_lo_2;
+  fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
+  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
+
+  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
+  const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
+  const Packet s = pselect(x_greater_mask, s1, s2);
+
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for double word numbers,
+// which assumes that |x_hi| >= |y_hi|.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                       const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x_hi, y_hi, r_hi, r_lo);
+  const Packet s = padd(padd(y_lo, r_lo), x_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for adding a floating point number x to
+// a double word number {y_hi, y_lo}, with the assumption
+// that |x| >= |y_hi|.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo,
+                                                       Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x, y_hi, r_hi, r_lo);
+  const Packet s = padd(y_lo, r_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This function implements the multiplication of a double word
+// number represented by {x_hi, x_lo} by a floating point number y.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * y = p_hi + p_lo holds with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bits
+// in the floating point type.
+// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+                                                   Packet& p_hi, Packet& p_lo) {
+  Packet c_hi, c_lo1;
+  twoprod(x_hi, y, c_hi, c_lo1);
+  const Packet c_lo2 = pmul(x_lo, y);
+  Packet t_hi, t_lo1;
+  fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
+  const Packet t_lo2 = padd(t_lo1, c_lo1);
+  fast_twosum(t_hi, t_lo2, p_hi, p_lo);
+}
+
+// This function implements the multiplication of two double word
+// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bits
+// in the floating point type.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                   const Packet& y_lo, Packet& p_hi, Packet& p_lo) {
+  Packet p_hi_hi, p_hi_lo;
+  twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
+  Packet p_lo_hi, p_lo_lo;
+  twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo);
+  fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
+}
+
+// This function implements the division of double word {x_hi, x_lo}
+// by float y. This is Algorithm 15 from "Tight and rigorous error bounds
+// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
+// 2017. https://hal.archives-ouvertes.fr/hal-01351529
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+                                                             Packet& z_hi, Packet& z_lo) {
+  const Packet t_hi = pdiv(x_hi, y);
+  Packet pi_hi, pi_lo;
+  twoprod(t_hi, y, pi_hi, pi_lo);
+  const Packet delta_hi = psub(x_hi, pi_hi);
+  const Packet delta_t = psub(delta_hi, pi_lo);
+  const Packet delta = padd(delta_t, x_lo);
+  const Packet t_lo = pdiv(delta, y);
+  fast_twosum(t_hi, t_lo, z_hi, z_lo);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_DOUBLE_WORD_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFrexpLdexp.h b/Eigen/src/Core/arch/Default/GenericPacketMathFrexpLdexp.h
new file mode 100644
index 00000000000..978818ecd20
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFrexpLdexp.h
@@ -0,0 +1,162 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FREXP_LDEXP_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_FREXP_LDEXP_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+// Creates a Scalar integer type with same bit-width.
+template <typename T>
+struct make_integer;
+template <>
+struct make_integer<float> {
+  typedef numext::int32_t type;
+};
+template <>
+struct make_integer<double> {
+  typedef numext::int64_t type;
+};
+template <>
+struct make_integer<half> {
+  typedef numext::int16_t type;
+};
+template <>
+struct make_integer<bfloat16> {
+  typedef numext::int16_t type;
+};
+
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
+  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
+}
+
+// Safely applies frexp, correctly handles denormals.
+// Assumes IEEE floating point format.
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
+
+  constexpr ScalarUI scalar_sign_mantissa_mask =
+      ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits);  // ~0x7f800000
+  const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
+  const Packet half = pset1<Packet>(Scalar(0.5));
+  const Packet zero = pzero(a);
+  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)());  // Minimum normal value, 2^-126
+
+  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
+  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
+  constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
+  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
+  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset));  // 2^24
+  const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
+  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
+
+  // Determine exponent offset: -126 if normal, -126-24 if denormal
+  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2));  // -126
+  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
+  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset));  // -24
+  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
+
+  // Determine exponent and mantissa from normalized_a.
+  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
+  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
+  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
+  const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1));  // 255
+  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
+  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
+  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
+  exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
+  return m;
+}
+
+// Safely applies ldexp, correctly handles overflows, underflows and denormals.
+// Assumes IEEE floating point format.
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
+  // We want to return a * 2^exponent, allowing for all possible integer
+  // exponents without overflowing or underflowing in intermediate
+  // computations.
+  //
+  // Since 'a' and the output can be denormal, the maximum range of 'exponent'
+  // to consider for a float is:
+  //   -255-23 -> 255+23
+  // Below -278 any finite float 'a' will become zero, and above +278 any
+  // finite float will become inf, including when 'a' is the smallest possible
+  // denormal.
+  //
+  // Unfortunately, 2^(278) cannot be represented using either one or two
+  // finite normal floats, so we must split the scale factor into at least
+  // three parts. It turns out to be faster to split 'exponent' into four
+  // factors, since [exponent>>2] is much faster to compute than [exponent/3].
+  //
+  // Set e = min(max(exponent, -278), 278);
+  //     b = floor(e/4);
+  //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
+  //
+  // This will avoid any intermediate overflows and correctly handle 0, inf,
+  // NaN cases.
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
+
+  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1)));  // 278
+  const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1));                         // 127
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  PacketI b = parithmetic_shift_right<2>(e);                                          // floor(e/4);
+  Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));  // 2^b
+  Packet out = pmul(pmul(pmul(a, c), c), c);                                          // a * 2^(3b)
+  b = pnmadd(pset1<PacketI>(3), b, e);                                                // e - 3b
+  c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));         // 2^(e-3*b)
+  out = pmul(out, c);
+  return out;
+}
+
+// Explicitly multiplies
+//    a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
+
+  const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
+  const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
+  // restrict biased exponent between 0 and 255 for float.
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
+  // return a * (2^e)
+  return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_FREXP_LDEXP_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index a46a8eff075..e91bd1147b1 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -4,6 +4,7 @@
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
 // Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,469 +19,115 @@
 
 // IWYU pragma: private
 #include "../../InternalHeaderCheck.h"
+#include "GenericPacketMathPolynomials.h"
+#include "GenericPacketMathFrexpLdexp.h"
+#include "GenericPacketMathDoubleWord.h"
 
 namespace Eigen {
 namespace internal {
 
-// Creates a Scalar integer type with same bit-width.
-template <typename T>
-struct make_integer;
-template <>
-struct make_integer<float> {
-  typedef numext::int32_t type;
-};
-template <>
-struct make_integer<double> {
-  typedef numext::int64_t type;
-};
-template <>
-struct make_integer<half> {
-  typedef numext::int16_t type;
-};
-template <>
-struct make_integer<bfloat16> {
-  typedef numext::int16_t type;
-};
-
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Packet, int N>
-struct ppolevl {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
-                                                          const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
-  }
-};
-
-template <typename Packet>
-struct ppolevl<Packet, 0> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
-                                                          const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_UNUSED_VARIABLE(x);
-    return pset1<Packet>(coeff[0]);
-  }
-};
+//----------------------------------------------------------------------
+// Exponential and Logarithmic Functions
+//----------------------------------------------------------------------
 
-/* chbevl (modified for Eigen)
- *
- *     Evaluate Chebyshev series
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N], chebevl();
- *
- * y = chbevl( x, coef, N );
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates the series
- *
- *        N-1
- *         - '
- *  y  =   >   coef[i] T (x/2)
- *         -            i
- *        i=0
- *
- * of Chebyshev polynomials Ti at argument x/2.
- *
- * Coefficients are stored in reverse order, i.e. the zero
- * order term is last in the array.  Note N is the number of
- * coefficients, not the order.
- *
- * If coefficients are for the interval a to b, x must
- * have been transformed to x -> 2(2x - b - a)/(b-a) before
- * entering the routine.  This maps x from (a, b) to (-1, 1),
- * over which the Chebyshev polynomials are defined.
- *
- * If the coefficients are for the inverted interval, in
- * which (a, b) is mapped to (1/b, 1/a), the transformation
- * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
- * this becomes x -> 4a/x - 1.
- *
- *
- *
- * SPEED:
- *
- * Taking advantage of the recurrence properties of the
- * Chebyshev polynomials, the routine requires one more
- * addition per loop than evaluating a nested polynomial of
- * the same degree.
- *
- */
-
-template <typename Packet, int N>
-struct pchebevl {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
-                                                          const typename unpacket_traits<Packet>::type coef[]) {
-    typedef typename unpacket_traits<Packet>::type Scalar;
-    Packet b0 = pset1<Packet>(coef[0]);
-    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
-    Packet b2;
-
-    for (int i = 1; i < N; i++) {
-      b2 = b1;
-      b1 = b0;
-      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
-    }
-
-    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
-  }
-};
-
-template <typename Packet>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
-  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
-}
-
-// Safely applies frexp, correctly handles denormals.
-// Assumes IEEE floating point format.
-template <typename Packet>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
-  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
-                       ExponentBits = TotalBits - MantissaBits - 1;
-
-  constexpr ScalarUI scalar_sign_mantissa_mask =
-      ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits);  // ~0x7f800000
-  const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
-  const Packet half = pset1<Packet>(Scalar(0.5));
-  const Packet zero = pzero(a);
-  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)());  // Minimum normal value, 2^-126
-
-  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
-  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
-  constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
-  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
-  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset));  // 2^24
-  const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
-  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
-
-  // Determine exponent offset: -126 if normal, -126-24 if denormal
-  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2));  // -126
-  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
-  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset));  // -24
-  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
-
-  // Determine exponent and mantissa from normalized_a.
-  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
-  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
-  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
-  const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1));  // 255
-  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
-  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
-  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
-  exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
-  return m;
-}
-
-// Safely applies ldexp, correctly handles overflows, underflows and denormals.
-// Assumes IEEE floating point format.
-template <typename Packet>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
-  // We want to return a * 2^exponent, allowing for all possible integer
-  // exponents without overflowing or underflowing in intermediate
-  // computations.
-  //
-  // Since 'a' and the output can be denormal, the maximum range of 'exponent'
-  // to consider for a float is:
-  //   -255-23 -> 255+23
-  // Below -278 any finite float 'a' will become zero, and above +278 any
-  // finite float will become inf, including when 'a' is the smallest possible
-  // denormal.
-  //
-  // Unfortunately, 2^(278) cannot be represented using either one or two
-  // finite normal floats, so we must split the scale factor into at least
-  // three parts. It turns out to be faster to split 'exponent' into four
-  // factors, since [exponent>>2] is much faster to compute that [exponent/3].
-  //
-  // Set e = min(max(exponent, -278), 278);
-  //     b = floor(e/4);
-  //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
-  //
-  // This will avoid any intermediate overflows and correctly handle 0, inf,
-  // NaN cases.
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename unpacket_traits<PacketI>::type ScalarI;
-  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
-                       ExponentBits = TotalBits - MantissaBits - 1;
-
-  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1)));  // 278
-  const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1));                         // 127
-  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
-  PacketI b = parithmetic_shift_right<2>(e);                                          // floor(e/4);
-  Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));  // 2^b
-  Packet out = pmul(pmul(pmul(a, c), c), c);                                          // a * 2^(3b)
-  b = pnmadd(pset1<PacketI>(3), b, e);                                                // e - 3b
-  c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));         // 2^(e-3*b)
-  out = pmul(out, c);
-  return out;
-}
-
-// Explicitly multiplies
-//    a * (2^e)
-// clamping e to the range
-// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+// Core range reduction and polynomial evaluation for float logarithm.
 //
-// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
-// if 2^e doesn't fit into a normal floating-point Scalar.
+// Given a positive float value v (may be denormal), decomposes it as
+// v = 2^e * (1+f) with f in [sqrt(0.5)-1, sqrt(2)-1], then evaluates
+// log(1+f) ≈ f + f^2 * P(f) using a degree-7 minimax polynomial.
 //
-// Assumes IEEE floating point format
-template <typename Packet>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename unpacket_traits<PacketI>::type ScalarI;
-  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
-                       ExponentBits = TotalBits - MantissaBits - 1;
-
-  const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
-  const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
-  // restrict biased exponent between 0 and 255 for float.
-  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
-  // return a * (2^e)
-  return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
-}
-
-// This function implements a single step of Halley's iteration for
-// computing x = y^(1/3):
-//   x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
-                                                                                      const Packet& y) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
-  Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
-  Packet num = psub(x_k_cb, y);
-  Packet r = pdiv(num, denom);
-  return pnmadd(x_k, r, x_k);
-}
-
-// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
-// interval [0.125,1].
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  // Extract the significant s in the range [0.5,1) and exponent e, such that
-  // x = 2^e * s.
-  Packet e, s;
-  s = pfrexp(x, e);
-
-  // Split the exponent into a part divisible by 3 and the remainder.
-  // e = 3*e_div3 + e_mod3.
-  constexpr Scalar kOneThird = Scalar(1) / 3;
-  e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
-  Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
-
-  // Replace s by y = (s * 2^e_mod3).
-  return pldexp_fast(s, e_mod3);
-}
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
-                                                                                       const Packet& abs_root) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-
-  // Set sign.
-  const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
-  const Packet x_sign = pand(sign_mask, x);
-  Packet root = por(x_sign, abs_root);
-
-  // Pass non-finite and zero values of x straight through.
-  const Packet is_not_finite = por(pisinf(x), pisnan(x));
-  const Packet is_zero = pcmp_eq(pzero(x), x);
-  const Packet use_x = por(is_not_finite, is_zero);
-  return pselect(use_x, x, root);
-}
-
-// Generic implementation of cbrt(x) for float.
-//
-// The algorithm computes the cubic root of the input by first
-// decomposing it into a exponent and significant
-//   x = s * 2^e.
-//
-// We can then write the cube root as
-//
-//   x^(1/3) = 2^(e/3) * s^(1/3)
-//           = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
-//           = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
-//           = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
+// Returns the approximation of log(v_mantissa) in log_mantissa and the
+// integer exponent in e. The caller combines these as appropriate
+// (e.g. e*ln2 + log_mantissa for natural log, or log_mantissa*log2e + e
+// for log2).
 //
-// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
-//
-// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
-// approximated using a cubic polynomial and subsequently refined using a
-// single step of Halley's iteration, and finally the two terms are combined
-// using pldexp_fast.
-//
-// Note: Many alternatives exist for implementing cbrt. See, for example,
-// the excellent discussion in Kahan's note:
-//   https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
-// This particular implementation was found to be very fast and accurate
-// among several alternatives tried, but is probably not "optimal" on all
-// platforms.
-//
-// This is accurate to 2 ULP.
+// Range reduction uses integer bit manipulation (musl-inspired) instead of the
+// heavier pfrexp_generic, saving ~12 ops. The minimax polynomial was found via
+// Sollya's fpminimax, giving faithfully-rounded results (max 1 ULP for log).
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
-
-  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
-  // interval [0.125,1].
-  Packet e_div3;
-  const Packet y = cbrt_decompose(pabs(x), e_div3);
-
-  // Compute initial approximation accurate to 5.22e-3.
-  // The polynomial was computed using Rminimax.
-  constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
-                             3.408401906490325927734375e-01f};
-  Packet r = ppolevl<Packet, 3>::run(y, alpha);
-
-  // Take one step of Halley's iteration.
-  r = cbrt_halley_iteration_step(r, y);
-
-  // Finally multiply by 2^(e_div3)
-  r = pldexp_fast(r, e_div3);
-
-  return cbrt_special_cases_and_sign(x, r);
-}
+EIGEN_STRONG_INLINE void plog_core_float(const Packet v, Packet& log_mantissa, Packet& e) {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
 
-// Generic implementation of cbrt(x) for double.
+  const PacketI cst_min_normal = pset1<PacketI>(0x00800000);
+  const PacketI cst_mant_mask = pset1<PacketI>(0x007fffff);
+  // Adding this offset to the integer representation biases the exponent so
+  // that values near 1 (0x3f800000) map to exponent 0, and values below
+  // sqrt(0.5) get folded into the previous exponent.  The magic constant is
+  // 0x3f800000 - 0x3f3504f3 = 0x004afb0d, where 0x3f3504f3 ≈ sqrt(0.5).
+  const PacketI cst_sqrt_half_offset = pset1<PacketI>(0x004afb0d);
+  const PacketI cst_exp_bias = pset1<PacketI>(0x7f);         // 127
+  const PacketI cst_half_mant = pset1<PacketI>(0x3f3504f3);  // sqrt(0.5)
+
+  // Normalize denormals by multiplying by 2^23.
+  PacketI vi = preinterpret<PacketI>(v);
+  PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
+  Packet v_normalized = pmul(v, pset1<Packet>(8388608.0f));  // 2^23
+  vi = pselect(is_denormal, preinterpret<PacketI>(v_normalized), vi);
+  // Denormal exponent adjustment: subtract 23 from exponent.
+  PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(23));
+
+  // Combined range reduction: bias integer representation so that exponent
+  // extraction automatically shifts mantissa to [sqrt(0.5), sqrt(2)).
+  PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
+  // Extract exponent as integer, subtract bias and denormal adjustment.
+  PacketI e_int = psub(psub(plogical_shift_right<23>(vi_biased), cst_exp_bias), denorm_adj);
+  e = pcast<PacketI, Packet>(e_int);
+  // Reconstruct mantissa in [sqrt(0.5), sqrt(2)). The integer addition of the
+  // masked mantissa with 0x3f3504f3 (sqrt(0.5)) naturally produces carry into
+  // the exponent field, yielding values in [sqrt(0.5), 1) or [1, sqrt(2)).
+  // Then subtract 1 to center on 0 → f in [sqrt(0.5)-1, sqrt(2)-1].
+  Packet f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0f));
+
+  // Minimax degree-7 polynomial for g(f) = (log(1+f) - f) / f^2 on
+  // [sqrt(0.5)-1, sqrt(2)-1], so log(1+f) ≈ f + f^2 * P(f).
+  // Generated by Sollya: fpminimax(g, 7, [|single...|], [lo;hi])
+  // Mathematical approximation error: max |log(1+f) - (f + f^2*P(f))| < 2.04e-8.
+  // Coefficients stored in reverse order for ppolevl (highest degree first).
+  constexpr float coeffs[] = {
+      8.8758550584316254e-02f,   //  c7 (x^7)
+      -1.4199858903884888e-01f,  //  c6 (x^6)
+      1.4824025332927704e-01f,   //  c5 (x^5)
+      -1.6583317518234253e-01f,  //  c4 (x^4)
+      1.9972395896911621e-01f,   //  c3 (x^3)
+      -2.5001299381256104e-01f,  //  c2 (x^2)
+      3.3333668112754822e-01f,   //  c1 (x^1)
+      -4.9999997019767761e-01f,  //  c0 (x^0)
+  };
+
+  // Evaluate P(f) via Horner's method, then log(1+f) ≈ f + f^2 * P(f).
+  Packet f2 = pmul(f, f);
+  Packet p = ppolevl<Packet, 7>::run(f, coeffs);
+  log_mantissa = pmadd(p, f2, f);
+}
+
+// Natural or base-2 logarithm for float packets.
 //
-// The algorithm is identical to the one for float except that a different initial
-// approximation is used for y^(1/3) and two Halley iteration steps are peformed.
-//
-// This is accurate to 1 ULP.
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
-
-  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
-  // interval [0.125,1].
-  Packet e_div3;
-  const Packet y = cbrt_decompose(pabs(x), e_div3);
-
-  // Compute initial approximation accurate to 0.016.
-  // The polynomial was computed using Rminimax.
-  constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
-                              1.072314636518546304699839311069808900356292724609375e+00,
-                              3.81249427609571867048288140722434036433696746826171875e-01};
-  Packet r = ppolevl<Packet, 2>::run(y, alpha);
-
-  // Take two steps of Halley's iteration.
-  r = cbrt_halley_iteration_step(r, y);
-  r = cbrt_halley_iteration_step(r, y);
-
-  // Finally multiply by 2^(e_div3).
-  r = pldexp_fast(r, e_div3);
-  return cbrt_special_cases_and_sign(x, r);
-}
-
-// Natural or base 2 logarithm.
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-// TODO(gonnet): Further reduce the interval allowing for lower-degree
-//               polynomial interpolants -> ... -> profit!
+// Computes log(x) as e*C + log(m), where x = 2^e * m with m in [sqrt(1/2), sqrt(2))
+// and C = ln(2) for natural log, C = 1 for log2.
 template <typename Packet, bool base2>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) {
-  const Packet cst_1 = pset1<Packet>(1.0f);
-  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
-  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
+  Packet log_mantissa, e;
+  plog_core_float(_x, log_mantissa, e);
 
-  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
-  Packet e, x;
-  // extract significant in the range [0.5,1) and exponent
-  x = pfrexp(_x, e);
-
-  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
-  Packet tmp = pand(x, mask);
-  x = psub(x, cst_1);
-  e = psub(e, pand(cst_1, mask));
-  x = padd(x, tmp);
-
-  // Polynomial coefficients for rational r(x) = p(x)/q(x)
-  // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
-  constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
-  constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
-
-  Packet p = ppolevl<Packet, 2>::run(x, alpha);
-  p = pmul(x, p);
-  Packet q = ppolevl<Packet, 3>::run(x, beta);
-  x = pdiv(p, q);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
+  // Add the logarithm of the exponent back to the result.
+  Packet x;
   if (base2) {
     const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
-    x = pmadd(x, cst_log2e, e);
+    x = pmadd(log_mantissa, cst_log2e, e);
   } else {
     const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
-    x = pmadd(e, cst_ln2, x);
+    x = pmadd(e, cst_ln2, log_mantissa);
   }
 
+  // Filter out invalid inputs:
+  //  - negative arg → NAN
+  //  - 0 → -INF
+  //  - +INF → +INF
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
   Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
   Packet iszero_mask = pcmp_eq(_x, pzero(_x));
   Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
-  // Filter out invalid inputs, i.e.:
-  //  - negative arg will be NAN
-  //  - 0 will be -INF
-  //  - +INF will be +INF
   return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
 }
 
@@ -494,63 +141,33 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
   return plog_impl_float<Packet, /* base2 */ true>(_x);
 }
 
-/* Returns the base e (2.718...) or base 2 logarithm of x.
- * The argument is separated into its exponent and fractional parts.
- * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
- * is approximated by
- *
- *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
- *
- * for more detail see: http://www.netlib.org/cephes/
- */
-template <typename Packet, bool base2>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
-  Packet x = _x;
-
-  const Packet cst_1 = pset1<Packet>(1.0);
-  const Packet cst_neg_half = pset1<Packet>(-0.5);
-  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
-  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
+// -----------------------------------------------------------------------
+// Double logarithm: shared polynomial + two range-reduction backends
+// -----------------------------------------------------------------------
 
-  // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
-  //                             1/sqrt(2) <= x < sqrt(2)
-  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+// Cephes rational-polynomial approximation of log(1+f) for
+// f in [sqrt(0.5)-1, sqrt(2)-1].
+// Evaluates x - 0.5*x^2 + x^3 * P(x)/Q(x) where P and Q are degree-5.
+// See: http://www.netlib.org/cephes/
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet plog_mantissa_double(const Packet x) {
   const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
   const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
   const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
   const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
   const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
   const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
-
-  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
+  // Q0 = 1.0; pmadd(1, x, q1) simplifies to padd(x, q1).
   const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
   const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
   const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
   const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
   const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
 
-  Packet e;
-  // extract significant in the range [0.5,1) and exponent
-  x = pfrexp(x, e);
-
-  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
-  Packet tmp = pand(x, mask);
-  x = psub(x, cst_1);
-  e = psub(e, pand(cst_1, mask));
-  x = padd(x, tmp);
-
   Packet x2 = pmul(x, x);
   Packet x3 = pmul(x2, x);
 
-  // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
-  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
+  // Evaluate P and Q simultaneously for better ILP.
   Packet y, y1, y_;
   y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
   y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
@@ -558,7 +175,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(cons
   y1 = pmadd(y1, x, cst_cephes_log_p5);
   y_ = pmadd(y, x3, y1);
 
-  y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
+  y = padd(x, cst_cephes_log_q1);
   y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
   y = pmadd(y, x, cst_cephes_log_q2);
   y1 = pmadd(y1, x, cst_cephes_log_q5);
@@ -566,26 +183,132 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(cons
 
   y_ = pmul(y_, x3);
   y = pdiv(y_, y);
+  y = pnmadd(pset1<Packet>(0.5), x2, y);
+  return padd(x, y);
+}
+
+// Detect whether unpacket_traits<Packet>::integer_packet is defined.
+template <typename Packet, typename = void>
+struct packet_has_integer_packet : std::false_type {};
+template <typename Packet>
+struct packet_has_integer_packet<Packet, void_t<typename unpacket_traits<Packet>::integer_packet>> : std::true_type {};
+
+// Dispatch struct for double-precision range reduction.
+// Primary template: pfrexp-based fallback (used when integer_packet is absent).
+template <typename Packet, bool UseIntegerPacket>
+struct plog_range_reduce_double {
+  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
+    const Packet one = pset1<Packet>(1.0);
+    const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+    // pfrexp: f in [0.5, 1), e = unbiased exponent as double.
+    f = pfrexp(v, e);
+    // Shift [0.5,1) -> [sqrt(0.5)-1, sqrt(2)-1] with exponent correction:
+    //   if f < sqrt(0.5): f = f + f - 1, e -= 1   (giving f in [0, sqrt(2)-1))
+    //   else:             f = f - 1                (giving f in [sqrt(0.5)-1, 0))
+    Packet mask = pcmp_lt(f, cst_cephes_SQRTHF);
+    Packet tmp = pand(f, mask);
+    f = psub(f, one);
+    e = psub(e, pand(one, mask));
+    f = padd(f, tmp);
+  }
+};
+
+// Specialisation: fast integer-bit-manipulation path (musl-inspired).
+// Requires unpacket_traits<Packet>::integer_packet to be a 64-bit integer packet.
+template <typename Packet>
+struct plog_range_reduce_double<Packet, true> {
+  EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
+    typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+    // 2^-1022: smallest positive normal double.
+    const PacketI cst_min_normal = pset1<PacketI>(static_cast<int64_t>(0x0010000000000000LL));
+    // Lower 52-bit mask (IEEE mantissa field).
+    const PacketI cst_mant_mask = pset1<PacketI>(static_cast<int64_t>(0x000FFFFFFFFFFFFFLL));
+    // Offset = 1.0_bits - sqrt(0.5)_bits.  Adding this to the integer
+    // representation shifts the exponent field so that the [sqrt(0.5), sqrt(2))
+    // half-octave boundary falls on an exact biased-exponent boundary, letting
+    // us extract e with a single right shift.  The constant is:
+    //   0x3FF0000000000000 - 0x3FE6A09E667F3BCD = 0x00095F619980C433
+    const PacketI cst_sqrt_half_offset =
+        pset1<PacketI>(static_cast<int64_t>(0x3FF0000000000000LL - 0x3FE6A09E667F3BCDLL));
+    // IEEE double exponent bias (1023).
+    const PacketI cst_exp_bias = pset1<PacketI>(static_cast<int64_t>(1023));
+    // sqrt(0.5) IEEE bits — used to reconstruct f from biased mantissa.
+    const PacketI cst_half_mant = pset1<PacketI>(static_cast<int64_t>(0x3FE6A09E667F3BCDLL));
+
+    // Reinterpret v as a 64-bit integer vector.
+    PacketI vi = preinterpret<PacketI>(v);
+
+    // Normalise denormals: multiply by 2^52 and correct the exponent by -52.
+    PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
+    // 2^52 via bit pattern: biased exponent = 52 + 1023 = 0x433, mantissa = 0.
+    Packet v_norm = pmul(v, pset1frombits<Packet>(static_cast<uint64_t>(int64_t(52 + 0x3ff) << 52)));
+    vi = pselect(is_denormal, preinterpret<PacketI>(v_norm), vi);
+    PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(static_cast<int64_t>(52)));
+
+    // Bias the integer representation so the exponent field directly encodes
+    // the half-octave index.
+    PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
+    // Extract unbiased exponent: shift out mantissa bits, subtract IEEE bias
+    // and denormal adjustment.
+    PacketI e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
+    // Convert integer exponent to floating-point.
+    e = pcast<PacketI, Packet>(e_int);
+
+    // Reconstruct mantissa in [sqrt(0.5), sqrt(2)) via integer arithmetic.
+    // The integer addition of the masked mantissa bits and the sqrt(0.5) bit
+    // pattern carries into the exponent field, yielding a value in that range.
+    // Then subtract 1 to centre on 0: f in [sqrt(0.5)-1, sqrt(2)-1].
+    f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
+  }
+};
+
+// Core range reduction and polynomial for double logarithm.
+// Input:  v > 0 (zero / negative / inf / nan are handled by the caller).
+// Output: log_mantissa ≈ log(mantissa of v in [sqrt(0.5), sqrt(2))),
+//         e            = unbiased exponent of v as a double.
+// Selects the fast integer path when integer_packet is available, otherwise
+// falls back to pfrexp.
+template <typename Packet>
+EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa, Packet& e) {
+  Packet f;
+  plog_range_reduce_double<Packet, packet_has_integer_packet<Packet>::value>::run(v, f, e);
+  log_mantissa = plog_mantissa_double(f);
+}
+
+/* Returns the base e (2.718...) or base 2 logarithm of x.
+ * The argument is separated into its exponent and fractional parts.
+ * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
+ * is approximated by
+ *
+ *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
+ *
+ * for more detail see: http://www.netlib.org/cephes/
+ */
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
 
-  y = pmadd(cst_neg_half, x2, y);
-  x = padd(x, y);
+  Packet log_mantissa, e;
+  plog_core_double(_x, log_mantissa, e);
 
-  // Add the logarithm of the exponent back to the result of the interpolation.
+  // Combine: log(x) = e * ln2 + log(mantissa), or log2(x) = log(mantissa)*log2e + e.
+  Packet x;
   if (base2) {
     const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
-    x = pmadd(x, cst_log2e, e);
+    x = pmadd(log_mantissa, cst_log2e, e);
   } else {
     const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
-    x = pmadd(e, cst_ln2, x);
+    x = pmadd(e, cst_ln2, log_mantissa);
   }
 
   Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
   Packet iszero_mask = pcmp_eq(_x, pzero(_x));
   Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
-  // Filter out invalid inputs, i.e.:
-  //  - negative arg will be NAN
-  //  - 0 will be -INF
-  //  - +INF will be +INF
+  // Filter out invalid inputs:
+  //  - negative arg → NAN
+  //  - 0            → -INF
+  //  - +INF         → +INF
   return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
 }
 
@@ -599,8 +322,89 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Pa
   return plog_impl_double<Packet, /* base2 */ true>(_x);
 }
 
+/** \internal \returns log(1 + x) for single precision float.
+    Computes log(1+x) using plog_core_float for the core range reduction
+    and polynomial evaluation. The rounding error from forming u = fl(1+x)
+    is recovered as dx = x - (u - 1), and folded in as a first-order
+    correction dx/u after the polynomial evaluation.
+ */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(const Packet& x) {
+  const Packet one = pset1<Packet>(1.0f);
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
+
+  // u = 1 + x, with rounding. Recover the lost low bits: dx = x - (u - 1).
+  Packet u = padd(one, x);
+  Packet dx = psub(x, psub(u, one));
+
+  // For |x| tiny enough that u rounds to 1, return x directly.
+  Packet small_mask = pcmp_eq(u, one);
+  // For u = +inf (x very large), return +inf.
+  Packet inf_mask = pcmp_eq(u, cst_pos_inf);
+
+  // Core range reduction and polynomial on u.
+  Packet log_u, e;
+  plog_core_float(u, log_u, e);
+
+  // result = e * ln2 + log(u) + dx/u.
+  // The dx/u term corrects for the rounding error in u = fl(1+x).
+  const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
+  Packet result = pmadd(e, cst_ln2, padd(log_u, pdiv(dx, u)));
+
+  // Handle special cases.
+  Packet neg_mask = pcmp_lt(u, pzero(u));
+  Packet zero_mask = pcmp_eq(x, pset1<Packet>(-1.0f));
+  result = pselect(small_mask, x, result);
+  result = pselect(inf_mask, cst_pos_inf, result);
+  result = pselect(zero_mask, cst_minus_inf, result);
+  result = por(neg_mask, result);  // NaN for x < -1
+  return result;
+}
+
+/** \internal \returns log(1 + x) for double precision.
+    Computes log(1+x) using plog_core_double for the core range reduction and
+    polynomial evaluation.  The rounding error from forming u = fl(1+x) is
+    recovered as dx = x - (u - 1) and folded in as a first-order correction
+    dx/u after the polynomial evaluation.
+ */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x) {
+  const Packet one = pset1<Packet>(1.0);
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
+
+  // u = 1 + x, with rounding.  Recover the lost low bits: dx = x - (u - 1).
+  Packet u = padd(one, x);
+  Packet dx = psub(x, psub(u, one));
+
+  // For |x| tiny enough that u rounds to 1, return x directly.
+  Packet small_mask = pcmp_eq(u, one);
+  // For u = +inf (x very large), return +inf.
+  Packet inf_mask = pcmp_eq(u, cst_pos_inf);
+
+  // Core range reduction and polynomial on u.
+  Packet log_u, e;
+  plog_core_double(u, log_u, e);
+
+  // result = e * ln2 + log(u) + dx/u.
+  // The dx/u term corrects for the rounding error in u = fl(1+x).
+  const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
+  Packet result = pmadd(e, cst_ln2, padd(log_u, pdiv(dx, u)));
+
+  // Handle special cases.
+  Packet neg_mask = pcmp_lt(u, pzero(u));
+  Packet zero_mask = pcmp_eq(x, pset1<Packet>(-1.0));
+  result = pselect(small_mask, x, result);
+  result = pselect(inf_mask, cst_pos_inf, result);
+  result = pselect(zero_mask, cst_minus_inf, result);
+  result = por(neg_mask, result);  // NaN for x < -1
+  return result;
+}
+
 /** \internal \returns log(1 + x) computed using W. Kahan's formula.
     See: http://www.plunk.org/~hatch/rightway.php
+    This is the generic fallback for types without a specialized implementation.
  */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x) {
@@ -638,17 +442,16 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const P
 }
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
-// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "m = rint(x/log(2))" and "r" is the remainder. The result is then
 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
 // exp(r) is computed using a 6th order minimax polynomial approximation.
-template <typename Packet>
+template <typename Packet, bool IsFinite>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
-  const Packet cst_zero = pset1<Packet>(0.0f);
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+
   const Packet cst_one = pset1<Packet>(1.0f);
-  const Packet cst_half = pset1<Packet>(0.5f);
   const Packet cst_exp_hi = pset1<Packet>(88.723f);
   const Packet cst_exp_lo = pset1<Packet>(-104.f);
-  const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
 
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
   const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
@@ -657,17 +460,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack
   const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
   const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
 
-  // Clamp x.
-  Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
-  Packet x = pmin(_x, cst_exp_hi);
+  // Clamp x to prevent overflow/underflow.
+  Packet x = pmin(pmax(_x, cst_exp_lo), cst_exp_hi);
 
   // Express exp(x) as exp(m*ln(2) + r), start by extracting
-  // m = floor(x/ln(2) + 0.5).
-  Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
+  // m = rint(x/ln(2)).
+  Packet m = print(pmul(x, cst_cephes_LOG2EF));
 
-  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-  // truncation errors.
+  // Get r = x - m*ln(2). m*ln(2) is subtracted out in two parts,
+  // m*C1+m*C2 = m*ln(2), to avoid accumulating truncation errors.
   const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
   const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
   Packet r = pmadd(m, cst_cephes_exp_C1, x);
@@ -683,23 +484,27 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack
   Packet y = pmadd(r, p_odd, p_even);
   y = pmadd(r2, y, p_low);
 
-  // Return 2^m * exp(r).
-  const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
-  if (!predux_any(fast_pldexp_unsafe)) {
-    // For |x| <= 87, we know the result is not zero or inf, and we can safely use
-    // the fast version of pldexp.
-    return pmax(pldexp_fast(y, m), _x);
+  // Construct 2^m by directly manipulating the exponent bits.
+  // After clamping, m is in [-150, 128], so biased exponent m+127 is in [-23, 255].
+  // We only need the lower clamp to 0 (the upper bound 255 is exact).
+  const PacketI cst_bias = pset1<PacketI>(127);
+  PacketI mi = pcast<Packet, PacketI>(m);
+  mi = pmax(padd(mi, cst_bias), pzero(mi));
+  const Packet pow2m = preinterpret<Packet>(plogical_shift_left<23>(mi));
+  y = pmul(y, pow2m);
+
+  if (!IsFinite) {
+    // Handle NaN: exp(nan) = nan. Use pmax to propagate NaN from input.
+    y = pmax(y, _x);
   }
-  return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
+  return y;
 }
 
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
-  Packet x = _x;
   const Packet cst_zero = pset1<Packet>(0.0);
   const Packet cst_1 = pset1<Packet>(1.0);
   const Packet cst_2 = pset1<Packet>(2.0);
-  const Packet cst_half = pset1<Packet>(0.5);
 
   const Packet cst_exp_hi = pset1<Packet>(709.784);
   const Packet cst_exp_lo = pset1<Packet>(-745.519);
@@ -715,24 +520,19 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
   const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
   const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
 
-  Packet tmp, fx;
-
-  // clamp x
+  // Clamp x.
   Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
-  x = pmin(x, cst_exp_hi);
-  // Express exp(x) as exp(g + n*log(2)).
-  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
+  Packet x = pmin(_x, cst_exp_hi);
 
-  // Get the integer modulus of log(2), i.e. the "n" described above.
-  fx = pfloor(fx);
+  // Express exp(x) as exp(g + n*log(2)).
+  // n = rint(x / ln(2)).
+  Packet fx = print(pmul(x, cst_cephes_LOG2EF));
 
   // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
   // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
   // digits right.
-  tmp = pmul(fx, cst_cephes_exp_C1);
-  Packet z = pmul(fx, cst_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
+  x = pnmadd(fx, cst_cephes_exp_C1, x);
+  x = pnmadd(fx, cst_cephes_exp_C2, x);
 
   Packet x2 = pmul(x, x);
 
@@ -748,9 +548,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
   qx = pmadd(qx, x2, cst_cephes_exp_q2);
   qx = pmadd(qx, x2, cst_cephes_exp_q3);
 
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
+  // exp(g) = 1 + 2*px/(qx - px).
   x = pdiv(px, psub(qx, px));
   x = pmadd(cst_2, x, cst_1);
 
@@ -765,928 +563,66 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
   return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
 }
 
-// The following code is inspired by the following stack-overflow answer:
-//   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
-// It has been largely optimized:
-//  - By-pass calls to frexp.
-//  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
-//    (1) balancing the mantissa and exponent to the required bits of 2/pi are
-//    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
-//  - Avoid a branch in rounding and extraction of the remaining fractional part.
-// Overall, I measured a speed up higher than x2 on x86-64.
-inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
-  using Eigen::numext::int32_t;
-  using Eigen::numext::int64_t;
-  using Eigen::numext::uint32_t;
-  using Eigen::numext::uint64_t;
-
-  const double pio2_62 = 3.4061215800865545e-19;     // pi/2 * 2^-62
-  const uint64_t zero_dot_five = uint64_t(1) << 61;  // 0.5 in 2.62-bit fixed-point format
-
-  // 192 bits of 2/pi for Payne-Hanek reduction
-  // Bits are introduced by packet of 8 to enable aligned reads.
-  static const uint32_t two_over_pi[] = {
-      0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
-      0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
-      0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
-
-  uint32_t xi = numext::bit_cast<uint32_t>(xf);
-  // Below, -118 = -126 + 8.
-  //   -126 is to get the exponent,
-  //   +8 is to enable alignment of 2/pi's bits on 8 bits.
-  // This is possible because the fractional part of x as only 24 meaningful bits.
-  uint32_t e = (xi >> 23) - 118;
-  // Extract the mantissa and shift it to align it wrt the exponent
-  xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
-
-  uint32_t i = e >> 3;
-  uint32_t twoopi_1 = two_over_pi[i - 1];
-  uint32_t twoopi_2 = two_over_pi[i + 3];
-  uint32_t twoopi_3 = two_over_pi[i + 7];
-
-  // Compute x * 2/pi in 2.62-bit fixed-point format.
-  uint64_t p;
-  p = uint64_t(xi) * twoopi_3;
-  p = uint64_t(xi) * twoopi_2 + (p >> 32);
-  p = (uint64_t(xi * twoopi_1) << 32) + p;
-
-  // Round to nearest: add 0.5 and extract integral part.
-  uint64_t q = (p + zero_dot_five) >> 62;
-  *quadrant = int(q);
-  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
-  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
-  //   r = (p-q)*pi/2,
-  // where the product can be be carried out with sufficient accuracy using double precision.
-  p -= q << 62;
-  return float(double(int64_t(p)) * pio2_62);
-}
-
-template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-#if EIGEN_COMP_GNUC_STRICT
-    __attribute__((optimize("-fno-unsafe-math-optimizations")))
-#endif
-    Packet
-    psincos_float(const Packet& _x) {
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-
-  const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f);  // 2/PI
-  const Packet cst_rounding_magic = pset1<Packet>(12582912);           // 2^23 for rounding
-  const PacketI csti_1 = pset1<PacketI>(1);
-  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
-
-  Packet x = pabs(_x);
-
-  // Scale x by 2/Pi to find x's octant.
-  Packet y = pmul(x, cst_2oPI);
-
-  // Rounding trick to find nearest integer:
-  Packet y_round = padd(y, cst_rounding_magic);
-  EIGEN_OPTIMIZATION_BARRIER(y_round)
-  PacketI y_int = preinterpret<PacketI>(y_round);  // last 23 digits represent integer (if abs(x)<2^24)
-  y = psub(y_round, cst_rounding_magic);           // nearest integer to x * (2/pi)
-
-// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
-// using "Extended precision modular arithmetic"
-#if defined(EIGEN_VECTORIZE_FMA)
-  // This version requires true FMA for high accuracy.
-  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
-  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
-  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
-  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
-  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
-#else
-  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
-  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
-  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
-
-  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
-  // and 2 ULP up to:
-  const float huge_th = ComputeSine ? 25966.f : 18838.f;
-  x = pmadd(y, pset1<Packet>(-1.5703125), x);  // = 0xbfc90000
-  EIGEN_OPTIMIZATION_BARRIER(x)
-  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x);  // = 0xb9fdc000
-  EIGEN_OPTIMIZATION_BARRIER(x)
-  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x);                      // = 0x342ee000
-  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x);  // = 0x2e74b9ee
-
-// For the record, the following set of coefficients maintain 2ULP up
-// to a slightly larger range:
-// const float huge_th = ComputeSine ? 51981.f : 39086.125f;
-// but it slightly fails to maintain 1ULP for two values of sin below pi.
-// x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
-// x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
-// x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
-// x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
-
-// For the record, with only 3 iterations it is possible to maintain
-// 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
-// The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
-#endif
-
-  if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
-    const int PacketSize = unpacket_traits<Packet>::size;
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize];
-    pstoreu(vals, pabs(_x));
-    pstoreu(x_cpy, x);
-    pstoreu(y_int2, y_int);
-    for (int k = 0; k < PacketSize; ++k) {
-      float val = vals[k];
-      if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
-    }
-    x = ploadu<Packet>(x_cpy);
-    y_int = ploadu<PacketI>(y_int2);
-  }
-
-  // Compute the sign to apply to the polynomial.
-  // sin: sign = second_bit(y_int) xor signbit(_x)
-  // cos: sign = second_bit(y_int+1)
-  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
-                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
-  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
-
-  // Get the polynomial selection mask from the second bit of y_int
-  // We'll calculate both (sin and cos) polynomials and then select from the two.
-  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
-
-  Packet x2 = pmul(x, x);
-
-  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
-  Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
-  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
-  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
-  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
-  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
-
-  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
-  // octave/matlab code to compute those coefficients:
-  //    x = (0:0.0001:pi/4)';
-  //    A = [x.^3 x.^5 x.^7];
-  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
-  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
-  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
-  //
-  Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
-  y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
-  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
-  y2 = pmul(y2, x2);
-  y2 = pmadd(y2, x, x);
-
-  // Select the correct result from the two polynomials.
-  if (ComputeBoth) {
-    Packet peven = peven_mask(x);
-    Packet ysin = pselect(poly_mask, y2, y1);
-    Packet ycos = pselect(poly_mask, y1, y2);
-    Packet sign_bit_sin = pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)));
-    Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
-    sign_bit_sin = pand(sign_bit_sin, cst_sign_mask);  // clear all but left most bit
-    sign_bit_cos = pand(sign_bit_cos, cst_sign_mask);  // clear all but left most bit
-    y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
-  } else {
-    y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
-    y = pxor(y, sign_bit);
-  }
-  // Update the sign and filter huge inputs
-  return y;
-}
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
-  return psincos_float<true>(x);
-}
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
-  return psincos_float<false>(x);
-}
-
-// Trigonometric argument reduction for double for inputs smaller than 15.
-// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
-// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
-template <typename Packet>
-Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
-  // Pi/2 split into 2 values
-  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
-  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
-
-  Packet t;
-  t = pmadd(cst_pio2_a, q, x);
-  t = pmadd(cst_pio2_b, q, t);
-  return t;
-}
-
-// Trigonometric argument reduction for double for inputs smaller than 1e14.
-// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
-// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
-template <typename Packet>
-Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
-  // Pi/2 split into 4 values
-  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
-  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
-  const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
-  const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
-
-  Packet t;
-  t = pmadd(cst_pio2_a, q_high, x);
-  t = pmadd(cst_pio2_a, q_low, t);
-  t = pmadd(cst_pio2_b, q_high, t);
-  t = pmadd(cst_pio2_b, q_low, t);
-  t = pmadd(cst_pio2_c, q_high, t);
-  t = pmadd(cst_pio2_c, q_low, t);
-  t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
-  return t;
-}
-
-template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-#if EIGEN_COMP_GNUC_STRICT
-    __attribute__((optimize("-fno-unsafe-math-optimizations")))
-#endif
-    Packet
-    psincos_double(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  typedef typename unpacket_traits<PacketI>::type ScalarI;
-
-  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u));
-
-  // If the argument is smaller than this value, use a simpler argument reduction
-  const double small_th = 15;
-  // If the argument is bigger than this value, use the non-vectorized std version
-  const double huge_th = 1e14;
-
-  const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006);  // 2/PI
-  // Integer Packet constants
-  const PacketI cst_one = pset1<PacketI>(ScalarI(1));
-  // Constant for splitting
-  const Packet cst_split = pset1<Packet>(1 << 24);
-
-  Packet x_abs = pabs(x);
-
-  // Scale x by 2/Pi
-  PacketI q_int;
-  Packet s;
-
-  // TODO Implement huge angle argument reduction
-  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
-    Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
-    Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
-    q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
-    Packet q_low = pcast<PacketI, Packet>(q_int);
-    s = trig_reduce_medium_double(x_abs, q_high, q_low);
-  } else {
-    Packet qval_noround = pmul(x_abs, cst_2oPI);
-    q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
-    Packet q = pcast<PacketI, Packet>(q_int);
-    s = trig_reduce_small_double(x_abs, q);
-  }
-
-  // All the upcoming approximating polynomials have even exponents
-  Packet ss = pmul(s, s);
-
-  // Padé approximant of cos(x)
-  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
-  // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
-  // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
-  // MATLAB code to compute those coefficients:
-  //    syms x;
-  //    cosf = @(x) cos(x);
-  //    pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
-  Packet sc1_num = pmadd(ss, pset1<Packet>(80737373), pset1<Packet>(-13853547000));
-  Packet sc2_num = pmadd(sc1_num, ss, pset1<Packet>(727718024880));
-  Packet sc3_num = pmadd(sc2_num, ss, pset1<Packet>(-11275015752000));
-  Packet sc4_num = pmadd(sc3_num, ss, pset1<Packet>(23594700729600));
-  Packet sc1_denum = pmadd(ss, pset1<Packet>(147173), pset1<Packet>(39328920));
-  Packet sc2_denum = pmadd(sc1_denum, ss, pset1<Packet>(5772800880));
-  Packet sc3_denum = pmadd(sc2_denum, ss, pset1<Packet>(522334612800));
-  Packet sc4_denum = pmadd(sc3_denum, ss, pset1<Packet>(23594700729600));
-  Packet scos = pdiv(sc4_num, sc4_denum);
-
-  // Padé approximant of sin(x)
-  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
-  // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
-  // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
-  // MATLAB code to compute those coefficients:
-  //    syms x;
-  //    sinf = @(x) sin(x);
-  //    pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
-  Packet ss1_num = pmadd(ss, pset1<Packet>(4585922449), pset1<Packet>(-1066023933480));
-  Packet ss2_num = pmadd(ss1_num, ss, pset1<Packet>(83284044283440));
-  Packet ss3_num = pmadd(ss2_num, ss, pset1<Packet>(-2303682236856000));
-  Packet ss4_num = pmadd(ss3_num, ss, pset1<Packet>(15605159573203200));
-  Packet ss1_denum = pmadd(ss, pset1<Packet>(1029037), pset1<Packet>(345207016));
-  Packet ss2_denum = pmadd(ss1_denum, ss, pset1<Packet>(61570292784));
-  Packet ss3_denum = pmadd(ss2_denum, ss, pset1<Packet>(6603948711360));
-  Packet ss4_denum = pmadd(ss3_denum, ss, pset1<Packet>(346781323848960));
-  Packet ssin = pdiv(pmul(s, ss4_num), pmul(pset1<Packet>(45), ss4_denum));
-
-  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));
-
-  Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
-  Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
-  Packet sign_bit, sFinalRes;
-  if (ComputeBoth) {
-    Packet peven = peven_mask(x);
-    sign_bit = pselect((s), sign_sin, sign_cos);
-    sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
-  } else {
-    sign_bit = ComputeSine ? sign_sin : sign_cos;
-    sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
-  }
-  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
-  sFinalRes = pxor(sFinalRes, sign_bit);
-
-  // If the inputs values are higher than that a value that the argument reduction can currently address, compute them
-  // using std::sin and std::cos
-  // TODO Remove it when huge angle argument reduction is implemented
-  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
-    const int PacketSize = unpacket_traits<Packet>::size;
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize];
-    pstoreu(x_cpy, x);
-    pstoreu(sincos_vals, sFinalRes);
-    for (int k = 0; k < PacketSize; ++k) {
-      double val = x_cpy[k];
-      if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
-        if (ComputeBoth)
-          sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
-        else
-          sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
-      }
-    }
-    sFinalRes = ploadu<Packet>(sincos_vals);
-  }
-  return sFinalRes;
-}
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
-  return psincos_double<true>(x);
-}
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
-  return psincos_double<false>(x);
-}
-
-// Generic implementation of acos(x).
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
-
-  const Packet cst_one = pset1<Packet>(Scalar(1));
-  const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
-  const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
-  const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
-  const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
-  const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
-  const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
-  const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
-  const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
-
-  // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
-  // function, by a 6'th order polynomial.
-  // For x in [-1:0) we use that acos(-x) = pi - acos(x).
-  const Packet neg_mask = psignbit(x_in);
-  const Packet abs_x = pabs(x_in);
-
-  // Evaluate the polynomial using Horner's rule:
-  //   P(x) = p0 + x * (p1 +  x * (p2 + ... (p5 + x * p6)) ... ) .
-  // We evaluate even and odd terms independently to increase
-  // instruction level parallelism.
-  Packet x2 = pmul(x_in, x_in);
-  Packet p_even = pmadd(p6, x2, p4);
-  Packet p_odd = pmadd(p5, x2, p3);
-  p_even = pmadd(p_even, x2, p2);
-  p_odd = pmadd(p_odd, x2, p1);
-  p_even = pmadd(p_even, x2, p0);
-  Packet p = pmadd(p_odd, abs_x, p_even);
-
-  // The polynomial approximates acos(x)/sqrt(1-x), so
-  // multiply by sqrt(1-x) to get acos(x).
-  // Conveniently returns NaN for arguments outside [-1:1].
-  Packet denom = psqrt(psub(cst_one, abs_x));
-  Packet result = pmul(denom, p);
-  // Undo mapping for negative arguments.
-  return pselect(neg_mask, psub(cst_pi, result), result);
-}
-
-// Generic implementation of asin(x).
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
-
-  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
-
-  const Packet cst_half = pset1<Packet>(0.5f);
-  const Packet cst_one = pset1<Packet>(1.0f);
-  const Packet cst_two = pset1<Packet>(2.0f);
-  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
-
-  const Packet abs_x = pabs(x_in);
-  const Packet sign_mask = pandnot(x_in, abs_x);
-  const Packet invalid_mask = pcmp_lt(cst_one, abs_x);
-
-  // For arguments |x| > 0.5, we map x back to [0:0.5] using
-  // the transformation x_large = sqrt(0.5*(1-x)), and use the
-  // identity
-  //   asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
-
-  const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
-  const Packet large_mask = pcmp_lt(cst_half, abs_x);
-  const Packet x = pselect(large_mask, x_large, abs_x);
-  const Packet x2 = pmul(x, x);
-
-  // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
-  // even terms only.
-  constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
-                             7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
-  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
-  p = pmul(p, x);
-
-  const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
-  p = pselect(large_mask, p_large, p);
-  // Flip the sign for negative arguments.
-  p = pxor(p, sign_mask);
-  // Return NaN for arguments outside [-1:1].
-  return por(invalid_mask, p);
-}
-
-template <typename Scalar>
-struct patan_reduced {
-  template <typename Packet>
-  static EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet run(const Packet& x);
-};
-
-template <>
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
-  constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
-                              3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
-                              3.3004361289279920e-01};
-
-  constexpr double beta[] = {
-      2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
-      9.3705509168587852e-01, 3.3004361289279920e-01};
-
-  Packet x2 = pmul(x, x);
-  Packet p = ppolevl<Packet, 6>::run(x2, alpha);
-  Packet q = ppolevl<Packet, 6>::run(x2, beta);
-  return pmul(x, pdiv(p, q));
-}
-
-// Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
-template <>
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
-  constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
-
-  constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
-                            8.109951019287109375e-01f};
-
-  Packet x2 = pmul(x, x);
-  Packet p = ppolevl<Packet, 2>::run(x2, alpha);
-  Packet q = ppolevl<Packet, 3>::run(x2, beta);
-  return pmul(x, pdiv(p, q));
-}
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x_in) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-
-  constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
-
-  const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
-  const Packet cst_one = pset1<Packet>(Scalar(1));
-  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
-
-  //   "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
-  //   "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
-  //            calculated using Rminimax.
-
-  const Packet abs_x = pabs(x_in);
-  const Packet x_signmask = pand(x_in, cst_signmask);
-  const Packet large_mask = pcmp_lt(cst_one, abs_x);
-  const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
-  const Packet p = patan_reduced<Scalar>::run(x);
-  // Apply transformations according to the range reduction masks.
-  Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
-  // Return correct sign
-  return pxor(result, x_signmask);
-}
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
-    Doesn't do anything fancy, just a 9/8-degree rational interpolant which
-    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
-    outside of which tanh(x) = +/-1 in single precision. The input is clamped
-    to the range [-c, c]. The value c is chosen as the smallest value where
-    the approximation evaluates to exactly 1.
-
-    This implementation works on both scalars and packets.
-*/
-template <typename T>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
-  // Clamp the inputs to the range [-c, c] and set everything
-  // outside that range to 1.0. The value c is chosen as the smallest
-  // floating point argument such that the approximation is exactly 1.
-  // This saves clamping the value at the end.
-#ifdef EIGEN_VECTORIZE_FMA
-  const T plus_clamp = pset1<T>(8.01773357391357422f);
-  const T minus_clamp = pset1<T>(-8.01773357391357422f);
-#else
-  const T plus_clamp = pset1<T>(7.90738964080810547f);
-  const T minus_clamp = pset1<T>(-7.90738964080810547f);
-#endif
-  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
-
-  // The following rational approximation was generated by rminimax
-  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
-  // command:
-  // $ ratapprox --function="tanh(x)" --dom='[-8.67,8.67]' --num="odd"
-  //   --den="even" --type="[9,8]" --numF="[SG]" --denF="[SG]" --log
-  //   --output=tanhf.sollya --dispCoeff="dec"
-
-  // The monomial coefficients of the numerator polynomial (odd).
-  constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
-
-  // The monomial coefficients of the denominator polynomial (even).
-  constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
-
-  // Since the polynomials are odd/even, we need x^2.
-  const T x2 = pmul(x, x);
-  const T x3 = pmul(x2, x);
-
-  T p = ppolevl<T, 3>::run(x2, alpha);
-  T q = ppolevl<T, 4>::run(x2, beta);
-  // Take advantage of the fact that the constant term in p is 1 to compute
-  // x*(x^2*p + 1) = x^3 * p + x.
-  p = pmadd(x3, p, x);
-
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
-}
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
-    This uses a 19/18-degree rational interpolant which
-    is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
-    outside of which tanh(x) = +/-1 in single precision. The input is clamped
-    to the range [-c, c]. The value c is chosen as the smallest value where
-    the approximation evaluates to exactly 1.
-
-    This implementation works on both scalars and packets.
-*/
-template <typename T>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) {
-  // Clamp the inputs to the range [-c, c] and set everything
-  // outside that range to 1.0. The value c is chosen as the smallest
-  // floating point argument such that the approximation is exactly 1.
-  // This saves clamping the value at the end.
-#ifdef EIGEN_VECTORIZE_FMA
-  const T plus_clamp = pset1<T>(17.6610191624600077);
-  const T minus_clamp = pset1<T>(-17.6610191624600077);
-#else
-  const T plus_clamp = pset1<T>(17.714196154005176);
-  const T minus_clamp = pset1<T>(-17.714196154005176);
-#endif
-  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
-
-  // The following rational approximation was generated by rminimax
-  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
-  // command:
-  // $ ./ratapprox --function="tanh(x)" --dom='[-18.72,18.72]'
-  //   --num="odd" --den="even" --type="[19,18]" --numF="[D]"
-  //   --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
-
-  // The monomial coefficients of the numerator polynomial (odd).
-  constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
-                              4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
-                              9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
-
-  // The monomial coefficients of the denominator polynomial (even).
-  constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
-                             1.293019623712687916e-13, 1.123643448069621992e-10,
-                             4.492975677839633985e-08, 8.785185266237658698e-06,
-                             8.295161192716231542e-04, 3.437448108450402717e-02,
-                             4.851805297361760360e-01, 1.0};
-
-  // Since the polynomials are odd/even, we need x^2.
-  const T x2 = pmul(x, x);
-  const T x3 = pmul(x2, x);
-
-  // Interleave the evaluation of the numerator polynomial p and
-  // denominator polynomial q.
-  T p = ppolevl<T, 8>::run(x2, alpha);
-  T q = ppolevl<T, 9>::run(x2, beta);
-  // Take advantage of the fact that the constant term in p is 1 to compute
-  // x*(x^2*p + 1) = x^3 * p + x.
-  p = pmadd(x3, p, x);
-
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
-}
-
+// This function computes exp2(x) = exp(ln(2) * x).
+// To improve accuracy, the product ln(2)*x is computed using the twoprod
+// algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is
+// computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This
+// correction step reduces the maximum absolute error as follows:
+//
+// type   | max error (simple product) | max error (twoprod) |
+// -----------------------------------------------------------
+// float  |       35 ulps              |       4 ulps        |
+// double |      363 ulps              |     110 ulps        |
+//
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) {
   typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
-
-  // For |x| in [0:0.5] we use a polynomial approximation of the form
-  // P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
-  constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
-                             0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
-  const Packet x2 = pmul(x, x);
-  const Packet x3 = pmul(x, x2);
-  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
-  p = pmadd(x3, p, x);
-
-  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
-  const Packet half = pset1<Packet>(0.5f);
-  const Packet one = pset1<Packet>(1.0f);
-  Packet r = pdiv(padd(one, x), psub(one, x));
-  r = pmul(half, plog(r));
-
-  const Packet x_gt_half = pcmp_le(half, pabs(x));
-  const Packet x_eq_one = pcmp_eq(one, pabs(x));
-  const Packet x_gt_one = pcmp_lt(one, pabs(x));
-  const Packet sign_mask = pset1<Packet>(-0.0f);
-  const Packet x_sign = pand(sign_mask, x);
-  const Packet inf = pset1<Packet>(std::numeric_limits<float>::infinity());
-  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, r, p)));
+  constexpr int max_exponent = std::numeric_limits<Scalar>::max_exponent;
+  constexpr int digits = std::numeric_limits<Scalar>::digits;
+  constexpr Scalar max_cap = Scalar(max_exponent + 1);
+  constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1);
+  Packet x = pmax(pmin(_x, pset1<Packet>(max_cap)), pset1<Packet>(min_cap));
+  Packet p_hi, p_lo;
+  twoprod(pset1<Packet>(Scalar(EIGEN_LN2)), x, p_hi, p_lo);
+  Packet exp2_hi = pexp(p_hi);
+  Packet exp2_lo = padd(pset1<Packet>(Scalar(1)), p_lo);
+  return pmul(exp2_hi, exp2_lo);
 }
 
+/** \internal \returns log10(x) for single precision float.
+    Computed as log(x) * log10(e).
+    NOTE: the code below multiplies by a single float constant, so the result
+    inherits the rounding error of float(log10(e)). The hi+lo split
+    (log(x) * hi + log(x) * lo, combined via fma) is not implemented here. */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
-  // For x in [-0.5:0.5] we use a rational approximation of the form
-  // R(x) = x + x^3*P(x^2)/Q(x^2), where P is or order 4 and Q is of order 5.
-  constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
-                              -2.5949536095445679e-01, 1.2306328729812676e-01};
-
-  constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02,  -4.2828141436397615e-01,
-                             9.8733495886883648e-01,  -1.0000000000000000e+00, 3.6918986189438030e-01};
-
-  const Packet x2 = pmul(x, x);
-  const Packet x3 = pmul(x, x2);
-  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
-  Packet q = ppolevl<Packet, 5>::run(x2, beta);
-  Packet y_small = pmadd(x3, pdiv(p, q), x);
-
-  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
-  const Packet half = pset1<Packet>(0.5);
-  const Packet one = pset1<Packet>(1.0);
-  Packet y_large = pdiv(padd(one, x), psub(one, x));
-  y_large = pmul(half, plog(y_large));
-
-  const Packet x_gt_half = pcmp_le(half, pabs(x));
-  const Packet x_eq_one = pcmp_eq(one, pabs(x));
-  const Packet x_gt_one = pcmp_lt(one, pabs(x));
-  const Packet sign_mask = pset1<Packet>(-0.0);
-  const Packet x_sign = pand(sign_mask, x);
-  const Packet inf = pset1<Packet>(std::numeric_limits<double>::infinity());
-  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small)));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog10_float(const Packet& x) {
+  const Packet cst_log10e = pset1<Packet>(0.4342944819032518f);
+  return pmul(plog(x), cst_log10e);
 }
 
+/** \internal \returns log10(x) for double precision float.
+    Computed as log(x) * log10(e). */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
-  typedef typename unpacket_traits<Packet>::as_real RealPacket;
-  // In the following we annotate the code for the case where the inputs
-  // are a pair length-2 SIMD vectors representing a single pair of complex
-  // numbers x = a + i*b, y = c + i*d.
-  const RealPacket y_abs = pabs(y.v);                        // |c|, |d|
-  const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v;  // |d|, |c|
-  const RealPacket y_max = pmax(y_abs, y_abs_flip);          // max(|c|, |d|), max(|c|, |d|)
-  const RealPacket y_scaled = pdiv(y.v, y_max);              // c / max(|c|, |d|), d / max(|c|, |d|)
-  // Compute scaled denominator.
-  const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled);  // c'**2, d'**2
-  const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
-  Packet result_scaled = pmul(x, pconj(Packet(y_scaled)));  // a * c' + b * d', -a * d + b * c
-  // Divide elementwise by denom.
-  result_scaled = Packet(pdiv(result_scaled.v, denom));
-  // Rescale result
-  return Packet(pdiv(result_scaled.v, y_max));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog10_double(const Packet& x) {
+  const Packet cst_log10e = pset1<Packet>(0.4342944819032518);
+  return pmul(plog(x), cst_log10e);
 }
 
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename Scalar::value_type RealScalar;
-  typedef typename unpacket_traits<Packet>::as_real RealPacket;
-
-  RealPacket real_mask_rp = peven_mask(x.v);
-  Packet real_mask(real_mask_rp);
-
-  // Real part
-  RealPacket x_flip = pcplxflip(x).v;  // b, a
-  Packet x_norm = phypot_complex(x);   // sqrt(a^2 + b^2), sqrt(a^2 + b^2)
-  RealPacket xlogr = plog(x_norm.v);   // log(sqrt(a^2 + b^2)), log(sqrt(a^2 + b^2))
-
-  // Imag part
-  RealPacket ximg = patan2(x.v, x_flip);  // atan2(a, b), atan2(b, a)
-
-  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
-  RealPacket x_abs = pabs(x.v);
-  RealPacket is_x_pos_inf = pcmp_eq(x_abs, cst_pos_inf);
-  RealPacket is_y_pos_inf = pcplxflip(Packet(is_x_pos_inf)).v;
-  RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf);
-  RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr);
-
-  Packet xres = pselect(real_mask, Packet(xreal), Packet(ximg));  // log(sqrt(a^2 + b^2)), atan2(b, a)
-  return xres;
-}
+}  // end namespace internal
+}  // end namespace Eigen
 
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::as_real RealPacket;
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename Scalar::value_type RealScalar;
-  const RealPacket even_mask = peven_mask(a.v);
-  const RealPacket odd_mask = pcplxflip(Packet(even_mask)).v;
-
-  // Let a = x + iy.
-  // exp(a) = exp(x) * cis(y), plus some special edge-case handling.
-
-  // exp(x):
-  RealPacket x = pand(a.v, even_mask);
-  x = por(x, pcplxflip(Packet(x)).v);
-  RealPacket expx = pexp(x);  // exp(x);
-
-  // cis(y):
-  RealPacket y = pand(odd_mask, a.v);
-  y = por(y, pcplxflip(Packet(y)).v);
-  RealPacket cisy = psincos_float<false, RealPacket, true>(y);
-  cisy = pcplxflip(Packet(cisy)).v;  // cos(y) + i * sin(y)
-
-  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
-  const RealPacket cst_neg_inf = pset1<RealPacket>(-NumTraits<RealScalar>::infinity());
-
-  // If x is -inf, we know that cossin(y) is bounded,
-  //   so the result is (0, +/-0), where the sign of the imaginary part comes
-  //   from the sign of cossin(y).
-  RealPacket cisy_sign = por(pandnot(cisy, pabs(cisy)), pset1<RealPacket>(RealScalar(1)));
-  cisy = pselect(pcmp_eq(x, cst_neg_inf), cisy_sign, cisy);
-
-  // If x is inf, and cos(y) has unknown sign (y is inf or NaN), the result
-  // is (+/-inf, NaN), where the signs are undetermined (take the sign of y).
-  RealPacket y_sign = por(pandnot(y, pabs(y)), pset1<RealPacket>(RealScalar(1)));
-  cisy = pselect(pand(pcmp_eq(x, cst_pos_inf), pisnan(cisy)), pand(y_sign, even_mask), cisy);
-  Packet result = Packet(pmul(expx, cisy));
-
-  // If y is +/- 0, the input is real, so take the real result for consistency.
-  result = pselect(Packet(pcmp_eq(y, pzero(y))), Packet(por(pand(expx, even_mask), pand(y, odd_mask))), result);
+// Include the split-out sections. Order matters: Pow depends on exp/log and FrexpLdexp,
+// Trig depends on exp (for ptanh_float), Complex depends on Trig (for psincos_selector).
+#include "GenericPacketMathPow.h"
+#include "GenericPacketMathTrig.h"
+#include "GenericPacketMathComplex.h"
 
-  return result;
-}
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename Scalar::value_type RealScalar;
-  typedef typename unpacket_traits<Packet>::as_real RealPacket;
-
-  // Computes the principal sqrt of the complex numbers in the input.
-  //
-  // For example, for packets containing 2 complex numbers stored in interleaved format
-  //    a = [a0, a1] = [x0, y0, x1, y1],
-  // where x0 = real(a0), y0 = imag(a0) etc., this function returns
-  //    b = [b0, b1] = [u0, v0, u1, v1],
-  // such that b0^2 = a0, b1^2 = a1.
-  //
-  // To derive the formula for the complex square roots, let's consider the equation for
-  // a single complex square root of the number x + i*y. We want to find real numbers
-  // u and v such that
-  //    (u + i*v)^2 = x + i*y  <=>
-  //    u^2 - v^2 + i*2*u*v = x + i*v.
-  // By equating the real and imaginary parts we get:
-  //    u^2 - v^2 = x
-  //    2*u*v = y.
-  //
-  // For x >= 0, this has the numerically stable solution
-  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
-  //    v = 0.5 * (y / u)
-  // and for x < 0,
-  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
-  //    u = 0.5 * (y / v)
-  //
-  //  To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
-  //     l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
-
-  // In the following, without lack of generality, we have annotated the code, assuming
-  // that the input is a packet of 2 complex numbers.
-  //
-  // Step 1. Compute l = [l0, l0, l1, l1], where
-  //    l0 = sqrt(x0^2 + y0^2),  l1 = sqrt(x1^2 + y1^2)
-  // To avoid over- and underflow, we use the stable formula for each hypotenuse
-  //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
-  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
-
-  RealPacket a_abs = pabs(a.v);                        // [|x0|, |y0|, |x1|, |y1|]
-  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;  // [|y0|, |x0|, |y1|, |x1|]
-  RealPacket a_max = pmax(a_abs, a_abs_flip);
-  RealPacket a_min = pmin(a_abs, a_abs_flip);
-  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
-  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
-  RealPacket r = pdiv(a_min, a_max);
-  const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
-  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
-  // Set l to a_max if a_min is zero.
-  l = pselect(a_min_zero_mask, a_max, l);
-
-  // Step 2. Compute [rho0, *, rho1, *], where
-  // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 =  sqrt(0.5 * (l1 + |x1|))
-  // We don't care about the imaginary parts computed here. They will be overwritten later.
-  const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
-  Packet rho;
-  rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
-
-  // Step 3. Compute [rho0, eta0, rho1, eta1], where
-  // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
-  // set eta = 0 of input is 0 + i0.
-  RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
-  RealPacket real_mask = peven_mask(a.v);
-  Packet positive_real_result;
-  // Compute result for inputs with positive real part.
-  positive_real_result.v = pselect(real_mask, rho.v, eta);
-
-  // Step 4. Compute solution for inputs with negative real part:
-  //         [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
-  const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
-  RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
-  Packet negative_real_result;
-  // Notice that rho is positive, so taking it's absolute value is a noop.
-  negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
-
-  // Step 5. Select solution branch based on the sign of the real parts.
-  Packet negative_real_mask;
-  negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
-  negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
-  Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
-
-  // Step 6. Handle special cases for infinities:
-  // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
-  // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
-  // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
-  // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
-  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
-  Packet is_inf;
-  is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
-  Packet is_real_inf;
-  is_real_inf.v = pand(is_inf.v, real_mask);
-  is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
-  // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
-  Packet real_inf_result;
-  real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
-  real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
-  // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
-  Packet is_imag_inf;
-  is_imag_inf.v = pandnot(is_inf.v, real_mask);
-  is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
-  Packet imag_inf_result;
-  imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
-  // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan
-  Packet result_is_nan = pisnan(result);
-  result = por(result_is_nan, result);
-
-  return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
-}
+namespace Eigen {
+namespace internal {
 
-// \internal \returns the norm of a complex number z = x + i*y, defined as sqrt(x^2 + y^2).
-// Implemented using the hypot(a,b) algorithm from https://doi.org/10.48550/arXiv.1904.09481
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  typedef typename Scalar::value_type RealScalar;
-  typedef typename unpacket_traits<Packet>::as_real RealPacket;
-
-  const RealPacket cst_zero_rp = pset1<RealPacket>(static_cast<RealScalar>(0.0));
-  const RealPacket cst_minus_one_rp = pset1<RealPacket>(static_cast<RealScalar>(-1.0));
-  const RealPacket cst_two_rp = pset1<RealPacket>(static_cast<RealScalar>(2.0));
-  const RealPacket evenmask = peven_mask(a.v);
-
-  RealPacket a_abs = pabs(a.v);
-  RealPacket a_flip = pcplxflip(Packet(a_abs)).v;       // |b|, |a|
-  RealPacket a_all = pselect(evenmask, a_abs, a_flip);  // |a|, |a|
-  RealPacket b_all = pselect(evenmask, a_flip, a_abs);  // |b|, |b|
-
-  RealPacket a2 = pmul(a.v, a.v);                    // |a^2, b^2|
-  RealPacket a2_flip = pcplxflip(Packet(a2)).v;      // |b^2, a^2|
-  RealPacket h = psqrt(padd(a2, a2_flip));           // |sqrt(a^2 + b^2), sqrt(a^2 + b^2)|
-  RealPacket h_sq = pmul(h, h);                      // |a^2 + b^2, a^2 + b^2|
-  RealPacket a_sq = pselect(evenmask, a2, a2_flip);  // |a^2, a^2|
-  RealPacket m_h_sq = pmul(h_sq, cst_minus_one_rp);
-  RealPacket m_a_sq = pmul(a_sq, cst_minus_one_rp);
-  RealPacket x = psub(psub(pmadd(h, h, m_h_sq), pmadd(b_all, b_all, psub(a_sq, h_sq))), pmadd(a_all, a_all, m_a_sq));
-  h = psub(h, pdiv(x, pmul(cst_two_rp, h)));  // |h - x/(2*h), h - x/(2*h)|
-
-  // handle zero-case
-  RealPacket iszero = pcmp_eq(por(a_abs, a_flip), cst_zero_rp);
-
-  h = pandnot(h, iszero);  // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
-  return Packet(h);        // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
-}
+//----------------------------------------------------------------------
+// Sign Function
+//----------------------------------------------------------------------
 
 template <typename Packet>
 struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
@@ -1740,7 +676,7 @@ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
   }
 };
 
-// \internal \returns the the sign of a complex number z, defined as z / abs(z).
+// \internal \returns the sign of a complex number z, defined as z / abs(z).
 template <typename Packet>
 struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
                                            NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
@@ -1775,766 +711,9 @@ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
   }
 };
 
-// TODO(rmlarsen): The following set of utilities for double word arithmetic
-// should perhaps be refactored as a separate file, since it would be generally
-// useful for special function implementation etc. Writing the algorithms in
-// terms if a double word type would also make the code more readable.
-
-// This function splits x into the nearest integer n and fractional part r,
-// such that x = n + r holds exactly.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
-  n = pround(x);
-  r = psub(x, n);
-}
-
-// This function computes the sum {s, r}, such that x + y = s_hi + s_lo
-// holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
-  s_hi = padd(x, y);
-  const Packet t = psub(s_hi, x);
-  s_lo = psub(y, t);
-}
-
-#ifdef EIGEN_VECTORIZE_FMA
-// This function implements the extended precision product of
-// a pair of floating point numbers. Given {x, y}, it computes the pair
-// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
-// p_hi = fl(x * y).
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
-  p_hi = pmul(x, y);
-  p_lo = pmsub(x, y, p_hi);
-}
-
-// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
-// x * y = xy + p_lo holds exactly.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
-  return pmsub(x, y, xy);
-}
-
-#else
-
-// This function implements the Veltkamp splitting. Given a floating point
-// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
-// exactly and that half of the significant of x fits in x_hi.
-// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
-// 3rd edition, Birkh\"auser, 2016.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
-  const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
-  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
-  Packet rho = psub(x, gamma);
-  x_hi = padd(rho, gamma);
-  x_lo = psub(x, x_hi);
-}
-
-// This function implements Dekker's algorithm for products x * y.
-// Given floating point numbers {x, y} computes the pair
-// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
-// p_hi = fl(x * y).
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
-  Packet x_hi, x_lo, y_hi, y_lo;
-  veltkamp_splitting(x, x_hi, x_lo);
-  veltkamp_splitting(y, y_hi, y_lo);
-
-  p_hi = pmul(x, y);
-  p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
-  p_lo = pmadd(x_hi, y_lo, p_lo);
-  p_lo = pmadd(x_lo, y_hi, p_lo);
-  p_lo = pmadd(x_lo, y_lo, p_lo);
-}
-
-// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
-// x * y = xy + p_lo holds exactly.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
-  Packet x_hi, x_lo, y_hi, y_lo;
-  veltkamp_splitting(x, x_hi, x_lo);
-  veltkamp_splitting(y, y_hi, y_lo);
-
-  Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
-  p_lo = pmadd(x_hi, y_lo, p_lo);
-  p_lo = pmadd(x_lo, y_hi, p_lo);
-  p_lo = pmadd(x_lo, y_lo, p_lo);
-  return p_lo;
-}
-
-#endif  // EIGEN_VECTORIZE_FMA
-
-// This function implements Dekker's algorithm for the addition
-// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
-// It returns the result as a pair {s_hi, s_lo} such that
-// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
-// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
-// 3rd edition, Birkh\"auser, 2016.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
-                                                  const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
-  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
-  Packet r_hi_1, r_lo_1;
-  fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
-  Packet r_hi_2, r_lo_2;
-  fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
-  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
-
-  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
-  const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
-  const Packet s = pselect(x_greater_mask, s1, s2);
-
-  fast_twosum(r_hi, s, s_hi, s_lo);
-}
-
-// This is a version of twosum for double word numbers,
-// which assumes that |x_hi| >= |y_hi|.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
-                                                       const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
-  Packet r_hi, r_lo;
-  fast_twosum(x_hi, y_hi, r_hi, r_lo);
-  const Packet s = padd(padd(y_lo, r_lo), x_lo);
-  fast_twosum(r_hi, s, s_hi, s_lo);
-}
-
-// This is a version of twosum for adding a floating point number x to
-// double word number {y_hi, y_lo} number, with the assumption
-// that |x| >= |y_hi|.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo,
-                                                       Packet& s_hi, Packet& s_lo) {
-  Packet r_hi, r_lo;
-  fast_twosum(x, y_hi, r_hi, r_lo);
-  const Packet s = padd(y_lo, r_lo);
-  fast_twosum(r_hi, s, s_hi, s_lo);
-}
-
-// This function implements the multiplication of a double word
-// number represented by {x_hi, x_lo} by a floating point number y.
-// It returns the result as a pair {p_hi, p_lo} such that
-// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error
-// of less than 2*2^{-2p}, where p is the number of significand bit
-// in the floating point type.
-// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
-// 3rd edition, Birkh\"auser, 2016.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
-                                                   Packet& p_hi, Packet& p_lo) {
-  Packet c_hi, c_lo1;
-  twoprod(x_hi, y, c_hi, c_lo1);
-  const Packet c_lo2 = pmul(x_lo, y);
-  Packet t_hi, t_lo1;
-  fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
-  const Packet t_lo2 = padd(t_lo1, c_lo1);
-  fast_twosum(t_hi, t_lo2, p_hi, p_lo);
-}
-
-// This function implements the multiplication of two double word
-// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
-// It returns the result as a pair {p_hi, p_lo} such that
-// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
-// of less than 2*2^{-2p}, where p is the number of significand bit
-// in the floating point type.
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
-                                                   const Packet& y_lo, Packet& p_hi, Packet& p_lo) {
-  Packet p_hi_hi, p_hi_lo;
-  twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
-  Packet p_lo_hi, p_lo_lo;
-  twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo);
-  fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
-}
-
-// This function implements the division of double word {x_hi, x_lo}
-// by float y. This is Algorithm 15 from "Tight and rigorous error bounds
-// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
-// 2017. https://hal.archives-ouvertes.fr/hal-01351529
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
-                                                             Packet& z_hi, Packet& z_lo) {
-  const Packet t_hi = pdiv(x_hi, y);
-  Packet pi_hi, pi_lo;
-  twoprod(t_hi, y, pi_hi, pi_lo);
-  const Packet delta_hi = psub(x_hi, pi_hi);
-  const Packet delta_t = psub(delta_hi, pi_lo);
-  const Packet delta = padd(delta_t, x_lo);
-  const Packet t_lo = pdiv(delta, y);
-  fast_twosum(t_hi, t_lo, z_hi, z_lo);
-}
-
-// This function computes log2(x) and returns the result as a double word.
-template <typename Scalar>
-struct accurate_log2 {
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
-    log2_x_hi = plog2(x);
-    log2_x_lo = pzero(x);
-  }
-};
-
-// This specialization uses a more accurate algorithm to compute log2(x) for
-// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
-// This additional accuracy is needed to counter the error-magnification
-// inherent in multiplying by a potentially large exponent in pow(x,y).
-// The minimax polynomial used was calculated using the Rminimax tool,
-// see https://gitlab.inria.fr/sfilip/rminimax.
-// Command line:
-//   $ ratapprox --function="log2(1+x)/x"  --dom='[-0.2929,0.41422]'
-//   --type=[10,0]
-//       --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
-//
-// The resulting implementation of pow(x,y) is accurate to 3 ulps.
-template <>
-struct accurate_log2<float> {
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
-    // Split the two lowest order constant coefficient into double-word representation.
-    constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
-    constexpr float kC0_hi = static_cast<float>(kC0);
-    constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
-    const Packet c0_hi = pset1<Packet>(kC0_hi);
-    const Packet c0_lo = pset1<Packet>(kC0_lo);
-
-    constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
-    constexpr float kC1_hi = static_cast<float>(kC1);
-    constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
-    const Packet c1_hi = pset1<Packet>(kC1_hi);
-    const Packet c1_lo = pset1<Packet>(kC1_lo);
-
-    constexpr float c[] = {
-        9.7010828554630279541015625e-02,  -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
-        -1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01,  -2.4046677350997924804687500e-01,
-        2.8857553005218505859375000e-01,  -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
-
-    // Evaluate the higher order terms in the polynomial using
-    // standard arithmetic.
-    const Packet one = pset1<Packet>(1.0f);
-    const Packet x = psub(z, one);
-    Packet p = ppolevl<Packet, 8>::run(x, c);
-    // Evaluate the final two step in Horner's rule using double-word
-    // arithmetic.
-    Packet p_hi, p_lo;
-    twoprod(x, p, p_hi, p_lo);
-    fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
-    twoprod(p_hi, p_lo, x, p_hi, p_lo);
-    fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
-    // Multiply by x to recover log2(z).
-    twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
-  }
-};
-
-// This specialization uses a more accurate algorithm to compute log2(x) for
-// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18.
-// This additional accuracy is needed to counter the error-magnification
-// inherent in multiplying by a potentially large exponent in pow(x,y).
-// The minimax polynomial used was calculated using the Sollya tool.
-// See sollya.org.
-
-template <>
-struct accurate_log2<double> {
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
-    // We use a transformation of variables:
-    //    r = c * (x-1) / (x+1),
-    // such that
-    //    log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r).
-    // The function f(r) can be approximated well using an odd polynomial
-    // of the form
-    //   P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r,
-    // For the implementation of log2<double> here, Q is of degree 6 with
-    // coefficient represented in working precision (double), while C is a
-    // constant represented in extra precision as a double word to achieve
-    // full accuracy.
-    //
-    // The polynomial coefficients were computed by the Sollya script:
-    //
-    // c = 2 / log(2);
-    // trans = c * (x-1)/(x+1);
-    // itrans = (1+x/c)/(1-x/c);
-    // interval=[trans(sqrt(0.5)); trans(sqrt(2))];
-    // print(interval);
-    // f = log2(itrans(x));
-    // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating);
-    const Packet q12 = pset1<Packet>(2.87074255468000586e-9);
-    const Packet q10 = pset1<Packet>(2.38957980901884082e-8);
-    const Packet q8 = pset1<Packet>(2.31032094540014656e-7);
-    const Packet q6 = pset1<Packet>(2.27279857398537278e-6);
-    const Packet q4 = pset1<Packet>(2.31271023278625638e-5);
-    const Packet q2 = pset1<Packet>(2.47556738444535513e-4);
-    const Packet q0 = pset1<Packet>(2.88543873228900172e-3);
-    const Packet C_hi = pset1<Packet>(0.0400377511598501157);
-    const Packet C_lo = pset1<Packet>(-4.77726582251425391e-19);
-    const Packet one = pset1<Packet>(1.0);
-
-    const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
-    const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
-    // c * (x - 1)
-    Packet t_hi, t_lo;
-    // t = c * (x-1)
-    twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo);
-    // r = c * (x-1) / (x+1),
-    Packet r_hi, r_lo;
-    doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo);
-
-    // r2 = r * r
-    Packet r2_hi, r2_lo;
-    twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
-    // r4 = r2 * r2
-    Packet r4_hi, r4_lo;
-    twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo);
-
-    // Evaluate Q(r^2) in working precision. We evaluate it in two parts
-    // (even and odd in r^2) to improve instruction level parallelism.
-    Packet q_even = pmadd(q12, r4_hi, q8);
-    Packet q_odd = pmadd(q10, r4_hi, q6);
-    q_even = pmadd(q_even, r4_hi, q4);
-    q_odd = pmadd(q_odd, r4_hi, q2);
-    q_even = pmadd(q_even, r4_hi, q0);
-    Packet q = pmadd(q_odd, r2_hi, q_even);
-
-    // Now evaluate the low order terms of P(x) in double word precision.
-    // In the following, due to the increasing magnitude of the coefficients
-    // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead
-    // of the slower twosum.
-    // Q(r^2) * r^2
-    Packet p_hi, p_lo;
-    twoprod(r2_hi, r2_lo, q, p_hi, p_lo);
-    // Q(r^2) * r^2 + C
-    Packet p1_hi, p1_lo;
-    fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo);
-    // (Q(r^2) * r^2 + C) * r^2
-    Packet p2_hi, p2_lo;
-    twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo);
-    // ((Q(r^2) * r^2 + C) * r^2 + 1)
-    Packet p3_hi, p3_lo;
-    fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo);
-
-    // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r
-    twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo);
-  }
-};
-
-// This function implements the non-trivial case of pow(x,y) where x is
-// positive and y is (possibly) non-integer.
-// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
-// TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
-// easier to specialize or turn off for specific types and/or backends.x
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  // Split x into exponent e_x and mantissa m_x.
-  Packet e_x;
-  Packet m_x = pfrexp(x, e_x);
-
-  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
-  constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
-  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
-  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
-  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
-
-  // Compute log2(m_x) with 6 extra bits of accuracy.
-  Packet rx_hi, rx_lo;
-  accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
-
-  // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
-  // precision using double word arithmetic.
-  Packet f1_hi, f1_lo, f2_hi, f2_lo;
-  twoprod(e_x, y, f1_hi, f1_lo);
-  twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
-  // Sum the two terms in f using double word arithmetic. We know
-  // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
-  // This means that we can use fast_twosum(f1,f2).
-  // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
-  // accuracy by violating the assumption of fast_twosum, because
-  // it's a no-op.
-  Packet f_hi, f_lo;
-  fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
-
-  // Split f into integer and fractional parts.
-  Packet n_z, r_z;
-  absolute_split(f_hi, n_z, r_z);
-  r_z = padd(r_z, f_lo);
-  Packet n_r;
-  absolute_split(r_z, n_r, r_z);
-  n_z = padd(n_z, n_r);
-
-  // We now have an accurate split of f = n_z + r_z and can compute
-  //   x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
-  // Multiplication by the second factor can be done exactly using pldexp(), since
-  // it is an integer power of 2.
-  const Packet e_r = generic_exp2(r_z);
-
-  // Since we know that e_r is in [1/sqrt(2); sqrt(2)], we can use the fast version
-  // of pldexp to multiply by 2**{n_z} when |n_z| is sufficiently small.
-  constexpr Scalar kPldExpThresh = std::numeric_limits<Scalar>::max_exponent - 2;
-  const Packet pldexp_fast_unsafe = pcmp_lt(pset1<Packet>(kPldExpThresh), pabs(n_z));
-  if (predux_any(pldexp_fast_unsafe)) {
-    return pldexp(e_r, n_z);
-  }
-  return pldexp_fast(e_r, n_z);
-}
-
-// Generic implementation of pow(x,y).
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
-    const Packet& x, const Packet& y) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-
-  const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
-  const Packet cst_zero = pset1<Packet>(Scalar(0));
-  const Packet cst_one = pset1<Packet>(Scalar(1));
-  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
-
-  const Packet x_abs = pabs(x);
-  Packet pow = generic_pow_impl(x_abs, y);
-
-  // In the following we enforce the special case handling prescribed in
-  // https://en.cppreference.com/w/cpp/numeric/math/pow.
-
-  // Predicates for sign and magnitude of x.
-  const Packet x_is_negative = pcmp_lt(x, cst_zero);
-  const Packet x_is_zero = pcmp_eq(x, cst_zero);
-  const Packet x_is_one = pcmp_eq(x, cst_one);
-  const Packet x_has_signbit = psignbit(x);
-  const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
-  const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
-
-  // Predicates for sign and magnitude of y.
-  const Packet y_abs = pabs(y);
-  const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
-  const Packet y_is_negative = pcmp_lt(y, cst_zero);
-  const Packet y_is_zero = pcmp_eq(y, cst_zero);
-  const Packet y_is_one = pcmp_eq(y, cst_one);
-  // Predicates for whether y is integer and odd/even.
-  const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
-  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
-  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
-  const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
-  // Smallest exponent for which (1 + epsilon) overflows to infinity.
-  constexpr Scalar huge_exponent =
-      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
-  const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
-
-  // *  pow(base, exp) returns NaN if base is finite and negative
-  //    and exp is finite and non-integer.
-  pow = pselect(pandnot(x_is_negative, y_is_int), cst_nan, pow);
-
-  // * pow(±0, exp), where exp is negative, finite, and is an even integer or
-  // a non-integer, returns +∞
-  // * pow(±0, exp), where exp is positive non-integer or a positive even
-  // integer, returns +0
-  // * pow(+0, exp), where exp is a negative odd integer, returns +∞
-  // * pow(-0, exp), where exp is a negative odd integer, returns -∞
-  // * pow(+0, exp), where exp is a positive odd integer, returns +0
-  // * pow(-0, exp), where exp is a positive odd integer, returns -0
-  // Sign is flipped by the rule below.
-  pow = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), pow);
-
-  // pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
-  // and exp is an odd integer exponent.
-  pow = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(pow), pow);
-
-  // * pow(base, -∞) returns +∞ for any |base|<1
-  // * pow(base, -∞) returns +0 for any |base|>1
-  // * pow(base, +∞) returns +0 for any |base|<1
-  // * pow(base, +∞) returns +∞ for any |base|>1
-  // * pow(±0, -∞) returns +∞
-  // * pow(-1, +-∞) = 1
-  Packet inf_y_val = pselect(por(pand(y_is_negative, x_is_zero), pxor(y_is_negative, x_abs_gt_one)), cst_inf, cst_zero);
-  inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
-  pow = pselect(y_abs_is_huge, inf_y_val, pow);
-
-  // * pow(+∞, exp) returns +0 for any negative exp
-  // * pow(+∞, exp) returns +∞ for any positive exp
-  // * pow(-∞, exp) returns -0 if exp is a negative odd integer.
-  // * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
-  //     even integer.
-  // * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
-  // * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
-  //     even integer.
-  auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
-  auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
-  pow = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), pow);
-
-  // All cases of NaN inputs return NaN, except the two below.
-  pow = pselect(por(pisnan(x), pisnan(y)), cst_nan, pow);
-
-  // * pow(base, 1) returns base.
-  // * pow(base, +/-0) returns 1, regardless of base, even NaN.
-  // * pow(+1, exp) returns 1, regardless of exponent, even NaN.
-  pow = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, pow));
-
-  return pow;
-}
-
-template <typename Scalar>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
-    const Scalar& x, const Scalar& y) {
-  return numext::pow(x, y);
-}
-
-namespace unary_pow {
-
-template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
-struct exponent_helper {
-  using safe_abs_type = ScalarExponent;
-  static constexpr ScalarExponent one_half = ScalarExponent(0.5);
-  // these routines assume that exp is an integer stored as a floating point type
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent safe_abs(const ScalarExponent& exp) {
-    return numext::abs(exp);
-  }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const ScalarExponent& exp) {
-    eigen_assert(((numext::isfinite)(exp) && exp == numext::floor(exp)) && "exp must be an integer");
-    ScalarExponent exp_div_2 = exp * one_half;
-    ScalarExponent floor_exp_div_2 = numext::floor(exp_div_2);
-    return exp_div_2 != floor_exp_div_2;
-  }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent floor_div_two(const ScalarExponent& exp) {
-    ScalarExponent exp_div_2 = exp * one_half;
-    return numext::floor(exp_div_2);
-  }
-};
-
-template <typename ScalarExponent>
-struct exponent_helper<ScalarExponent, true> {
-  // if `exp` is a signed integer type, cast it to its unsigned counterpart to safely store its absolute value
-  // consider the (rare) case where `exp` is an int32_t: abs(-2147483648) != 2147483648
-  using safe_abs_type = typename numext::get_integer_by_size<sizeof(ScalarExponent)>::unsigned_type;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type safe_abs(const ScalarExponent& exp) {
-    ScalarExponent mask = numext::signbit(exp);
-    safe_abs_type result = safe_abs_type(exp ^ mask);
-    return result + safe_abs_type(ScalarExponent(1) & mask);
-  }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const safe_abs_type& exp) {
-    return exp % safe_abs_type(2) != safe_abs_type(0);
-  }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type floor_div_two(const safe_abs_type& exp) {
-    return exp >> safe_abs_type(1);
-  }
-};
-
-template <typename Packet, typename ScalarExponent,
-          bool ReciprocateIfExponentIsNegative =
-              !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned>
-struct reciprocate {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
-    using Scalar = typename unpacket_traits<Packet>::type;
-    const Packet cst_pos_one = pset1<Packet>(Scalar(1));
-    return exponent < 0 ? pdiv(cst_pos_one, x) : x;
-  }
-};
-
-template <typename Packet, typename ScalarExponent>
-struct reciprocate<Packet, ScalarExponent, false> {
-  // pdiv not defined, nor necessary for integer base types
-  // if the exponent is unsigned, then the exponent cannot be negative
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent&) { return x; }
-};
-
-template <typename Packet, typename ScalarExponent>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) {
-  using Scalar = typename unpacket_traits<Packet>::type;
-  using ExponentHelper = exponent_helper<ScalarExponent>;
-  using AbsExponentType = typename ExponentHelper::safe_abs_type;
-  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
-  if (exponent == ScalarExponent(0)) return cst_pos_one;
-
-  Packet result = reciprocate<Packet, ScalarExponent>::run(x, exponent);
-  Packet y = cst_pos_one;
-  AbsExponentType m = ExponentHelper::safe_abs(exponent);
-
-  while (m > 1) {
-    bool odd = ExponentHelper::is_odd(m);
-    if (odd) y = pmul(y, result);
-    result = pmul(result, result);
-    m = ExponentHelper::floor_div_two(m);
-  }
-
-  return pmul(y, result);
-}
-
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
-    const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
-  const Packet exponent_packet = pset1<Packet>(exponent);
-  return generic_pow_impl(x, exponent_packet);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
-    const Scalar& x, const Scalar& exponent) {
-  return numext::pow(x, exponent);
-}
-
-template <typename Packet, typename ScalarExponent>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
-                                                                         const ScalarExponent& exponent) {
-  using Scalar = typename unpacket_traits<Packet>::type;
-
-  // non-integer base and exponent case
-  const Packet cst_pos_zero = pzero(x);
-  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
-  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
-  const Packet cst_true = ptrue<Packet>(x);
-
-  const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
-  const bool exponent_is_neg = exponent < ScalarExponent(0);
-  const bool exponent_is_pos = exponent > ScalarExponent(0);
-
-  const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
-  const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
-  const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
-  const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
-  const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
-
-  const Packet x_is_le_zero = pcmp_le(x, cst_pos_zero);
-  const Packet x_is_ge_zero = pcmp_le(cst_pos_zero, x);
-  const Packet x_is_zero = pand(x_is_le_zero, x_is_ge_zero);
-
-  const Packet abs_x = pabs(x);
-  const Packet abs_x_is_le_one = pcmp_le(abs_x, cst_pos_one);
-  const Packet abs_x_is_ge_one = pcmp_le(cst_pos_one, abs_x);
-  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
-  const Packet abs_x_is_one = pand(abs_x_is_le_one, abs_x_is_ge_one);
-
-  Packet pow_is_inf_if_exp_is_neg = por(x_is_zero, pand(abs_x_is_le_one, exp_is_inf));
-  Packet pow_is_inf_if_exp_is_pos = por(abs_x_is_inf, pand(abs_x_is_ge_one, exp_is_inf));
-  Packet pow_is_one = pand(abs_x_is_one, por(exp_is_inf, x_is_ge_zero));
-
-  Packet result = powx;
-  result = por(x_is_le_zero, result);
-  result = pselect(pow_is_inf_if_exp_is_neg, pand(cst_pos_inf, exp_is_neg), result);
-  result = pselect(pow_is_inf_if_exp_is_pos, pand(cst_pos_inf, exp_is_pos), result);
-  result = por(exp_is_nan, result);
-  result = pselect(pow_is_one, cst_pos_one, result);
-  return result;
-}
-
-template <typename Packet, typename ScalarExponent,
-          std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
-  using Scalar = typename unpacket_traits<Packet>::type;
-
-  // signed integer base, signed integer exponent case
-
-  // This routine handles negative exponents.
-  // The return value is either 0, 1, or -1.
-  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
-  const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
-  const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
-
-  const Packet abs_x = pabs(x);
-  const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
-
-  Packet result = pselect(exp_is_odd, x, abs_x);
-  result = pselect(abs_x_is_one, result, pzero<Packet>(x));
-  return result;
-}
-
-template <typename Packet, typename ScalarExponent,
-          std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) {
-  using Scalar = typename unpacket_traits<Packet>::type;
-
-  // unsigned integer base, signed integer exponent case
-
-  // This routine handles negative exponents.
-  // The return value is either 0 or 1
-
-  const Scalar pos_one = Scalar(1);
-
-  const Packet cst_pos_one = pset1<Packet>(pos_one);
-
-  const Packet x_is_one = pcmp_eq(x, cst_pos_one);
-
-  return pand(x_is_one, x);
-}
-
-}  // end namespace unary_pow
-
-template <typename Packet, typename ScalarExponent,
-          bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger,
-          bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger,
-          bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned>
-struct unary_pow_impl;
-
-template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
-struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
-    const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
-    if (exponent_is_integer) {
-      // The simple recursive doubling implementation is only accurate to 3 ulps
-      // for integer exponents in [-3:7]. Since this is a common case, we
-      // specialize it here.
-      bool use_repeated_squaring =
-          (exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
-      return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
-    } else {
-      Packet result = unary_pow::gen_pow(x, exponent);
-      result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
-      return result;
-    }
-  }
-};
-
-template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
-struct unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned> {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
-    return unary_pow::int_pow(x, exponent);
-  }
-};
-
-template <typename Packet, typename ScalarExponent>
-struct unary_pow_impl<Packet, ScalarExponent, true, true, true> {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
-    if (exponent < ScalarExponent(0)) {
-      return unary_pow::handle_negative_exponent(x, exponent);
-    } else {
-      return unary_pow::int_pow(x, exponent);
-    }
-  }
-};
-
-template <typename Packet, typename ScalarExponent>
-struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
-    return unary_pow::int_pow(x, exponent);
-  }
-};
-
-// This function computes exp2(x) = exp(ln(2) * x).
-// To improve accuracy, the product ln(2)*x is computed using the twoprod
-// algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is
-// computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This
-// correction step this reduces the maximum absolute error as follows:
-//
-// type   | max error (simple product) | max error (twoprod) |
-// -----------------------------------------------------------
-// float  |       35 ulps              |       4 ulps        |
-// double |      363 ulps              |     110 ulps        |
-//
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  constexpr int max_exponent = std::numeric_limits<Scalar>::max_exponent;
-  constexpr int digits = std::numeric_limits<Scalar>::digits;
-  constexpr Scalar max_cap = Scalar(max_exponent + 1);
-  constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1);
-  Packet x = pmax(pmin(_x, pset1<Packet>(max_cap)), pset1<Packet>(min_cap));
-  Packet p_hi, p_lo;
-  twoprod(pset1<Packet>(Scalar(EIGEN_LN2)), x, p_hi, p_lo);
-  Packet exp2_hi = pexp(p_hi);
-  Packet exp2_lo = padd(pset1<Packet>(Scalar(1)), p_lo);
-  return pmul(exp2_hi, exp2_lo);
-}
+//----------------------------------------------------------------------
+// Rounding Functions
+//----------------------------------------------------------------------
 
 template <typename Packet>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 673954e92d4..4534c9d7e7c 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -70,11 +70,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Pack
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x);
 
-/** \internal \returns log(x) for single precision float */
+/** \internal \returns log(x) for double precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x);
 
-/** \internal \returns log2(x) for single precision float */
+/** \internal \returns log2(x) for double precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x);
 
@@ -82,6 +82,26 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Pa
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x);
 
+/** \internal \returns log(1+x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(const Packet& x);
+
+/** \internal \returns log(1+x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x);
+
+/** \internal \returns log(1+x) for single precision float; thin wrapper that forwards x to generic_log1p_float. */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog1p_float(const Packet& x) {
+  return generic_log1p_float(x);
+}
+
+/** \internal \returns log(1+x) for double precision float; thin wrapper that forwards x to generic_log1p_double. */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog1p_double(const Packet& x) {
+  return generic_log1p_double(x);
+}
+
 /** \internal \returns exp(x)-1 */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x);
@@ -95,7 +115,7 @@ template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& x);
 
 /** \internal \returns exp(x) for single precision float */
-template <typename Packet>
+template <typename Packet, bool IsFinite = false>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x);
 
 /** \internal \returns exp(x) for double precision real numbers */
@@ -110,6 +130,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Pack
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
 
+/** \internal \returns tan(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x);
+
 /** \internal \returns sin(x) for double precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x);
@@ -118,6 +142,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Pac
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x);
 
+/** \internal \returns tan(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x);
+
 /** \internal \returns asin(x) for single precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
@@ -142,6 +170,46 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Pa
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x);
 
+/** \internal \returns sinh(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psinh_float(const Packet& x);
+
+/** \internal \returns sinh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psinh_double(const Packet& x);
+
+/** \internal \returns cosh(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcosh_float(const Packet& x);
+
+/** \internal \returns cosh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcosh_double(const Packet& x);
+
+/** \internal \returns asinh(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasinh_float(const Packet& x);
+
+/** \internal \returns asinh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasinh_double(const Packet& x);
+
+/** \internal \returns acosh(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacosh_float(const Packet& x);
+
+/** \internal \returns acosh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacosh_double(const Packet& x);
+
+/** \internal \returns log10(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog10_float(const Packet& x);
+
+/** \internal \returns log10(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog10_double(const Packet& x);
+
 /** \internal \returns sqrt(x) for complex types */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a);
@@ -150,6 +218,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const P
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y);
 
+/** \internal \returns x * y for complex types */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pmul_complex(const Packet& x, const Packet& y);
+
 template <typename Packet, int N>
 struct ppolevl;
 
@@ -196,30 +268,71 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
 #define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET) \
   EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET)                 \
   EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(tan, PACKET)                 \
   EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET)                \
   EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(sinh, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(cosh, PACKET)                \
   EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(asinh, PACKET)               \
+  EIGEN_FLOAT_PACKET_FUNCTION(acosh, PACKET)               \
   EIGEN_FLOAT_PACKET_FUNCTION(atanh, PACKET)               \
   EIGEN_FLOAT_PACKET_FUNCTION(log, PACKET)                 \
   EIGEN_FLOAT_PACKET_FUNCTION(log2, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(log10, PACKET)               \
   EIGEN_FLOAT_PACKET_FUNCTION(exp, PACKET)                 \
   EIGEN_FLOAT_PACKET_FUNCTION(cbrt, PACKET)                \
   EIGEN_GENERIC_PACKET_FUNCTION(expm1, PACKET)             \
   EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)              \
-  EIGEN_GENERIC_PACKET_FUNCTION(log1p, PACKET)             \
+  EIGEN_FLOAT_PACKET_FUNCTION(log1p, PACKET)               \
   EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)
 
 #define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \
-  EIGEN_DOUBLE_PACKET_FUNCTION(atanh, PACKET)               \
-  EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET)                 \
   EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET)                 \
   EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(tan, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(sinh, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(cosh, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(tanh, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(asinh, PACKET)               \
+  EIGEN_DOUBLE_PACKET_FUNCTION(acosh, PACKET)               \
+  EIGEN_DOUBLE_PACKET_FUNCTION(atanh, PACKET)               \
+  EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET)                 \
   EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(log10, PACKET)               \
   EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)                 \
-  EIGEN_DOUBLE_PACKET_FUNCTION(tanh, PACKET)                \
   EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, PACKET)                \
-  EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)               \
-  EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)
+  EIGEN_GENERIC_PACKET_FUNCTION(expm1, PACKET)              \
+  EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)               \
+  EIGEN_DOUBLE_PACKET_FUNCTION(log1p, PACKET)               \
+  EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)
+
+// Macro to instantiate complex math function specializations (psqrt, plog, pexp)
+// that delegate to the generic implementations. Use in arch-specific Complex.h files.
+#define EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(PacketType)                  \
+  template <>                                                             \
+  EIGEN_STRONG_INLINE PacketType psqrt<PacketType>(const PacketType& a) { \
+    return psqrt_complex<PacketType>(a);                                  \
+  }                                                                       \
+  template <>                                                             \
+  EIGEN_STRONG_INLINE PacketType plog<PacketType>(const PacketType& a) {  \
+    return plog_complex<PacketType>(a);                                   \
+  }                                                                       \
+  template <>                                                             \
+  EIGEN_STRONG_INLINE PacketType pexp<PacketType>(const PacketType& a) {  \
+    return pexp_complex<PacketType>(a);                                   \
+  }
+
+// Variant without pexp, for backends where pexp needs special handling for a given packet type.
+#define EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS_NO_EXP(PacketType)           \
+  template <>                                                             \
+  EIGEN_STRONG_INLINE PacketType psqrt<PacketType>(const PacketType& a) { \
+    return psqrt_complex<PacketType>(a);                                  \
+  }                                                                       \
+  template <>                                                             \
+  EIGEN_STRONG_INLINE PacketType plog<PacketType>(const PacketType& a) {  \
+    return plog_complex<PacketType>(a);                                   \
+  }
 
 }  // end namespace internal
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathPolynomials.h b/Eigen/src/Core/arch/Default/GenericPacketMathPolynomials.h
new file mode 100644
index 00000000000..a8ec6aebbc4
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathPolynomials.h
@@ -0,0 +1,151 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_POLYNOMIALS_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_POLYNOMIALS_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Packet, int N>
+struct ppolevl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);  // N == 0 uses the specialization below.
+    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));  // p_{N-1}(x) * x + coeff[N]
+  }
+};
+
+template <typename Packet>
+struct ppolevl<Packet, 0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_UNUSED_VARIABLE(x);  // Degree 0: the result is the constant term and does not depend on x.
+    return pset1<Packet>(coeff[0]);
+  }
+};
+
+/* chbevl (modified for Eigen)
+ *
+ *     Evaluate Chebyshev series
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N], chebevl();
+ *
+ * y = chbevl( x, coef, N );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates the series
+ *
+ *        N-1
+ *         - '
+ *  y  =   >   coef[i] T (x/2)
+ *         -            i
+ *        i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero
+ * order term is last in the array.  Note N is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine.  This maps x from (a, b) to (-1, 1),
+ * over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
+ * this becomes x -> 4a/x - 1.
+ *
+ *
+ *
+ * SPEED:
+ *
+ * Taking advantage of the recurrence properties of the
+ * Chebyshev polynomials, the routine requires one more
+ * addition per loop than evaluating a nested polynomial of
+ * the same degree.
+ *
+ */
+
+template <typename Packet, int N>
+struct pchebevl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
+                                                          const typename unpacket_traits<Packet>::type coef[]) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    Packet b0 = pset1<Packet>(coef[0]);
+    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
+    Packet b2 = b1;  // Zero, so N == 1 (loop skipped) never reads an uninitialized b2.
+    // Recurrence over the remaining coefficients: b0 <- x * b1 + coef[i] - b2.
+    for (int i = 1; i < N; i++) {
+      b2 = b1;
+      b1 = b0;
+      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
+    }
+    // The series value is half the difference between the newest and the two-steps-old accumulator.
+    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
+  }
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_POLYNOMIALS_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathPow.h b/Eigen/src/Core/arch/Default/GenericPacketMathPow.h
new file mode 100644
index 00000000000..4ab75b91470
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathPow.h
@@ -0,0 +1,724 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_POW_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_POW_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+//----------------------------------------------------------------------
+// Cubic Root Functions
+//----------------------------------------------------------------------
+
+// This function implements a single step of Halley's iteration for
+// computing x = y^(1/3):
+//   x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
+                                                                                      const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));                  // x_k^3
+  Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);  // 2 * x_k^3 + y
+  Packet num = psub(x_k_cb, y);                               // x_k^3 - y
+  Packet r = pdiv(num, denom);                                // correction factor from the formula above
+  return pnmadd(x_k, r, x_k);                                 // x_k - x_k * r
+}
+
+// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+// interval [0.125,1]. Returns y; the exponent part is written to e_div3.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Extract the significand s in the range [0.5,1) and exponent e, such that
+  // x = 2^e * s.
+  Packet e, s;
+  s = pfrexp(x, e);
+
+  // Split the exponent into a part divisible by 3 and the remainder.
+  // e = 3*e_div3 + e_mod3.
+  constexpr Scalar kOneThird = Scalar(1) / 3;
+  e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));            // e_div3 = ceil(e / 3)
+  Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);  // e_mod3 = e - 3 * e_div3
+
+  // Replace s by y = (s * 2^e_mod3).
+  return pldexp_fast(s, e_mod3);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
+                                                                                       const Packet& abs_root) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Gives abs_root the sign of x, and returns x itself wherever x is zero, infinite or NaN.
+  // Set sign: -0.0 has only the sign bit set, so the AND below isolates the sign bit of x.
+  const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
+  const Packet x_sign = pand(sign_mask, x);
+  Packet root = por(x_sign, abs_root);
+
+  // Pass non-finite and zero values of x straight through.
+  const Packet is_not_finite = por(pisinf(x), pisnan(x));
+  const Packet is_zero = pcmp_eq(pzero(x), x);
+  const Packet use_x = por(is_not_finite, is_zero);
+  return pselect(use_x, x, root);
+}
+
+// Generic implementation of cbrt(x) for float.
+//
+// The algorithm computes the cubic root of the input by first
+// decomposing it into an exponent and significand
+//   x = s * 2^e.
+//
+// We can then write the cube root as
+//
+//   x^(1/3) = 2^(e/3) * s^(1/3)
+//           = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
+//           = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
+//           = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
+//
+// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
+//
+// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
+// approximated using a cubic polynomial and subsequently refined using a
+// single step of Halley's iteration, and finally the two terms are combined
+// using pldexp_fast.
+//
+// Note: Many alternatives exist for implementing cbrt. See, for example,
+// the excellent discussion in Kahan's note:
+//   https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
+// This particular implementation was found to be very fast and accurate
+// among several alternatives tried, but is probably not "optimal" on all
+// platforms.
+//
+// This is accurate to 2 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 5.22e-3.
+  // The polynomial was computed using Rminimax.
+  constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
+                             3.408401906490325927734375e-01f};
+  Packet r = ppolevl<Packet, 3>::run(y, alpha);
+
+  // Take one step of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3)
+  r = pldexp_fast(r, e_div3);
+
+  return cbrt_special_cases_and_sign(x, r);
+}
+
+// Generic implementation of cbrt(x) for double.
+//
+// The algorithm is identical to the one for float except that a different initial
+// approximation is used for y^(1/3) and two Halley iteration steps are performed.
+//
+// This is accurate to 1 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 0.016.
+  // The polynomial was computed using Rminimax.
+  constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
+                              1.072314636518546304699839311069808900356292724609375e+00,
+                              3.81249427609571867048288140722434036433696746826171875e-01};
+  Packet r = ppolevl<Packet, 2>::run(y, alpha);
+
+  // Take two steps of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3).
+  r = pldexp_fast(r, e_div3);
+  return cbrt_special_cases_and_sign(x, r);
+}
+
+//----------------------------------------------------------------------
+// Power Functions (accurate_log2, generic_pow, unary_pow)
+//----------------------------------------------------------------------
+
+// Generic version: returns log2(x) as a double word (log2_x_hi, log2_x_lo) with hi = plog2(x) and lo = pzero(x).
+template <typename Scalar>
+struct accurate_log2 {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) const {
+    log2_x_hi = plog2(x);
+    log2_x_lo = pzero(x);  // No extra precision is carried in the low word here.
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Rminimax tool,
+// see https://gitlab.inria.fr/sfilip/rminimax.
+// Command line:
+//   $ ratapprox --function="log2(1+x)/x"  --dom='[-0.2929,0.41422]'
+//   --type=[10,0]
+//       --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
+//
+// The resulting implementation of pow(x,y) is accurate to 3 ulps.
+template <>
+struct accurate_log2<float> {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) const {
+    // Split the two lowest-order coefficients into a double-word representation.
+    constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
+    constexpr float kC0_hi = static_cast<float>(kC0);
+    constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
+    const Packet c0_hi = pset1<Packet>(kC0_hi);
+    const Packet c0_lo = pset1<Packet>(kC0_lo);
+
+    constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
+    constexpr float kC1_hi = static_cast<float>(kC1);
+    constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
+    const Packet c1_hi = pset1<Packet>(kC1_hi);
+    const Packet c1_lo = pset1<Packet>(kC1_lo);
+
+    constexpr float c[] = {
+        9.7010828554630279541015625e-02,  -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
+        -1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01,  -2.4046677350997924804687500e-01,
+        2.8857553005218505859375000e-01,  -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
+
+    // Evaluate the higher order terms in the polynomial using
+    // standard arithmetic.
+    const Packet one = pset1<Packet>(1.0f);
+    const Packet x = psub(z, one);
+    Packet p = ppolevl<Packet, 8>::run(x, c);
+    // Evaluate the final two steps in Horner's rule using double-word
+    // arithmetic.
+    Packet p_hi, p_lo;
+    twoprod(x, p, p_hi, p_lo);
+    fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
+    twoprod(p_hi, p_lo, x, p_hi, p_lo);
+    fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
+    // Multiply by x to recover log2(z).
+    twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// doubles in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+
+template <>
+struct accurate_log2<double> {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) const {
+    // We use a transformation of variables:
+    //    r = c * (x-1) / (x+1),
+    // such that
+    //    log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r).
+    // The function f(r) can be approximated well using an odd polynomial
+    // of the form
+    //   P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r,
+    // For the implementation of log2<double> here, Q is of degree 6 with
+    // coefficient represented in working precision (double), while C is a
+    // constant represented in extra precision as a double word to achieve
+    // full accuracy.
+    //
+    // The polynomial coefficients were computed by the Sollya script:
+    //
+    // c = 2 / log(2);
+    // trans = c * (x-1)/(x+1);
+    // itrans = (1+x/c)/(1-x/c);
+    // interval=[trans(sqrt(0.5)); trans(sqrt(2))];
+    // print(interval);
+    // f = log2(itrans(x));
+    // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating);
+    const Packet q12 = pset1<Packet>(2.87074255468000586e-9);
+    const Packet q10 = pset1<Packet>(2.38957980901884082e-8);
+    const Packet q8 = pset1<Packet>(2.31032094540014656e-7);
+    const Packet q6 = pset1<Packet>(2.27279857398537278e-6);
+    const Packet q4 = pset1<Packet>(2.31271023278625638e-5);
+    const Packet q2 = pset1<Packet>(2.47556738444535513e-4);
+    const Packet q0 = pset1<Packet>(2.88543873228900172e-3);
+    const Packet C_hi = pset1<Packet>(0.0400377511598501157);
+    const Packet C_lo = pset1<Packet>(-4.77726582251425391e-19);
+    const Packet one = pset1<Packet>(1.0);
+
+    const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
+    const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
+    // c * (x - 1)
+    Packet t_hi, t_lo;
+    // t = c * (x-1)
+    twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo);
+    // r = c * (x-1) / (x+1),
+    Packet r_hi, r_lo;
+    doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo);
+
+    // r2 = r * r
+    Packet r2_hi, r2_lo;
+    twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
+    // r4 = r2 * r2
+    Packet r4_hi, r4_lo;
+    twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo);
+
+    // Evaluate Q(r^2) in working precision. We evaluate it in two parts
+    // (even and odd in r^2) to improve instruction level parallelism.
+    Packet q_even = pmadd(q12, r4_hi, q8);
+    Packet q_odd = pmadd(q10, r4_hi, q6);
+    q_even = pmadd(q_even, r4_hi, q4);
+    q_odd = pmadd(q_odd, r4_hi, q2);
+    q_even = pmadd(q_even, r4_hi, q0);
+    Packet q = pmadd(q_odd, r2_hi, q_even);
+
+    // Now evaluate the low order terms of P(x) in double word precision.
+    // In the following, due to the increasing magnitude of the coefficients
+    // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead
+    // of the slower twosum.
+    // Q(r^2) * r^2
+    Packet p_hi, p_lo;
+    twoprod(r2_hi, r2_lo, q, p_hi, p_lo);
+    // Q(r^2) * r^2 + C
+    Packet p1_hi, p1_lo;
+    fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo);
+    // (Q(r^2) * r^2 + C) * r^2
+    Packet p2_hi, p2_lo;
+    twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo);
+    // ((Q(r^2) * r^2 + C) * r^2 + 1)
+    Packet p3_hi, p3_lo;
+    fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo);
+
+    // log2(x) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r
+    twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This function implements the non-trivial case of pow(x,y) where x is
+// positive and y is (possibly) non-integer.
+// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
+// TODO(rmlarsen): We should probably add this as a packet op 'ppow', to make it
+// easier to specialize or turn off for specific types and/or backends.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Split x into exponent e_x and mantissa m_x.
+  Packet e_x;
+  Packet m_x = pfrexp(x, e_x);
+
+  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
+  constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
+  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
+  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
+  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
+
+  // Compute log2(m_x) with 6 extra bits of accuracy.
+  Packet rx_hi, rx_lo;
+  accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
+
+  // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
+  // precision using double word arithmetic.
+  Packet f1_hi, f1_lo, f2_hi, f2_lo;
+  twoprod(e_x, y, f1_hi, f1_lo);
+  twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
+  // Sum the two terms in f using double word arithmetic. We know
+  // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
+  // This means that we can use fast_twosum(f1,f2).
+  // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
+  // accuracy by violating the assumption of fast_twosum, because
+  // it's a no-op.
+  Packet f_hi, f_lo;
+  fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
+
+  // Split f into integer and fractional parts.
+  Packet n_z, r_z;
+  absolute_split(f_hi, n_z, r_z);
+  r_z = padd(r_z, f_lo);
+  Packet n_r;
+  absolute_split(r_z, n_r, r_z);
+  n_z = padd(n_z, n_r);
+
+  // We now have an accurate split of f = n_z + r_z and can compute
+  //   x^y = 2**{n_z + r_z} = exp2(r_z) * 2**{n_z}.
+  // Multiplication by the second factor can be done exactly using pldexp(), since
+  // it is an integer power of 2.
+  const Packet e_r = generic_exp2(r_z);
+
+  // Since we know that e_r is in [1/sqrt(2); sqrt(2)], we can use the fast version
+  // of pldexp to multiply by 2**{n_z} when |n_z| is sufficiently small.
+  constexpr Scalar kPldExpThresh = std::numeric_limits<Scalar>::max_exponent - 2;
+  const Packet pldexp_fast_unsafe = pcmp_lt(pset1<Packet>(kPldExpThresh), pabs(n_z));
+  if (predux_any(pldexp_fast_unsafe)) {
+    return pldexp(e_r, n_z);
+  }
+  return pldexp_fast(e_r, n_z);
+}
+
+// Generic implementation of pow(x,y).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
+    const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_zero = pset1<Packet>(Scalar(0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
+
+  const Packet x_abs = pabs(x);
+  Packet result = generic_pow_impl(x_abs, y);
+
+  // In the following we enforce the special case handling prescribed in
+  // https://en.cppreference.com/w/cpp/numeric/math/pow.
+
+  // Predicates for sign and magnitude of x.
+  const Packet x_is_negative = pcmp_lt(x, cst_zero);
+  const Packet x_is_zero = pcmp_eq(x, cst_zero);
+  const Packet x_is_one = pcmp_eq(x, cst_one);
+  const Packet x_has_signbit = psignbit(x);
+  const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
+  const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
+
+  // Predicates for sign and magnitude of y.
+  const Packet y_abs = pabs(y);
+  const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
+  const Packet y_is_negative = pcmp_lt(y, cst_zero);
+  const Packet y_is_zero = pcmp_eq(y, cst_zero);
+  const Packet y_is_one = pcmp_eq(y, cst_one);
+  // Predicates for whether y is integer and odd/even.
+  const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
+  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
+  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
+  const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
+  // Smallest exponent for which (1 + epsilon) overflows to infinity.
+  constexpr Scalar huge_exponent =
+      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
+  const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
+
+  // *  pow(base, exp) returns NaN if base is finite and negative
+  //    and exp is finite and non-integer.
+  result = pselect(pandnot(x_is_negative, y_is_int), cst_nan, result);
+
+  // * pow(±0, exp), where exp is negative, finite, and is an even integer or
+  // a non-integer, returns +∞
+  // * pow(±0, exp), where exp is positive non-integer or a positive even
+  // integer, returns +0
+  // * pow(+0, exp), where exp is a negative odd integer, returns +∞
+  // * pow(-0, exp), where exp is a negative odd integer, returns -∞
+  // * pow(+0, exp), where exp is a positive odd integer, returns +0
+  // * pow(-0, exp), where exp is a positive odd integer, returns -0
+  // Sign is flipped by the rule below.
+  result = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), result);
+
+  // pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
+  // and exp is an odd integer exponent.
+  result = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(result), result);
+
+  // * pow(base, -∞) returns +∞ for any |base|<1
+  // * pow(base, -∞) returns +0 for any |base|>1
+  // * pow(base, +∞) returns +0 for any |base|<1
+  // * pow(base, +∞) returns +∞ for any |base|>1
+  // * pow(±0, -∞) returns +∞
+  // * pow(-1, +-∞) = 1
+  Packet inf_y_val = pselect(pxor(y_is_negative, x_abs_gt_one), cst_inf, cst_zero);
+  inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
+  result = pselect(y_abs_is_huge, inf_y_val, result);
+
+  // * pow(+∞, exp) returns +0 for any negative exp
+  // * pow(+∞, exp) returns +∞ for any positive exp
+  // * pow(-∞, exp) returns -0 if exp is a negative odd integer.
+  // * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
+  //     even integer.
+  // * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
+  // * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
+  //     even integer.
+  auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
+  auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
+  result = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), result);
+
+  // All cases of NaN inputs return NaN, except the two below.
+  result = pselect(por(pisnan(x), pisnan(y)), cst_nan, result);
+
+  // * pow(base, 1) returns base.
+  // * pow(base, +/-0) returns 1, regardless of base, even NaN.
+  // * pow(+1, exp) returns 1, regardless of exponent, even NaN.
+  result = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, result));
+
+  return result;
+}
+
+template <typename Scalar>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
+    const Scalar& x, const Scalar& y) {
+  return numext::pow(x, y);  // Overload enabled when is_scalar<Scalar>::value: forwards to numext::pow.
+}
+
+namespace unary_pow {
+
+template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
+struct exponent_helper {
+  using safe_abs_type = ScalarExponent;
+  static constexpr ScalarExponent one_half = ScalarExponent(0.5);
+  // these routines assume that exp is an integer stored as a floating point type
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent safe_abs(const ScalarExponent& exp) {
+    return numext::abs(exp);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const ScalarExponent& exp) {
+    eigen_assert(((numext::isfinite)(exp) && exp == numext::floor(exp)) && "exp must be an integer");
+    ScalarExponent exp_div_2 = exp * one_half;
+    ScalarExponent floor_exp_div_2 = numext::floor(exp_div_2);
+    return exp_div_2 != floor_exp_div_2;
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent floor_div_two(const ScalarExponent& exp) {
+    ScalarExponent exp_div_2 = exp * one_half;
+    return numext::floor(exp_div_2);
+  }
+};
+
+template <typename ScalarExponent>
+struct exponent_helper<ScalarExponent, true> {
+  // If `exp` is a signed integer type, its absolute value is stored in the unsigned type of the same size,
+  // because e.g. for int32_t the value abs(-2147483648) == 2147483648 does not fit in the signed type.
+  using safe_abs_type = typename numext::get_integer_by_size<sizeof(ScalarExponent)>::unsigned_type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type safe_abs(const ScalarExponent& exp) {
+    ScalarExponent mask = numext::signbit(exp);  // presumably all ones if exp < 0, else zero -- TODO confirm
+    safe_abs_type result = safe_abs_type(exp ^ mask);  // flips the bits of exp wherever mask is set
+    return result + safe_abs_type(ScalarExponent(1) & mask);  // adds 1 when the low bit of mask is set
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const safe_abs_type& exp) {
+    return exp % safe_abs_type(2) != safe_abs_type(0);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type floor_div_two(const safe_abs_type& exp) {
+    return exp >> safe_abs_type(1);  // unsigned right shift by one bit
+  }
+};
+
+template <typename Packet, typename ScalarExponent,
+          bool ReciprocateIfExponentIsNegative =
+              !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned>
+struct reciprocate {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+    return exponent < 0 ? pdiv(cst_pos_one, x) : x;  // negative exponent: int_pow then raises 1/x to |exponent|
+  }
+};
+
+template <typename Packet, typename ScalarExponent>
+struct reciprocate<Packet, ScalarExponent, false> {
+  // No-op case: pdiv is not defined for integer base types, nor is it necessary for them.
+  // If the exponent type is unsigned, then the exponent cannot be negative.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent&) { return x; }
+};
+
+template <typename Packet, typename ScalarExponent>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  using ExponentHelper = exponent_helper<ScalarExponent>;
+  using AbsExponentType = typename ExponentHelper::safe_abs_type;
+  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+  if (exponent == ScalarExponent(0)) return cst_pos_one;  // a zero exponent yields 1 in every lane
+
+  Packet result = reciprocate<Packet, ScalarExponent>::run(x, exponent);  // x, or 1/x where reciprocation applies
+  Packet y = cst_pos_one;  // accumulates the factors picked up on odd steps
+  AbsExponentType m = ExponentHelper::safe_abs(exponent);
+
+  while (m > 1) {  // square-and-multiply over |exponent|
+    bool odd = ExponentHelper::is_odd(m);
+    if (odd) y = pmul(y, result);
+    result = pmul(result, result);
+    m = ExponentHelper::floor_div_two(m);
+  }
+
+  return pmul(y, result);  // fold the last squared power into the accumulated factors
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
+    const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
+  const Packet exponent_packet = pset1<Packet>(exponent);  // same exponent in every lane
+  // generic_pow_impl requires positive x, hence pabs(x); sign/error handling is done by the caller.
+  return generic_pow_impl(pabs(x), exponent_packet);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
+    const Scalar& x, const Scalar& exponent) {
+  return numext::pow(x, exponent);  // scalar overload: forwards to numext::pow (x is not made positive here)
+}
+
+// Handle special cases for pow(x, exponent) where both base and exponent are floating point and the exponent is a
+// non-integer scalar (uniform across all SIMD lanes), which allows scalar branches on exponent properties.
+// `powx` is the caller's provisional result (unary_pow::gen_pow(x, exponent) in this file) that gets patched up.
+template <typename Packet, typename ScalarExponent>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
+                                                                         const ScalarExponent& exponent) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  const Packet cst_zero = pzero(x);
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
+
+  const Packet abs_x = pabs(x);
+
+  // x < 0 with non-integer exponent -> NaN.
+  Packet result = pselect(pcmp_lt(x, cst_zero), cst_nan, powx);
+
+  if (!(numext::isfinite)(exponent)) {
+    if (exponent != exponent) {  // true only when exponent is NaN
+      // pow(x, NaN) = NaN, except pow(+1, NaN) = 1.
+      result = pselect(pcmp_eq(x, cst_one), cst_one, cst_nan);
+    } else {
+      // Exponent is +inf or -inf.
+      const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
+      if (exponent > ScalarExponent(0)) {
+        // pow(x, +inf): |x| > 1 -> +inf, |x| < 1 -> 0, |x| == 1 -> 1.
+        result = pselect(pcmp_lt(cst_one, abs_x), cst_inf, cst_zero);
+      } else {
+        // pow(x, -inf): |x| < 1 -> +inf, |x| > 1 -> 0, |x| == 1 -> 1.
+        result = pselect(pcmp_lt(abs_x, cst_one), cst_inf, cst_zero);
+      }
+      // pow(+-1, +-inf) = 1.
+      result = pselect(abs_x_is_one, cst_one, result);
+    }
+  } else {
+    // Finite non-integer exponent.
+    const Packet x_is_zero = pcmp_eq(x, cst_zero);
+    const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_inf);
+    if (exponent < ScalarExponent(0)) {
+      // pow(+-0, negative non-integer) = +inf. pow(+-inf, negative) = +0.
+      result = pselect(x_is_zero, cst_inf, result);
+      result = pselect(abs_x_is_inf, cst_zero, result);
+    } else {
+      // pow(+-0, positive non-integer) = +0. pow(+-inf, positive) = +inf.
+      result = pselect(x_is_zero, cst_zero, result);
+      result = pselect(abs_x_is_inf, cst_inf, result);
+    }
+  }
+
+  // NaN base produces NaN. This overrides all cases above, but pow(NaN, 0) = 1
+  // and pow(NaN, integer) are handled by the integer exponent path and never
+  // reach this function.
+  result = pselect(pisnan(x), cst_nan, result);
+
+  return result;
+}
+
+template <typename Packet, typename ScalarExponent,
+          std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+
+  // Signed integer base, signed integer exponent case.
+
+  // This routine handles negative exponents (the caller in this file only dispatches here when exponent < 0).
+  // The return value is either 0, 1, or -1.
+  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+  const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
+  const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
+
+  const Packet abs_x = pabs(x);
+  const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
+
+  Packet result = pselect(exp_is_odd, x, abs_x);  // odd exponent keeps the sign of x, even exponent drops it
+  result = pselect(abs_x_is_one, result, pzero<Packet>(x));  // any base with |x| != 1 (including 0) yields 0
+  return result;
+}
+
+template <typename Packet, typename ScalarExponent,
+          std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+
+  // Unsigned integer base, signed integer exponent case.
+
+  // This routine handles negative exponents; the exponent value itself is not needed and is ignored.
+  // The return value is either 0 or 1.
+
+  const Scalar pos_one = Scalar(1);
+
+  const Packet cst_pos_one = pset1<Packet>(pos_one);
+
+  const Packet x_is_one = pcmp_eq(x, cst_pos_one);
+
+  return pand(x_is_one, x);  // keeps x (== 1) in lanes where x == 1; presumably zero elsewhere (mask semantics)
+}
+
+}  // end namespace unary_pow
+
+template <typename Packet, typename ScalarExponent,
+          bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger,
+          bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger,
+          bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned>
+struct unary_pow_impl;  // declaration only; behaviour is selected by the base/exponent trait flags above
+
+template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
+struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {  // non-integer base and exponent types
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
+    if (exponent_is_integer) {
+      // The simple recursive doubling implementation is only accurate to 3 ulps
+      // for integer exponents in [-3:7]. Since this is a common case, we
+      // specialize it here; other integer-valued exponents go through generic_pow.
+      bool use_repeated_squaring =
+          (exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
+      return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
+    } else {
+      Packet result = unary_pow::gen_pow(x, exponent);  // provisional value; special cases are patched next
+      result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
+      return result;
+    }
+  }
+};
+
+template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
+struct unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned> {  // non-integer base, integer exponent
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    return unary_pow::int_pow(x, exponent);
+  }
+};
+
+template <typename Packet, typename ScalarExponent>
+struct unary_pow_impl<Packet, ScalarExponent, true, true, true> {  // integer base, signed integer exponent
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    if (exponent < ScalarExponent(0)) {
+      return unary_pow::handle_negative_exponent(x, exponent);  // result is limited to 0, 1 or -1
+    } else {
+      return unary_pow::int_pow(x, exponent);
+    }
+  }
+};
+
+template <typename Packet, typename ScalarExponent>
+struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {  // integer base, unsigned integer exponent
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    return unary_pow::int_pow(x, exponent);
+  }
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_POW_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h b/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
new file mode 100644
index 00000000000..5514ffd959f
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
@@ -0,0 +1,1067 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_TRIG_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_TRIG_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+//----------------------------------------------------------------------
+// Trigonometric Functions
+//----------------------------------------------------------------------
+
+// Enum for selecting which function to compute. SinCos yields sin on even lanes and cos on odd lanes, each of the
+// lane's own input (see the peven_mask selects below), so the documented intent SinCos([a, *, b, *]) =
+// [sin(a), cos(a), sin(b), cos(b)] presumably relies on the caller duplicating entries as [a, a, b, b] -- confirm.
+enum class TrigFunction : uint8_t { Sin, Cos, Tan, SinCos };
+
+// The following code is inspired by the following stack-overflow answer:
+//   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+//  - By-pass calls to frexp.
+//  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
+//    (1) balancing the mantissa and exponent so that the required bits of 2/pi are
+//    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
+//  - Avoid a branch in rounding and extraction of the remaining fractional part.
+// Overall, I measured a speed up higher than x2 on x86-64.
+inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
+  using Eigen::numext::int32_t;
+  using Eigen::numext::int64_t;
+  using Eigen::numext::uint32_t;
+  using Eigen::numext::uint64_t;
+
+  const double pio2_62 = 3.4061215800865545e-19;     // pi/2 * 2^-62
+  const uint64_t zero_dot_five = uint64_t(1) << 61;  // 0.5 in 2.62-bit fixed-point format
+
+  // 192 bits of 2/pi for Payne-Hanek reduction
+  // Bits are introduced by packet of 8 to enable aligned reads.
+  static const uint32_t two_over_pi[] = {
+      0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
+      0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
+      0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
+
+  uint32_t xi = numext::bit_cast<uint32_t>(xf);
+  // Below, -118 = -126 + 8.
+  //   -126 is to get the exponent,
+  //   +8 is to enable alignment of 2/pi's bits on 8 bits.
+  // This is possible because the fractional part of x has only 24 meaningful bits.
+  uint32_t e = (xi >> 23) - 118;  // the sign bit is not masked: xf is assumed non-negative (the caller passes |x|)
+  // Extract the mantissa and shift it to align it wrt the exponent
+  xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
+
+  uint32_t i = e >> 3;  // needs i >= 1 for the i - 1 index below; callers here only pass |x| >= huge_th
+  uint32_t twoopi_1 = two_over_pi[i - 1];
+  uint32_t twoopi_2 = two_over_pi[i + 3];
+  uint32_t twoopi_3 = two_over_pi[i + 7];
+
+  // Compute x * 2/pi in 2.62-bit fixed-point format.
+  uint64_t p;
+  p = uint64_t(xi) * twoopi_3;
+  p = uint64_t(xi) * twoopi_2 + (p >> 32);
+  p = (uint64_t(xi * twoopi_1) << 32) + p;  // 32-bit multiply: only the low 32 bits of xi * twoopi_1 are kept
+
+  // Round to nearest: add 0.5 and extract integral part.
+  uint64_t q = (p + zero_dot_five) >> 62;
+  *quadrant = int(q);
+  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
+  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
+  //   r = (p-q)*pi/2,
+  // where the product can be carried out with sufficient accuracy using double precision.
+  p -= q << 62;
+  return float(double(int64_t(p)) * pio2_62);
+}
+
+template <TrigFunction Func, typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+#if EIGEN_COMP_GNUC_STRICT
+    __attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+    Packet
+    psincos_float(const Packet& _x) {  // float packets; Func selects sin, cos, tan or interleaved sin/cos
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+
+  const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f);  // 2/PI
+  const Packet cst_rounding_magic = pset1<Packet>(12582912);           // 1.5 * 2^23 for rounding
+  const PacketI csti_1 = pset1<PacketI>(1);
+  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
+
+  Packet x = pabs(_x);
+
+  // Scale x by 2/Pi to find x's quadrant.
+  Packet y = pmul(x, cst_2oPI);
+
+  // Rounding trick to find nearest integer:
+  Packet y_round = padd(y, cst_rounding_magic);
+  EIGEN_OPTIMIZATION_BARRIER(y_round)
+  PacketI y_int = preinterpret<PacketI>(y_round);  // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic);           // nearest integer to x * (2/pi)
+
+// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
+// using "Extended precision modular arithmetic"
+#if defined(EIGEN_VECTORIZE_FMA)
+  // This version requires true FMA for high accuracy.
+  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
+  constexpr float huge_th = (Func == TrigFunction::Sin) ? 117435.992f : 71476.0625f;
+  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+#else
+  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
+  // and 2 ULP up to:
+  constexpr float huge_th = (Func == TrigFunction::Sin) ? 25966.f : 18838.f;
+  x = pmadd(y, pset1<Packet>(-1.5703125), x);  // = 0xbfc90000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x);  // = 0xb9fdc000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x);                      // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x);  // = 0x2e74b9ee
+
+// For the record, the following set of coefficients maintain 2ULP up
+// to a slightly larger range:
+// constexpr float huge_th = (Func == TrigFunction::Sin) ? 51981.f : 39086.125f;
+// but it slightly fails to maintain 1ULP for two values of sin below pi.
+// x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+// x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+// x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+// x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+// For the record, with only 3 iterations it is possible to maintain
+// 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+// The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+#endif
+
+  if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize];
+    pstoreu(vals, pabs(_x));
+    pstoreu(x_cpy, x);
+    pstoreu(y_int2, y_int);
+    for (int k = 0; k < PacketSize; ++k) {  // scalar re-reduction of huge, finite lanes only; others are kept
+      float val = vals[k];
+      if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
+    }
+    x = ploadu<Packet>(x_cpy);
+    y_int = ploadu<PacketI>(y_int2);
+  }
+
+  // Get the polynomial selection mask from the lowest bit of y_int (mask is set where that bit is 0).
+  // We'll calculate both (sin and cos) polynomials and then select from the two.
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
+
+  Packet x2 = pmul(x, x);
+
+  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  // octave/matlab code to compute those coefficients:
+  //    x = (0:0.0001:pi/4)';
+  //    A = [x.^3 x.^5 x.^7];
+  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
+  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+  //
+  Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
+  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
+  y2 = pmul(y2, x2);
+  y2 = pmadd(y2, x, x);
+
+  // Select the correct result from the two polynomials.
+  // Compute the sign to apply to the polynomial.
+  // sin: sign = second_bit(y_int) xor signbit(_x)
+  // cos: sign = second_bit(y_int+1)
+  Packet sign_bit = (Func == TrigFunction::Sin) ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
+                                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
+
+  if ((Func == TrigFunction::SinCos) || (Func == TrigFunction::Tan)) {
+    Packet peven = peven_mask(x);
+    Packet ysin = pselect(poly_mask, y2, y1);
+    Packet ycos = pselect(poly_mask, y1, y2);
+    Packet sign_bit_sin = pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)));
+    Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+    sign_bit_sin = pand(sign_bit_sin, cst_sign_mask);  // clear all but left most bit
+    sign_bit_cos = pand(sign_bit_cos, cst_sign_mask);  // clear all but left most bit
+    y = (Func == TrigFunction::SinCos) ? pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos))
+                                       : pdiv(pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
+  } else {
+    y = (Func == TrigFunction::Sin) ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
+    y = pxor(y, sign_bit);
+  }
+  return y;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
+  return psincos_float<TrigFunction::Sin>(x);  // sine variant of the shared float kernel
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
+  return psincos_float<TrigFunction::Cos>(x);  // cosine variant of the shared float kernel
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x) {
+  return psincos_float<TrigFunction::Tan>(x);  // tangent variant: the kernel divides its sin by its cos
+}
+
+// Pi/2 split into 3 double-precision parts (triple-double); cst_pio2_{1,2,3} return the negated parts -c1, -c2, -c3.
+// c1 + c2 + c3 = pi/2 to ~159 bits. Computed by Sollya.
+// c1 = RD(pi/2), c2 = RD(pi/2 - c1), c3 = RD(pi/2 - c1 - c2).
+template <typename Packet>
+Packet cst_pio2_1() {
+  return pset1<Packet>(-1.5707963267948965579989817342720925807952880859375);  // -c1 = -0x1.921fb54442d18p0
+}
+template <typename Packet>
+Packet cst_pio2_2() {
+  return pset1<Packet>(-6.12323399573676603586882014729198302312846062338790e-17);  // -c2 = -0x1.1a62633145c07p-54
+}
+template <typename Packet>
+Packet cst_pio2_3() {
+  return pset1<Packet>(1.4973849048591698329435081771059920083527504761695190e-33);  // -c3 = 0x1.f1976b7ed8fbcp-110
+}
+
+// Trigonometric argument reduction for double.
+// Computes t = x - q * pi/2 (the cst_pio2_* constants are negative); |t| <= pi/4 is expected for q ~ round(x * 2/pi).
+// Uses a triple-double split of pi/2 (cst_pio2_{1,2,3}).
+template <typename Packet>
+Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+  // With FMA, pmadd(a, b, c) = fl(a*b + c) in a single rounding,
+  // so Cody-Waite reduction is accurate even under catastrophic cancellation.
+  Packet t;
+  t = pmadd(cst_pio2_1<Packet>(), q, x);
+  t = pmadd(cst_pio2_2<Packet>(), q, t);
+  t = pmadd(cst_pio2_3<Packet>(), q, t);
+  return t;
+#else
+  // Without FMA, pmadd is mul + add (two roundings). For large q,
+  // pmul(pio2_1, q) rounds before the cancellation with x, losing
+  // catastrophic amounts of precision (observed: ~10 digits lost).
+  // Use error-free transformations to preserve accuracy.
+
+  // Compute q * pio2_1 exactly as a double-word using Dekker's algorithm.
+  Packet qp_hi, qp_lo;
+  twoprod(cst_pio2_1<Packet>(), q, qp_hi, qp_lo);
+
+  // Error-free addition of x and qp_hi using Knuth's 2sum.
+  // Returns t_hi + t_lo = x + qp_hi exactly, with t_hi = fl(x + qp_hi).
+  Packet t_hi = padd(x, qp_hi);
+  Packet v = psub(t_hi, x);
+  Packet t_lo = padd(psub(x, psub(t_hi, v)), psub(qp_hi, v));
+
+  // Accumulate the low part of the product and the remaining pi/2 terms.
+  t_lo = padd(t_lo, qp_lo);
+  t_lo = pmadd(cst_pio2_2<Packet>(), q, t_lo);
+  t_lo = pmadd(cst_pio2_3<Packet>(), q, t_lo);
+
+  return padd(t_hi, t_lo);
+#endif
+}
+
+template <TrigFunction Func, typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+#if EIGEN_COMP_GNUC_STRICT
+    __attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+    Packet
+    psincos_double(const Packet& x) {  // double packets; Func selects sin, cos, tan or interleaved sin/cos
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+
+  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u));
+
+  // If the argument is smaller than this value, use a simpler argument reduction
+  const double small_th = 15;
+  // If the argument is bigger than this value, use the non-vectorized std version
+  const double huge_th = 1e14;
+
+  // 2/PI as a double-word: hi + lo = 2/pi to ~107 bits. Computed by Sollya.
+  const Packet cst_2oPI_hi =
+      pset1<Packet>(0.63661977236758138243288840385503135621547698974609375);  // 0x1.45f306dc9c883p-1
+  const Packet cst_2oPI_lo =
+      pset1<Packet>(-3.9357353350364971763790381828183628368294820823718866e-17);  // -0x1.6b01ec5417056p-55
+  // Integer Packet constants
+  const PacketI cst_one = pset1<PacketI>(ScalarI(1));
+
+  Packet x_abs = pabs(x);
+
+  // Scale x by 2/Pi
+  PacketI q_int;
+  Packet s;
+
+  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {  // any lane >= small_th
+    // Medium path: use double-word product x * (2/pi) for precise quadrant computation.
+    Packet prod_hi, prod_lo;
+    twoprod(x_abs, cst_2oPI_hi, prod_hi, prod_lo);
+    // Correction for 2/pi truncation: add x * lo(2/pi)
+    prod_lo = pmadd(x_abs, cst_2oPI_lo, prod_lo);
+
+    // Round the double-word (prod_hi, prod_lo) to the nearest integer.
+    Packet q = pround(prod_hi);
+    // Compute exact fractional part to check if rounding was correct.
+    Packet frac = padd(psub(prod_hi, q), prod_lo);
+    // Correct if fractional part crossed +-0.5 boundary.
+    q = padd(q, pand(pcmp_lt(pset1<Packet>(0.5), frac), pset1<Packet>(1.0)));
+    q = padd(q, pand(pcmp_lt(frac, pset1<Packet>(-0.5)), pset1<Packet>(-1.0)));
+
+    q_int = pcast<Packet, PacketI>(q);
+    s = trig_reduce_small_double(x_abs, q);
+  } else {
+    // Small path: simple reduction with triple-double pi/2 split.
+    Packet qval_noround = pmul(x_abs, cst_2oPI_hi);
+    q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));  // rounds if pcast truncates -- confirm
+    Packet q = pcast<PacketI, Packet>(q_int);
+    s = trig_reduce_small_double(x_abs, q);
+  }
+
+  Packet ss = pmul(s, s);
+
+  // Minimax polynomial approximation of cos(x) on [-pi/4, pi/4].
+  // cos(x) = 1 + u * P(u), where u = x^2 and P is degree 6 (7 FMAs total).
+  // Coefficients computed by Sollya fpminimax. Max polynomial error ~1.3e-19.
+  Packet scos = pset1<Packet>(-1.1368926065317776472832699312119132152576472805094454088248312473297119140625e-11);
+  scos = pmadd(scos, ss, pset1<Packet>(2.0875905481768720039634091158002593413556269297259859740734100341796875e-09));
+  scos = pmadd(scos, ss, pset1<Packet>(-2.7557315712466412785356544880299711763882442028261721134185791015625e-07));
+  scos = pmadd(scos, ss, pset1<Packet>(2.480158729424286522739599714082459058772656135261058807373046875e-05));
+  scos = pmadd(scos, ss, pset1<Packet>(-1.388888888888178789471350427220386336557567119598388671875e-03));
+  scos = pmadd(scos, ss, pset1<Packet>(4.166666666666664353702032030923874117434024810791015625e-02));
+  scos = pmadd(scos, ss, pset1<Packet>(-0.5));
+  scos = pmadd(scos, ss, pset1<Packet>(1.0));
+
+  // Minimax polynomial approximation of sin(x) on [-pi/4, pi/4].
+  // sin(x) = x * (1 + u * R(u)), where u = x^2 and R is degree 5.
+  // Computed as: x + x * u * R(u) (6 FMAs + 1 mul).
+  // Coefficients computed by Sollya fpminimax. Max polynomial error ~1.0e-17.
+  Packet ssin = pset1<Packet>(1.59193066075142890698150587293845624470289834562208852730691432952880859375e-10);
+  ssin = pmadd(ssin, ss, pset1<Packet>(-2.50511517945670206974594627392927126408039839589037001132965087890625e-08));
+  ssin = pmadd(ssin, ss, pset1<Packet>(2.755731622544328228235042954619160582296899519860744476318359375e-06));
+  ssin = pmadd(ssin, ss, pset1<Packet>(-1.9841269837089632013978068858506276228581555187702178955078125e-04));
+  ssin = pmadd(ssin, ss, pset1<Packet>(8.333333333331312264835588621281203813850879669189453125e-03));
+  ssin = pmadd(ssin, ss, pset1<Packet>(-0.1666666666666666574148081281236954964697360992431640625));
+  ssin = pmul(ssin, ss);
+  ssin = pmadd(ssin, s, s);
+
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));  // lowest bit of q_int is 0
+
+  Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));  // bit 1 of q_int -> bit 63
+  Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
+  Packet sign_bit, sFinalRes;
+  if (Func == TrigFunction::Sin) {
+    sign_bit = sign_sin;
+    sFinalRes = pselect(poly_mask, ssin, scos);
+  } else if (Func == TrigFunction::Cos) {
+    sign_bit = sign_cos;
+    sFinalRes = pselect(poly_mask, scos, ssin);
+  } else if (Func == TrigFunction::Tan) {
+    sign_bit = pxor(sign_sin, sign_cos);
+    sFinalRes = pdiv(pselect(poly_mask, ssin, scos), pselect(poly_mask, scos, ssin));
+  } else if (Func == TrigFunction::SinCos) {
+    Packet peven = peven_mask(x);
+    sign_bit = pselect(peven, sign_sin, sign_cos);
+    sFinalRes = pselect(pxor(peven, poly_mask), scos, ssin);
+  }
+  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
+  sFinalRes = pxor(sFinalRes, sign_bit);
+
+  // For inputs above huge_th the medium-path reduction loses too much precision. A vectorized
+  // Payne-Hanek reduction was investigated and judged not worthwhile (high implementation cost
+  // for what is in practice a rare path), so these inputs fall back to the scalar libm.
+  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize];
+    pstoreu(x_cpy, x);
+    pstoreu(sincos_vals, sFinalRes);
+    for (int k = 0; k < PacketSize; ++k) {
+      double val = x_cpy[k];
+      if (std::abs(val) > huge_th && (numext::isfinite)(val)) {  // strict '>': a lane == huge_th keeps its value
+        if (Func == TrigFunction::Sin) {
+          sincos_vals[k] = std::sin(val);
+        } else if (Func == TrigFunction::Cos) {
+          sincos_vals[k] = std::cos(val);
+        } else if (Func == TrigFunction::Tan) {
+          sincos_vals[k] = std::tan(val);
+        } else if (Func == TrigFunction::SinCos) {
+          sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
+        }
+      }
+    }
+    sFinalRes = ploadu<Packet>(sincos_vals);
+  }
+  return sFinalRes;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
+  return psincos_double<TrigFunction::Sin>(x);  // sine variant of the shared double kernel
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
+  return psincos_double<TrigFunction::Cos>(x);  // cosine variant of the shared double kernel
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x) {
+  return psincos_double<TrigFunction::Tan>(x);  // tangent variant: the kernel divides its sin by its cos
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+    std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, float>::value, Packet>
+    psincos_selector(const Packet& x) {  // overload enabled for packets whose scalar type is float
+  return psincos_float<TrigFunction::SinCos, Packet>(x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+    std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, double>::value, Packet>
+    psincos_selector(const Packet& x) {  // overload enabled for packets whose scalar type is double
+  return psincos_double<TrigFunction::SinCos, Packet>(x);
+}
+
+//----------------------------------------------------------------------
+// Inverse Trigonometric Functions
+//----------------------------------------------------------------------
+
+// Generic implementation of acos(x) for float packets (enforced by the static_assert below).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
+  const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
+  const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
+  const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
+  const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
+  const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
+  const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
+  const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
+
+  // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
+  // function, by a 6'th order polynomial.
+  // For x in [-1:0) we use that acos(-x) = pi - acos(x).
+  const Packet neg_mask = psignbit(x_in);
+  const Packet abs_x = pabs(x_in);
+
+  // Evaluate the polynomial using Horner's rule:
+  //   P(x) = p0 + x * (p1 +  x * (p2 + ... (p5 + x * p6)) ... ) .
+  // We evaluate even and odd terms independently to increase
+  // instruction level parallelism.
+  Packet x2 = pmul(x_in, x_in);  // x_in^2 == |x_in|^2, so the polynomial is effectively evaluated at |x_in|
+  Packet p_even = pmadd(p6, x2, p4);
+  Packet p_odd = pmadd(p5, x2, p3);
+  p_even = pmadd(p_even, x2, p2);
+  p_odd = pmadd(p_odd, x2, p1);
+  p_even = pmadd(p_even, x2, p0);
+  Packet p = pmadd(p_odd, abs_x, p_even);
+
+  // The polynomial approximates acos(x)/sqrt(1-x), so
+  // multiply by sqrt(1-x) to get acos(x).
+  // Conveniently returns NaN for arguments outside [-1:1].
+  Packet denom = psqrt(psub(cst_one, abs_x));
+  Packet result = pmul(denom, p);
+  // Undo mapping for negative arguments.
+  return pselect(neg_mask, psub(cst_pi, result), result);
+}
+
+// Generic implementation of asin(x) for packets with float scalars.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
+
+  const Packet cst_half = pset1<Packet>(0.5f);
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_two = pset1<Packet>(2.0f);
+  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
+
+  const Packet abs_x = pabs(x_in);
+  const Packet sign_mask = pandnot(x_in, abs_x);  // x_in with the bits of |x| masked off; used below to flip the sign.
+  const Packet invalid_mask = pcmp_lt(cst_one, abs_x);  // Lanes with |x| > 1.
+
+  // For arguments |x| > 0.5, we map |x| back to [0:0.5] using
+  // the transformation x_large = sqrt(0.5*(1-|x|)), and use the
+  // identity
+  //   asin(|x|) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - |x|)))
+
+  const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
+  const Packet large_mask = pcmp_lt(cst_half, abs_x);
+  const Packet x = pselect(large_mask, x_large, abs_x);
+  const Packet x2 = pmul(x, x);
+
+  // Approximate asin(x)/x for the reduced argument x (|x| itself when
+  // |x| <= 0.5, x_large otherwise) by an 8th order polynomial with even terms only.
+  constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
+                             7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  p = pmul(p, x);
+
+  const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);  // pi/2 - 2 * asin(x_large)
+  p = pselect(large_mask, p_large, p);
+  // Flip the sign for negative arguments.
+  p = pxor(p, sign_mask);
+  // Return NaN for arguments outside [-1:1].
+  return por(invalid_mask, p);
+}
+
+template <typename Scalar>
+struct patan_reduced {  // Reduced-range atan kernel; run() is only declared here and specialized per Scalar.
+  template <typename Packet>
+  static EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet run(const Packet& x);
+};
+
+template <>
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
+  constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
+                              3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
+                              3.3004361289279920e-01};
+
+  constexpr double beta[] = {
+      2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
+      9.3705509168587852e-01, 3.3004361289279920e-01};
+
+  Packet x2 = pmul(x, x);                         // Both polynomials are evaluated in x^2.
+  Packet p = ppolevl<Packet, 6>::run(x2, alpha);  // Numerator P(x^2), degree 6 in x^2.
+  Packet q = ppolevl<Packet, 6>::run(x2, beta);   // Denominator Q(x^2), degree 6 in x^2.
+  return pmul(x, pdiv(p, q));                     // Double-precision atan(x) approximated as x * P(x^2) / Q(x^2).
+}
+
+// Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy, as x * P(x^2) / Q(x^2).
+template <>
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
+  constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
+
+  constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
+                            8.109951019287109375e-01f};
+
+  Packet x2 = pmul(x, x);                         // Both polynomials are evaluated in x^2.
+  Packet p = ppolevl<Packet, 2>::run(x2, alpha);  // Numerator P(x^2), degree 2 in x^2.
+  Packet q = ppolevl<Packet, 3>::run(x2, beta);   // Denominator Q(x^2), degree 3 in x^2.
+  return pmul(x, pdiv(p, q));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x_in) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
+
+  const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
+
+  //   "Large": For |x| > 1, use atan(|x|) = pi/2 - atan(1/|x|).
+  //   "Small": For |x| <= 1, approximate atan(|x|) directly by the rational
+  //            approximation in patan_reduced (calculated using Rminimax).
+
+  const Packet abs_x = pabs(x_in);
+  const Packet x_signmask = pand(x_in, cst_signmask);
+  const Packet large_mask = pcmp_lt(cst_one, abs_x);
+  const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
+  const Packet p = patan_reduced<Scalar>::run(x);
+  // Apply transformations according to the range reduction masks.
+  Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
+  // Restore the sign of x_in: the computation above was done on |x|.
+  return pxor(result, x_signmask);
+}
+
+//----------------------------------------------------------------------
+// Hyperbolic Functions
+//----------------------------------------------------------------------
+
+#ifdef EIGEN_FAST_MATH
+
+/** \internal \returns the hyperbolic tan of \a a_x (coeff-wise); variant used when EIGEN_FAST_MATH is defined.
+    Doesn't do anything fancy, just a 9/8-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1.
+
+    This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
+  // Clamp the inputs to the range [-c, c], so that everything outside
+  // that range evaluates to +/-1. The value c is chosen as the smallest
+  // floating point argument such that the approximation is exactly 1.
+  // This saves clamping the value at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(8.01773357391357422f);
+  const T minus_clamp = pset1<T>(-8.01773357391357422f);
+#else
+  const T plus_clamp = pset1<T>(7.90738964080810547f);
+  const T minus_clamp = pset1<T>(-7.90738964080810547f);
+#endif
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+
+  // The following rational approximation was generated by rminimax
+  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
+  // command:
+  // $ ratapprox --function="tanh(x)" --dom='[-8.67,8.67]' --num="odd"
+  //   --den="even" --type="[9,8]" --numF="[SG]" --denF="[SG]" --log
+  //   --output=tanhf.sollya --dispCoeff="dec"
+
+  // Numerator (odd) coefficients for x^9, x^7, x^5, x^3; the x^1 coefficient is 1 and is applied below.
+  constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
+
+  // Denominator (even) coefficients for x^8, x^6, x^4, x^2, x^0.
+  constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+  const T x3 = pmul(x2, x);
+
+  T p = ppolevl<T, 3>::run(x2, alpha);
+  T q = ppolevl<T, 4>::run(x2, beta);
+  // Take advantage of the fact that the linear coefficient of the numerator
+  // is 1 to compute x*(x^2*p + 1) = x^3 * p + x.
+  p = pmadd(x3, p, x);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+#else
+
+/** \internal \returns the hyperbolic tan of \a x (coeff-wise); variant used when EIGEN_FAST_MATH is not defined.
+    On the domain [-1.25:1.25] we use an approximation of the form
+    tanh(x) ~= x^3 * (P(x) / Q(x)) + x, where P and Q are polynomials in x^2.
+    For |x| > 1.25, tanh(|x|) = 1 - (2 / (1 + exp(2*|x|))) is used and the sign of x is restored.
+
+    This implementation has a maximum error of 1 ULP (measured with AVX2+FMA).
+
+    This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& x) {
+  // The polynomial coefficients were computed using Rminimax:
+  // % ./ratapprox --function="tanh(x)-x" --dom='[-1.25,1.25]' --num="[x^3,x^5]" --den="even"
+  //     --type="[3,4]" --numF="[SG]" --denF="[SG]" --log --dispCoeff="dec" --output=tanhf.solly
+  constexpr float alpha[] = {-1.46725140511989593505859375e-02f, -3.333333432674407958984375e-01f};
+  constexpr float beta[] = {1.570280082523822784423828125e-02, 4.4401752948760986328125e-01, 1.0f};
+  const T x2 = pmul(x, x);
+  const T x3 = pmul(x2, x);
+  const T p = ppolevl<T, 1>::run(x2, alpha);
+  const T q = ppolevl<T, 2>::run(x2, beta);
+  const T small_tanh = pmadd(x3, pdiv(p, q), x);  // x + x^3 * P(x^2) / Q(x^2)
+
+  const T sign_mask = pset1<T>(-0.0f);
+  const T abs_x = pandnot(x, sign_mask);
+  constexpr float kSmallThreshold = 1.25f;
+  const T large_mask = pcmp_lt(pset1<T>(kSmallThreshold), abs_x);  // Lanes with |x| > 1.25.
+  // Fast exit if all elements are small.
+  if (!predux_any(large_mask)) {
+    return small_tanh;
+  }
+
+  //  Compute tanh(|x|) as 1 - (2 / (1 + exp(2*|x|)))
+  const T one = pset1<T>(1.0f);
+  const T two = pset1<T>(2.0f);
+  const T s = pexp_float<T, true>(pmul(two, abs_x));
+  const T abs_tanh = psub(one, pdiv(two, padd(s, one)));
+
+  // For |x| > 16 use exactly 1 (this also covers infinite inputs), then set the sign bit.
+  constexpr float kHugeThreshold = 16.0f;
+  const T huge_mask = pcmp_lt(pset1<T>(kHugeThreshold), abs_x);
+  const T x_sign = pand(sign_mask, x);
+  const T large_tanh = por(x_sign, pselect(huge_mask, one, abs_tanh));
+  return pselect(large_mask, large_tanh, small_tanh);
+}
+
+#endif  // EIGEN_FAST_MATH
+
+/** \internal \returns the hyperbolic tan of \a a_x (coeff-wise)
+    This uses a 19/18-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
+    outside of which tanh(x) = +/-1 in double precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1.
+
+    This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) {
+  // Clamp the inputs to the range [-c, c], so that everything outside
+  // that range evaluates to +/-1. The value c is chosen as the smallest
+  // floating point argument such that the approximation is exactly 1.
+  // This saves clamping the value at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(17.6610191624600077);
+  const T minus_clamp = pset1<T>(-17.6610191624600077);
+#else
+  const T plus_clamp = pset1<T>(17.714196154005176);
+  const T minus_clamp = pset1<T>(-17.714196154005176);
+#endif
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+  // The following rational approximation was generated by rminimax
+  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
+  // command:
+  // $ ./ratapprox --function="tanh(x)" --dom='[-18.72,18.72]'
+  //   --num="odd" --den="even" --type="[19,18]" --numF="[D]"
+  //   --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
+
+  // Numerator (odd) coefficients for x^19 down to x^3; the x^1 coefficient is 1 and is applied below.
+  constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
+                              4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
+                              9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
+
+  // Denominator (even) coefficients for x^18 down to x^0.
+  constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
+                             1.293019623712687916e-13, 1.123643448069621992e-10,
+                             4.492975677839633985e-08, 8.785185266237658698e-06,
+                             8.295161192716231542e-04, 3.437448108450402717e-02,
+                             4.851805297361760360e-01, 1.0};
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+  const T x3 = pmul(x2, x);
+
+  // Evaluate the numerator polynomial p and the denominator
+  // polynomial q, both as polynomials in x^2.
+  T p = ppolevl<T, 8>::run(x2, alpha);
+  T q = ppolevl<T, 9>::run(x2, beta);
+  // Take advantage of the fact that the linear coefficient of the numerator
+  // is 1 to compute x*(x^2*p + 1) = x^3 * p + x.
+  p = pmadd(x3, p, x);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  // For |x| in [0:0.5[ we use a polynomial approximation of the form
+  // P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
+  constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
+                             0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
+  const Packet x2 = pmul(x, x);
+  const Packet x3 = pmul(x, x2);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  p = pmadd(x3, p, x);
+
+  // For |x| in [0.5:1.0[ we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet half = pset1<Packet>(0.5f);
+  const Packet one = pset1<Packet>(1.0f);
+  Packet r = pdiv(padd(one, x), psub(one, x));
+  r = pmul(half, plog(r));
+
+  const Packet x_gt_half = pcmp_le(half, pabs(x));  // |x| >= 0.5 (inclusive, despite the name): log formula.
+  const Packet x_eq_one = pcmp_eq(one, pabs(x));    // |x| == 1: result is infinity with the sign of x.
+  const Packet x_gt_one = pcmp_lt(one, pabs(x));    // |x| > 1: mask is OR-ed into the result below.
+  const Packet sign_mask = pset1<Packet>(-0.0f);
+  const Packet x_sign = pand(sign_mask, x);
+  const Packet inf = pset1<Packet>(std::numeric_limits<float>::infinity());
+  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, r, p)));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+  // For |x| in [0:0.5[ we use a rational approximation of the form
+  // R(x) = x + x^3*P(x^2)/Q(x^2), where P is of order 4 and Q is of order 5.
+  constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
+                              -2.5949536095445679e-01, 1.2306328729812676e-01};
+
+  constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02,  -4.2828141436397615e-01,
+                             9.8733495886883648e-01,  -1.0000000000000000e+00, 3.6918986189438030e-01};
+
+  const Packet x2 = pmul(x, x);
+  const Packet x3 = pmul(x, x2);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 5>::run(x2, beta);
+  Packet y_small = pmadd(x3, pdiv(p, q), x);
+
+  // For |x| in [0.5:1.0[ we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet half = pset1<Packet>(0.5);
+  const Packet one = pset1<Packet>(1.0);
+  Packet y_large = pdiv(padd(one, x), psub(one, x));
+  y_large = pmul(half, plog(y_large));
+
+  const Packet x_gt_half = pcmp_le(half, pabs(x));  // |x| >= 0.5 (inclusive, despite the name): log formula.
+  const Packet x_eq_one = pcmp_eq(one, pabs(x));    // |x| == 1: result is infinity with the sign of x.
+  const Packet x_gt_one = pcmp_lt(one, pabs(x));    // |x| > 1: mask is OR-ed into the result below.
+  const Packet sign_mask = pset1<Packet>(-0.0);
+  const Packet x_sign = pand(sign_mask, x);
+  const Packet inf = pset1<Packet>(std::numeric_limits<double>::infinity());
+  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small)));
+}
+
+//----------------------------------------------------------------------
+// sinh / cosh
+//----------------------------------------------------------------------
+
+/** \internal \returns the hyperbolic sine of \a x (coeff-wise).
+    For 1 <= |x| <= 20, uses sinh(|x|) = (exp(|x|) - exp(-|x|)) / 2 with a single exp call.
+    For |x| > 20, uses sinh(|x|) ~ exp(|x|) / 2, computed as exp(|x| - 1) * (e/2); the sign of x is restored.
+    For |x| < 1, uses a direct polynomial to avoid catastrophic cancellation.
+*/
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psinh_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  const Packet sign_mask = pset1<Packet>(-0.0f);
+  const Packet abs_x = pandnot(x, sign_mask);
+  const Packet x_sign = pand(x, sign_mask);
+
+  // For |x| < 1, use a polynomial approximation x + x^3 * P(x^2) to avoid
+  // cancellation in exp(x) - exp(-x).
+  constexpr float alpha[] = {2.7557314045e-06f, 1.9841270114e-04f, 8.3333335817e-03f, 1.6666666716e-01f};
+  const Packet x2 = pmul(x, x);
+  Packet p_small = ppolevl<Packet, 3>::run(x2, alpha);
+  p_small = pmadd(pmul(x2, x), p_small, x);
+
+  // Compute the local e = exp(|x|) / 2 as exp(|x| - 1) * (E/2), where E is Euler's number.
+  // Using a single exp avoids a second expensive call, and subtracting 1 (exactly
+  // representable) instead of ln2 avoids rounding error in the argument to exp,
+  // which would be amplified into large relative output error.
+  const Packet half_e = pset1<Packet>(1.3591409142295225f);  // e/2
+  const Packet one = pset1<Packet>(1.0f);
+  const Packet e = pmul(pexp(psub(abs_x, one)), half_e);
+
+  // Medium path (1 <= |x| <= 20):
+  //   sinh(|x|) = (exp(|x|) - exp(-|x|)) / 2
+  //             = (2*e - 1/(2*e)) / 2 = e - 1/(4*e)
+  const Packet quarter = pset1<Packet>(0.25f);
+  Packet p_medium = psub(e, pdiv(quarter, e));
+
+  // Large path (|x| > 20): exp(-|x|) is negligible, sinh(|x|) ~ exp(|x|)/2 = e.
+  const Packet large_threshold = pset1<Packet>(20.0f);
+  const Packet large_mask = pcmp_lt(large_threshold, abs_x);
+  Packet p_large = pselect(large_mask, e, p_medium);
+  p_large = por(x_sign, p_large);  // Medium/large results were computed for |x|; OR the sign bit of x back in.
+
+  const Packet small_mask = pcmp_lt(abs_x, one);
+  return pselect(small_mask, p_small, p_large);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psinh_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+
+  const Packet sign_mask = pset1<Packet>(-0.0);
+  const Packet abs_x = pandnot(x, sign_mask);
+  const Packet x_sign = pand(x, sign_mask);
+
+  // Small path (|x| < 1), Taylor series: sinh(x) = x + x^3/3! + x^5/5! + ... + x^19/19!
+  // Polynomial form: sinh(x) = x + x^3 * P(x^2) where P(t) = sum_{k=0}^{8} t^k/(2k+3)!
+  // ppolevl stores highest-degree coefficient first.
+  constexpr double alpha[] = {
+      8.2206352466243297e-18,  // t^8: 1/19!
+      2.8114572543455206e-15,  // t^7: 1/17!
+      7.6471637318198164e-13,  // t^6: 1/15!
+      1.6059043836821613e-10,  // t^5: 1/13!
+      2.5052108385441718e-08,  // t^4: 1/11!
+      2.7557319223985893e-06,  // t^3: 1/9!
+      1.9841269841269841e-04,  // t^2: 1/7!
+      8.3333333333333332e-03,  // t^1: 1/5!
+      1.6666666666666666e-01,  // t^0: 1/3!
+  };
+  const Packet x2 = pmul(x, x);
+  Packet p_small = ppolevl<Packet, 8>::run(x2, alpha);
+  p_small = pmadd(pmul(x2, x), p_small, x);
+
+  // Compute the local e = exp(|x|) / 2 as exp(|x| - 1) * (E/2), where E is Euler's number.
+  // Subtracting 1 (exactly representable) instead of ln2 avoids rounding error
+  // in the argument to exp, which would be amplified into large relative error.
+  const Packet half_e = pset1<Packet>(1.3591409142295225);  // e/2
+  const Packet one = pset1<Packet>(1.0);
+  const Packet e = pmul(pexp(psub(abs_x, one)), half_e);
+
+  // Medium path (1 <= |x| <= 20):
+  //   sinh(|x|) = (exp(|x|) - exp(-|x|)) / 2 = e - 1/(4*e)
+  const Packet quarter = pset1<Packet>(0.25);
+  Packet p_medium = psub(e, pdiv(quarter, e));
+
+  // Large path (|x| > 20): exp(-|x|) is negligible, sinh(|x|) ~ exp(|x|)/2 = e.
+  const Packet large_threshold = pset1<Packet>(20.0);
+  const Packet large_mask = pcmp_lt(large_threshold, abs_x);
+  Packet p_large = pselect(large_mask, e, p_medium);
+  p_large = por(x_sign, p_large);  // Medium/large results were computed for |x|; OR the sign bit of x back in.
+  const Packet small_mask = pcmp_lt(abs_x, one);
+  return pselect(small_mask, p_small, p_large);
+}
+
+/** \internal \returns the hyperbolic cosine of \a x (coeff-wise).
+    Uses cosh(x) = (exp(|x|) + exp(-|x|)) / 2 = e + 1/(4*e), with e = exp(|x|) / 2 from a single exp call.
+    For |x| > 20 the 1/(4*e) term is dropped and e is returned directly.
+*/
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcosh_float(const Packet& x) {
+  const Packet abs_x = pabs(x);
+
+  // Compute the local e = exp(|x|) / 2 as exp(|x| - 1) * (E/2), where E is Euler's number.
+  // Using a single exp avoids a second expensive call, and subtracting 1 (exactly
+  // representable) instead of ln2 avoids rounding error in the argument to exp,
+  // which would be amplified into large relative output error.
+  const Packet half_e = pset1<Packet>(1.3591409142295225f);  // e/2
+  const Packet one = pset1<Packet>(1.0f);
+  const Packet e = pmul(pexp(psub(abs_x, one)), half_e);
+
+  // Medium path (|x| <= 20): cosh(x) = (exp(|x|) + exp(-|x|)) / 2
+  //            = (2*e + 1/(2*e)) / 2 = e + 1/(4*e)
+  const Packet quarter = pset1<Packet>(0.25f);
+  Packet p_medium = padd(e, pdiv(quarter, e));
+
+  // Large path (|x| > 20): exp(-|x|) is negligible, cosh(x) ~ exp(|x|)/2 = e.
+  const Packet large_threshold = pset1<Packet>(20.0f);
+  const Packet large_mask = pcmp_lt(large_threshold, abs_x);
+  return pselect(large_mask, e, p_medium);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcosh_double(const Packet& x) {
+  const Packet abs_x = pabs(x);
+
+  // Compute the local e = exp(|x|) / 2 as exp(|x| - 1) * (E/2), where E is Euler's number.
+  // Subtracting 1 (exactly representable) instead of ln2 avoids rounding error
+  // in the argument to exp, which would be amplified into large relative error.
+  const Packet half_e = pset1<Packet>(1.3591409142295225);  // e/2
+  const Packet one = pset1<Packet>(1.0);
+  const Packet e = pmul(pexp(psub(abs_x, one)), half_e);
+
+  // Medium path (|x| <= 20): cosh(x) = (exp(|x|) + exp(-|x|)) / 2 = e + 1/(4*e)
+  const Packet quarter = pset1<Packet>(0.25);
+  Packet p_medium = padd(e, pdiv(quarter, e));
+
+  // Large path (|x| > 20): exp(-|x|) is negligible, cosh(x) ~ exp(|x|)/2 = e.
+  const Packet large_threshold = pset1<Packet>(20.0);
+  const Packet large_mask = pcmp_lt(large_threshold, abs_x);
+  return pselect(large_mask, e, p_medium);
+}
+
+//----------------------------------------------------------------------
+// asinh / acosh
+//----------------------------------------------------------------------
+
+/** \internal \returns the inverse hyperbolic sine of \a x (coeff-wise).
+    Uses a single log1p call by selecting the argument before the transcendental:
+    For |x| <= 1e10: log1p(|x| + x^2 / (1 + sqrt(1 + x^2)))
+    For |x| > 1e10:  log1p(|x| - 1) + ln2  (avoids x^2 overflow)
+*/
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasinh_float(const Packet& x) {
+  const Packet sign_mask = pset1<Packet>(-0.0f);
+  const Packet abs_x = pandnot(x, sign_mask);
+  const Packet x_sign = pand(x, sign_mask);
+  const Packet one = pset1<Packet>(1.0f);
+
+  // For |x| > 1e10, use log(2|x|) = log1p(|x| - 1) + ln2 to avoid x^2 overflow.
+  const Packet large_mask = pcmp_lt(pset1<Packet>(1e10f), abs_x);
+  // Guard x^2 against overflow in the large case (|x| is multiplied by 0 there).
+  const Packet x2 = pmul(abs_x, pselect(large_mask, pzero(abs_x), abs_x));
+  // For |x| <= 1e10: log1p(|x| + x^2 / (1 + sqrt(1 + x^2))).
+  // Algebraically equivalent to log(|x| + sqrt(x^2 + 1))
+  // but avoids cancellation for small |x|.
+  Packet normal_arg = padd(abs_x, pdiv(x2, padd(one, psqrt(padd(one, x2)))));
+  // For |x| > 1e10: log1p(|x| - 1), then add ln2 after.
+  Packet large_arg = psub(abs_x, one);
+  // Select argument, then call log1p once.
+  Packet result = generic_log1p(pselect(large_mask, large_arg, normal_arg));
+  // Add ln2 for the large path: log(2|x|) = log(|x|) + ln2 = log1p(|x|-1) + ln2.
+  const Packet ln2 = pset1<Packet>(0.6931471805599453f);
+  result = pselect(large_mask, padd(result, ln2), result);
+  return por(x_sign, result);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasinh_double(const Packet& x) {
+  const Packet sign_mask = pset1<Packet>(-0.0);
+  const Packet abs_x = pandnot(x, sign_mask);
+  const Packet x_sign = pand(x, sign_mask);
+  const Packet one = pset1<Packet>(1.0);
+
+  const Packet large_mask = pcmp_lt(pset1<Packet>(1e150), abs_x);  // |x| > 1e150: use log1p(|x| - 1) + ln2.
+  const Packet x2 = pmul(abs_x, pselect(large_mask, pzero(abs_x), abs_x));  // x^2; |x| * 0 in large lanes.
+  Packet normal_arg = padd(abs_x, pdiv(x2, padd(one, psqrt(padd(one, x2)))));  // |x| + x^2 / (1 + sqrt(1 + x^2))
+  Packet large_arg = psub(abs_x, one);
+  Packet result = generic_log1p(pselect(large_mask, large_arg, normal_arg));  // Single log1p call for both paths.
+  const Packet ln2 = pset1<Packet>(0.6931471805599453);
+  result = pselect(large_mask, padd(result, ln2), result);  // log(2|x|) = log1p(|x| - 1) + ln2 in large lanes.
+  return por(x_sign, result);  // Result was computed for |x|; OR the sign bit of x back in.
+}
+
+/** \internal \returns the inverse hyperbolic cosine of \a x (coeff-wise).
+    Uses a single log1p call by selecting the argument before the transcendental:
+    For x <= 1e10: log1p(t + sqrt(t*(t+2))) where t = x - 1
+    For x > 1e10:  log1p(t) + ln2  (avoids t*(t+2) overflow)
+    Returns NaN for x < 1.
+*/
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacosh_float(const Packet& x) {
+  const Packet one = pset1<Packet>(1.0f);
+  const Packet two = pset1<Packet>(2.0f);
+  const Packet t = psub(x, one);
+  const Packet huge_mask = pcmp_lt(pset1<Packet>(1e10f), x);
+  // Guard t*(t+2) against overflow in the huge case (the first factor is replaced by 0 there).
+  const Packet t_tp2 = pmul(pselect(huge_mask, pzero(t), t), padd(t, two));
+  Packet normal_arg = padd(t, psqrt(t_tp2));
+  // For huge x: acosh(x) ~ log(2x) = log1p(x - 1) + ln2.
+  Packet huge_arg = t;
+  // Select argument, then call log1p once.
+  Packet result = generic_log1p(pselect(huge_mask, huge_arg, normal_arg));
+  const Packet ln2 = pset1<Packet>(0.6931471805599453f);
+  result = pselect(huge_mask, padd(result, ln2), result);
+  // Return NaN for x < 1 by OR-ing the comparison mask into the result.
+  const Packet invalid_mask = pcmp_lt(x, one);
+  return por(invalid_mask, result);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacosh_double(const Packet& x) {
+  const Packet one = pset1<Packet>(1.0);
+  const Packet two = pset1<Packet>(2.0);
+  const Packet t = psub(x, one);  // t = x - 1
+  const Packet huge_mask = pcmp_lt(pset1<Packet>(1e150), x);  // x > 1e150: use log1p(t) + ln2.
+  const Packet t_tp2 = pmul(pselect(huge_mask, pzero(t), t), padd(t, two));  // t*(t+2); first factor is 0 if huge.
+  Packet normal_arg = padd(t, psqrt(t_tp2));  // t + sqrt(t*(t+2))
+  Packet huge_arg = t;
+  Packet result = generic_log1p(pselect(huge_mask, huge_arg, normal_arg));  // Single log1p call for both paths.
+  const Packet ln2 = pset1<Packet>(0.6931471805599453);
+  result = pselect(huge_mask, padd(result, ln2), result);  // log(2x) = log1p(x - 1) + ln2 in huge lanes.
+  const Packet invalid_mask = pcmp_lt(x, one);  // x < 1 is outside the domain of acosh.
+  return por(invalid_mask, result);  // OR-ing the mask in yields NaN for the invalid lanes.
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_TRIG_H
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index c073fe8ec79..d1acabec109 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -45,7 +45,7 @@
 // Eigen with GPU support.
 // Any functions that require `numext::bit_cast` may also not be constexpr,
 // including any native types when setting via raw bit values.
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
 #define _EIGEN_MAYBE_CONSTEXPR
 #else
 #define _EIGEN_MAYBE_CONSTEXPR constexpr
@@ -57,6 +57,45 @@
     return float2half(METHOD<PACKET_F>(half2float(_x)));                                                   \
   }
 
+#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_F16(PACKET_F, PACKET_F16) \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pcos)                      \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, psin)                      \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, psinh)                     \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pcosh)                     \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pasinh)                    \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pacosh)                    \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pexp)                      \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pexp2)                     \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pexpm1)                    \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, plog)                      \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, plog1p)                    \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, plog2)                     \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, plog10)                    \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, preciprocal)               \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, prsqrt)                    \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pcbrt)                     \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, psqrt)                     \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, ptanh)
+
+// F16 wrappers for unsupported/SpecialFunctions.
+#define EIGEN_INSTANTIATE_SPECIAL_FUNCS_F16(PACKET_F, PACKET_F16) \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, perf)                 \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pndtri)
+
+#define EIGEN_INSTANTIATE_BESSEL_FUNCS_F16(PACKET_F, PACKET_F16) \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_i0)          \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_i0e)         \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_i1)          \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_i1e)         \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_j0)          \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_j1)          \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_k0)          \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_k0e)         \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_k1)          \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_k1e)         \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_y0)          \
+  F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, pbessel_y1)
+
 namespace Eigen {
 
 struct half;
@@ -82,12 +121,12 @@ namespace half_impl {
 //
 // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
 // this error, and hence the following convoluted #if condition
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
 
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
   struct construct_from_rep_tag {};
-#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
   // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
   // The element type for shared memory cannot have non-trivial constructors
   // and hence the following special casing (which skips the zero-initilization).
@@ -113,16 +152,12 @@ struct __half_raw {
 #endif
 };
 
-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
 // HIP GPU compile phase: nothing to do here.
 // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
+#elif defined(EIGEN_CUDACC)
 
 // CUDA GPU compile phase.
-#if EIGEN_CUDA_SDK_VER < 90000
-// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
-#endif  // defined(EIGEN_HAS_CUDA_FP16)
 
 #elif defined(SYCL_DEVICE_ONLY)
 typedef cl::sycl::half __half_raw;
@@ -136,15 +171,13 @@ struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
 
-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 #endif
-#endif
 };
 
 }  // namespace half_impl
@@ -153,36 +186,29 @@ struct half_base : public __half_raw {
 struct half : public half_impl::half_base {
   // Writing this out as separate #if-else blocks to make the code easier to follow
   // The same applies to most #if-else blocks in this file
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
   // Use the same base class for the following two scenarios
   // * when compiling without GPU support enabled
   // * during host compile phase when compiling with GPU support enabled
   typedef half_impl::__half_raw __half_raw;
-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
   // Nothing to do here
   // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
-// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
-// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
-// #if defined(EIGEN_HAS_CUDA_FP16) is needed
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-  typedef half_impl::__half_raw __half_raw;
-#endif
+#elif defined(EIGEN_CUDACC)
+  // Nothing to do here.
 #endif
 
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
 
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
 
-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #endif
 #endif
-#endif
 
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -209,7 +235,7 @@ struct half : public half_impl::half_base {
     return half_impl::half_to_float(*this);
   }
 
-#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
   EIGEN_DEVICE_FUNC operator __half() const {
     ::__half_raw hr;
     hr.x = x;
@@ -267,6 +293,8 @@ struct numeric_limits_half_impl {
   static _EIGEN_MAYBE_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
 };
 
+// Redundant out-of-class definitions are required pre-C++17 but deprecated since.
+#if EIGEN_COMP_CXXVER < 17
 template <typename T>
 constexpr const bool numeric_limits_half_impl<T>::is_specialized;
 template <typename T>
@@ -316,6 +344,7 @@ template <typename T>
 constexpr const bool numeric_limits_half_impl<T>::traps;
 template <typename T>
 constexpr const bool numeric_limits_half_impl<T>::tinyness_before;
+#endif
 }  // end namespace half_impl
 }  // end namespace Eigen
 
@@ -338,8 +367,7 @@ namespace Eigen {
 
 namespace half_impl {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
 // fp16 type since GPU half types are rather different from native CPU half types.
 #define EIGEN_HAS_NATIVE_GPU_FP16
@@ -351,24 +379,10 @@ namespace half_impl {
 // conversion steps back and forth.
 
 #if defined(EIGEN_HAS_NATIVE_GPU_FP16)
-EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hadd(::__half(a), ::__half(b));
-#else
-  return __hadd(a, b);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
 EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
-EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hdiv(a, b);
-#else
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
 EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -463,7 +477,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half&
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -508,7 +522,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
 // fp16 uses 1 sign bit, 5 exponent bits, and 10 mantissa bits. The bit pattern conveys NaN when all the exponent
 // bits (5) are set, and at least one mantissa bit is set. The sign bit is irrelevant for determining NaN. To check for
 // NaN, clear the sign bit and check if the integral representation is greater than 01111100000000. To test
-// for non-NaN, clear the sign bit and check if the integeral representation is less than or equal to 01111100000000.
+// for non-NaN, clear the sign bit and check if the integral representation is less than or equal to 01111100000000.
 
 // convert sign-magnitude representation to two's complement
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int16_t mapToSigned(uint16_t a) {
@@ -594,7 +608,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint
   // because this is constexpr function.
   // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
   // of this catch22 by having separate bodies for GPU / non GPU
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
   __half_raw h;
   h.x = x;
   return h;
@@ -619,8 +633,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
 
@@ -693,8 +706,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __half2float(h);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return static_cast<float>(h.x);
@@ -721,7 +733,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
     o_bits = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::bit_cast<float>(o_bits) - magic);
   }
 
-  o_bits |= (h.x & 0x8000) << 16;  // sign bit
+  o_bits |= (h.x & 0x8000u) << 16;  // sign bit
   return Eigen::numext::bit_cast<float>(o_bits);
 #endif
 }
@@ -736,8 +748,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __hisnan(a);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -768,16 +779,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp(a));
 #else
   return half(::expf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp2(a));
 #else
   return half(::exp2f(float(a)));
@@ -785,9 +794,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
-     EIGEN_CUDA_ARCH >= 530) ||                                                                 \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return half(hlog(a));
 #else
   return half(::logf(float(a)));
@@ -800,8 +807,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hsqrt(a));
 #else
   return half(::sqrtf(float(a)));
@@ -822,16 +828,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hfloor(a));
 #else
   return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hceil(a));
 #else
   return half(::ceilf(float(a)));
@@ -955,23 +959,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(c
   return Eigen::half_impl::raw_half_as_uint16(src);
 }
 
+// Specialize multiply-add to match packet operations and reduce conversions to/from float.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen::half& x, const Eigen::half& y,
+                                                                    const Eigen::half& z) {
+  return Eigen::half(static_cast<float>(x) * static_cast<float>(y) + static_cast<float>(z));
+}
+
 }  // namespace numext
 }  // namespace Eigen
 
-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::half.
+// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
 // Note that the following are __device__ - only functions.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+#if defined(EIGEN_CUDACC)
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
                                                        int width = warpSize) {
@@ -997,7 +1000,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
   return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
 }
 
-#else  // HIP or CUDA SDK < 9.0
+#else  // HIP
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1023,7 +1026,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 #endif  // __shfl*
 
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
@@ -1046,8 +1049,7 @@ namespace internal {
 template <>
 struct cast_impl<float, half> {
   EIGEN_DEVICE_FUNC static inline half run(const float& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __float2half(a);
 #else
     return half(a);
@@ -1058,8 +1060,7 @@ struct cast_impl<float, half> {
 template <>
 struct cast_impl<int, half> {
   EIGEN_DEVICE_FUNC static inline half run(const int& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __float2half(static_cast<float>(a));
 #else
     return half(static_cast<float>(a));
@@ -1070,8 +1071,7 @@ struct cast_impl<int, half> {
 template <>
 struct cast_impl<half, float> {
   EIGEN_DEVICE_FUNC static inline float run(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __half2float(a);
 #else
     return static_cast<float>(a);
diff --git a/Eigen/src/Core/arch/GPU/Complex.h b/Eigen/src/Core/arch/GPU/Complex.h
index fa46aec7e07..be4c2181951 100644
--- a/Eigen/src/Core/arch/GPU/Complex.h
+++ b/Eigen/src/Core/arch/GPU/Complex.h
@@ -62,54 +62,6 @@ namespace Eigen {
 // Specialized std::complex overloads.
 namespace complex_operator_detail {
 
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_multiply(const std::complex<T>& a,
-                                                                       const std::complex<T>& b) {
-  const T a_real = numext::real(a);
-  const T a_imag = numext::imag(a);
-  const T b_real = numext::real(b);
-  const T b_imag = numext::imag(b);
-  return std::complex<T>(a_real * b_real - a_imag * b_imag, a_imag * b_real + a_real * b_imag);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_fast(const std::complex<T>& a,
-                                                                          const std::complex<T>& b) {
-  const T a_real = numext::real(a);
-  const T a_imag = numext::imag(a);
-  const T b_real = numext::real(b);
-  const T b_imag = numext::imag(b);
-  const T norm = (b_real * b_real + b_imag * b_imag);
-  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm, (a_imag * b_real - a_real * b_imag) / norm);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_stable(const std::complex<T>& a,
-                                                                            const std::complex<T>& b) {
-  const T a_real = numext::real(a);
-  const T a_imag = numext::imag(a);
-  const T b_real = numext::real(b);
-  const T b_imag = numext::imag(b);
-  // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
-  // guards against over/under-flow.
-  const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real);
-  const T rscale = scale_imag ? T(1) : b_real / b_imag;
-  const T iscale = scale_imag ? b_imag / b_real : T(1);
-  const T denominator = b_real * rscale + b_imag * iscale;
-  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator,
-                         (a_imag * rscale - a_real * iscale) / denominator);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::complex<T>& a,
-                                                                     const std::complex<T>& b) {
-#if EIGEN_FAST_MATH
-  return complex_divide_fast(a, b);
-#else
-  return complex_divide_stable(a, b);
-#endif
-}
-
 // NOTE: We cannot specialize compound assignment operators with Scalar T,
 //         (i.e.  operator@=(const T&), for @=+,-,*,/)
 //       since they are already specialized for float/double/long double within
@@ -151,7 +103,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a,                         \
                                                                   const std::complex<T>& b) {                       \
-    return complex_multiply(a, b);                                                                                  \
+    return internal::complex_multiply(a, b);                                                                        \
   }                                                                                                                 \
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a, const T& b) {           \
@@ -164,7 +116,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a,                         \
                                                                   const std::complex<T>& b) {                       \
-    return complex_divide(a, b);                                                                                    \
+    return internal::complex_divide(a, b);                                                                          \
   }                                                                                                                 \
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a, const T& b) {           \
@@ -172,7 +124,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::
   }                                                                                                                 \
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const T& a, const std::complex<T>& b) {           \
-    return complex_divide(std::complex<T>(a, 0), b);                                                                \
+    return internal::complex_divide(std::complex<T>(a, 0), b);                                                      \
   }                                                                                                                 \
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) { \
@@ -188,12 +140,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::
   }                                                                                                                 \
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) { \
-    a = complex_multiply(a, b);                                                                                     \
+    a = internal::complex_multiply(a, b);                                                                           \
     return a;                                                                                                       \
   }                                                                                                                 \
                                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) { \
-    a = complex_divide(a, b);                                                                                       \
+    a = internal::complex_divide(a, b);                                                                             \
     return a;                                                                                                       \
   }                                                                                                                 \
                                                                                                                     \
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 328b1b93f19..9649843c58f 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -17,19 +17,8 @@ namespace Eigen {
 
 namespace internal {
 
-// Read-only data cached load available.
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
-#define EIGEN_GPU_HAS_LDG 1
-#endif
-
-// FP16 math available.
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
-#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
-#endif
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
-#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
-#endif
+// Read-only data cached load (__ldg) and native FP16 arithmetic are available
+// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).
 
 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
@@ -56,94 +45,84 @@ struct is_arithmetic<double2> {
 
 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef float4 type;
-  typedef float4 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-
-    HasDiv = 1,
-    HasSin = 0,
-    HasCos = 0,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-
-    HasBlend = 0,
-    HasFloor = 1,
-    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
-  };
+  using type = float4;
+  using half = float4;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 4;
+
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSin = 0;
+  static constexpr int HasCos = 0;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
+
+  static constexpr int HasFloor = 1;
+  static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
 };
 
 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef double2 type;
-  typedef double2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
-
-    HasDiv = 1,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-    HasBlend = 0,
-  };
+  using type = double2;
+  using half = double2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 2;
+
+  static constexpr int HasDiv = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
 };
 
 template <>
 struct unpacket_traits<float4> {
-  typedef float type;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef float4 half;
+  using type = float;
+  static constexpr int size = 4;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = float4;
 };
 template <>
 struct unpacket_traits<double2> {
-  typedef double type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef double2 half;
+  using type = double;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = double2;
 };
 
 template <>
@@ -405,7 +384,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const dou
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
@@ -413,7 +392,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
@@ -422,7 +401,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
@@ -430,7 +409,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return make_double2(__ldg(from + 0), __ldg(from + 1));
 #else
   return make_double2(from[0], from[1]);
@@ -550,6 +529,15 @@ EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
   return make_double2(trunc(a.x), trunc(a.y));
 }
 
+template <>
+EIGEN_DEVICE_FUNC inline float4 pround<float4>(const float4& a) {
+  return make_float4(roundf(a.x), roundf(a.y), roundf(a.z), roundf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pround<double2>(const double2& a) {
+  return make_double2(round(a.x), round(a.y));
+}
+
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
   float tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
@@ -584,23 +572,20 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
 
 #endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 
-// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
-// on device. There is no benefit to using them on the host anyways, since they are
-// emulated.
-#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+// Half-packet functions are only available in GPU device compilation — they use
+// intrinsics (__half2, etc.) that have no host-side benefit.
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 
-typedef ulonglong2 Packet4h2;
+using Packet4h2 = ulonglong2;
 template <>
 struct unpacket_traits<Packet4h2> {
-  typedef Eigen::half type;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef Packet4h2 half;
+  using type = Eigen::half;
+  static constexpr int size = 8;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = Packet4h2;
 };
 template <>
 struct is_arithmetic<Packet4h2> {
@@ -609,15 +594,13 @@ struct is_arithmetic<Packet4h2> {
 
 template <>
 struct unpacket_traits<half2> {
-  typedef Eigen::half type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef half2 half;
+  using type = Eigen::half;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = half2;
 };
 template <>
 struct is_arithmetic<half2> {
@@ -626,23 +609,21 @@ struct is_arithmetic<half2> {
 
 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet4h2 type;
-  typedef Packet4h2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasExp = 1,
-    HasExpm1 = 1,
-    HasLog = 1,
-    HasLog1p = 1
-  };
+  using type = Packet4h2;
+  using half = Packet4h2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 8;
+  static constexpr int HasAdd = 1;
+  static constexpr int HasSub = 1;
+  static constexpr int HasMul = 1;
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasExpm1 = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasLog1p = 1;
 };
 
 template <>
@@ -683,7 +664,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2&
 }
 
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   // Input is guaranteed to be properly aligned.
   return __ldg(reinterpret_cast<const half2*>(from));
 #else
@@ -692,7 +673,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half*
 }
 
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __halves2half2(__ldg(from + 0), __ldg(from + 1));
 #else
   return __halves2half2(*(from + 0), *(from + 1));
@@ -738,12 +719,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& ker
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-#else
-  float f = __half2float(a) + 1.0f;
-  return __halves2half2(a, __float2half(f));
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -830,89 +806,21 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2&
   return __halves2half2(result1, result2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hsub2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 - b1;
-  float r2 = a2 - b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hneg2(a);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return __floats2half2_rn(-a1, -a2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hfma2(a, b, c);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float c1 = __low2float(c);
-  float c2 = __high2float(c);
-  float r1 = a1 * b1 + c1;
-  float r2 = a2 * b2 + c2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
   float a1 = __low2float(a);
@@ -935,47 +843,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b)
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hadd(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 + a2));
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   __half first = __low2half(a);
   __half second = __high2half(a);
   return __hgt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 > a2 ? __low2half(a) : __high2half(a);
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   __half first = __low2half(a);
   __half second = __high2half(a);
   return __hlt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 < a2 ? __low2half(a) : __high2half(a);
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hmul(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 * a2));
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -994,8 +878,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
   return __floats2half2_rn(r1, r2);
 }
 
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
-
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -1003,41 +885,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
-
-#else
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = logf(a1);
-  float r2 = logf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = expf(a1);
-  float r2 = expf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = sqrtf(a1);
-  float r2 = sqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = rsqrtf(a1);
-  float r2 = rsqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-#endif
 }  // namespace
 
 template <>
@@ -1084,19 +931,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to,
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
   Packet4h2 r;
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   r = __ldg(reinterpret_cast<const Packet4h2*>(from));
-  return r;
 #else
-  Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   r_alias[0] = ploadt_ro_aligned(from + 0);
   r_alias[1] = ploadt_ro_aligned(from + 2);
   r_alias[2] = ploadt_ro_aligned(from + 4);
   r_alias[3] = ploadt_ro_aligned(from + 6);
-  return r;
 #endif
+  return r;
 }
 
 template <>
@@ -1265,7 +1110,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
   p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
   p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
   return r;
-#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#elif defined(EIGEN_CUDA_ARCH)
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
 
@@ -1283,16 +1128,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
   r_alias[3] = plset(__high2half(c));
 
   return r;
-
-#else
-  float f = __half2float(a);
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
-  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
-  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
-  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
-  return r;
 #endif
 }
 
@@ -1526,7 +1361,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Pa
   half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
   __half first = predux_max(m0);
   __half second = predux_max(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
   return (__hgt(first, second) ? first : second);
 #else
   float ffirst = __half2float(first);
@@ -1542,7 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Pa
   half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
   __half first = predux_min(m0);
   __half second = predux_min(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
   return (__hlt(first, second) ? first : second);
 #else
   float ffirst = __half2float(first);
@@ -1634,47 +1469,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h
 // the implementation of GPU half reduction.
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
 template <>
@@ -1699,11 +1504,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const ha
   return __halves2half2(r1, r2);
 }
 
-#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
-
-#undef EIGEN_GPU_HAS_LDG
-#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
-#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+#endif  // defined(EIGEN_GPU_COMPILE_PHASE)
 
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/GPU/Tuple.h b/Eigen/src/Core/arch/GPU/Tuple.h
index 402d92f76a2..e9b608c1f30 100644
--- a/Eigen/src/Core/arch/GPU/Tuple.h
+++ b/Eigen/src/Core/arch/GPU/Tuple.h
@@ -34,7 +34,7 @@ class TupleImpl<N, T1, Ts...> {
   template <typename U1 = T1,
             typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value &&
                                                  reduce_all<std::is_default_constructible<Ts>::value...>::value>>
-  constexpr EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
+  constexpr EIGEN_DEVICE_FUNC TupleImpl() : m_head{}, m_tail{} {}
 
   // Element constructor.
   template <typename U1, typename... Us,
@@ -45,45 +45,45 @@ class TupleImpl<N, T1, Ts...> {
                                                       // this does not look like a copy/move constructor.
                                                       N > 1 || std::is_convertible<U1, T1>::value)>>
   constexpr EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
-      : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
+      : m_head(std::forward<U1>(arg1)), m_tail(std::forward<Us>(args)...) {}
 
   // The first stored value.
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& head() { return head_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& head() { return m_head; }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& head() const { return head_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& head() const { return m_head; }
 
   // The tail values.
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TupleImpl<N - 1, Ts...>& tail() { return tail_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TupleImpl<N - 1, Ts...>& tail() { return m_tail; }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const TupleImpl<N - 1, Ts...>& tail() const { return tail_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const TupleImpl<N - 1, Ts...>& tail() const { return m_tail; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(TupleImpl& other) {
     using numext::swap;
-    swap(head_, other.head_);
-    swap(tail_, other.tail_);
+    swap(m_head, other.m_head);
+    swap(m_tail, other.m_tail);
   }
 
   template <typename... UTypes>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(const TupleImpl<N, UTypes...>& other) {
-    head_ = other.head_;
-    tail_ = other.tail_;
+    m_head = other.m_head;
+    m_tail = other.m_tail;
     return *this;
   }
 
   template <typename... UTypes>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(TupleImpl<N, UTypes...>&& other) {
-    head_ = std::move(other.head_);
-    tail_ = std::move(other.tail_);
+    m_head = std::move(other.m_head);
+    m_tail = std::move(other.m_tail);
     return *this;
   }
 
  private:
-  // Allow related tuples to reference head_/tail_.
+  // Allow related tuples to reference m_head/m_tail.
   template <size_t M, typename... UTypes>
   friend class TupleImpl;
 
-  T1 head_;
-  TupleImpl<N - 1, Ts...> tail_;
+  T1 m_head;
+  TupleImpl<N - 1, Ts...> m_tail;
 };
 
 // Empty tuple specialization.
@@ -187,7 +187,7 @@ struct unwrap_reference_wrapper<std::reference_wrapper<T>> {
 // For use in make_tuple, decays a type and unwraps a reference_wrapper.
 template <typename T>
 struct unwrap_decay {
-  using type = typename unwrap_reference_wrapper<typename std::decay<T>::type>::type;
+  using type = typename unwrap_reference_wrapper<std::decay_t<T>>::type;
 };
 
 /**
@@ -223,12 +223,12 @@ constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Typ
  * \param tuples ... list of tuples.
  * \return concatenated tuple.
  */
-template <typename... Tuples, typename EnableIf = std::enable_if_t<
-                                  internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
+template <typename... Tuples,
+          typename EnableIf = std::enable_if_t<internal::reduce_all<is_tuple<std::decay_t<Tuples>>::value...>::value>>
 constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
+    typename tuple_cat_impl<sizeof...(Tuples), std::decay_t<Tuples>...>::ReturnType
     tuple_cat(Tuples&&... tuples) {
-  return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...);
+  return tuple_cat_impl<sizeof...(Tuples), std::decay_t<Tuples>...>::run(std::forward<Tuples>(tuples)...);
 }
 
 /**
diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index ae43f8eaf26..27cbbbe0eaf 100644
--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -17,8 +17,7 @@ namespace Eigen {
 
 namespace internal {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 
 template <>
 struct type_casting_traits<Eigen::half, float> {
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h
index ccba96efd77..12494268355 100644
--- a/Eigen/src/Core/arch/HVX/PacketMath.h
+++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -152,16 +152,12 @@ struct packet_traits<float> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 0,
     HasAbsDiff = 0,
     HasMin = 1,
     HasMax = 1,
     HasConj = 0,
     HasSetLinear = 0,
-    HasBlend = 0,
-
     HasDiv = 0,
-
     HasSin = 0,
     HasCos = 0,
     HasACos = 0,
@@ -241,18 +237,18 @@ EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
 }
 
 template <HVXPacketSize T>
-EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
+EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_hvx(const HVXPacket<T>& a) {
   const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
   return unpacket_traits<HVXPacket<T>>::half::Create(
       Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
 }
 template <>
-EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
-  return predux_half_dowto4_hvx(a);
+EIGEN_STRONG_INLINE Packet16f predux_half(const Packet32f& a) {
+  return predux_half_hvx(a);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
-  return predux_half_dowto4_hvx(a);
+EIGEN_STRONG_INLINE Packet8f predux_half(const Packet16f& a) {
+  return predux_half_hvx(a);
 }
 
 template <HVXPacketSize T>
@@ -399,9 +395,26 @@ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
   return pnegate_hvx(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {
+  return ptrue_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {
+  return ptrue_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {
+  return ptrue_hvx(a);
+}
+
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
 }
@@ -420,7 +433,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
 
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
 }
@@ -439,7 +452,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
 
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
 }
@@ -458,7 +471,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
 
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
 }
@@ -751,11 +764,19 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
   const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+#if __HVX_ARCH__ >= 79
+  HVX_Vector vsum = Q6_Vsf_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
+  for (int i = 2; i < packet_size; i <<= 1) {
+    vsum = Q6_Vsf_vadd_VsfVsf(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
+  }
+  return pfirst(HVXPacket<T>::Create(vsum));
+#else
   HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
   for (int i = 2; i < packet_size; i <<= 1) {
     vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
   }
   return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
+#endif
 }
 template <>
 EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
diff --git a/Eigen/src/Core/arch/LSX/Complex.h b/Eigen/src/Core/arch/LSX/Complex.h
index 0b60a831208..522d3d0bf27 100644
--- a/Eigen/src/Core/arch/LSX/Complex.h
+++ b/Eigen/src/Core/arch/LSX/Complex.h
@@ -226,11 +226,6 @@ EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2c
   return pdiv_complex(a, b);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
-  return plog_complex(a);
-}
-
 template <>
 EIGEN_STRONG_INLINE Packet2cf pzero(const Packet2cf& /* a */) {
   __m128 v = {0.0f, 0.0f, 0.0f, 0.0f};
@@ -251,11 +246,6 @@ EIGEN_STRONG_INLINE Packet2cf pmadd<Packet2cf>(const Packet2cf& a, const Packet2
   return result;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
-  return pexp_complex(a);
-}
-
 //---------- double ----------
 struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
@@ -364,7 +354,6 @@ EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packe
   return res;
 }
 
-// FIXME force unaligned load, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
@@ -384,7 +373,6 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* fr
   return pset1<Packet1cd>(*from);
 }
 
-// FIXME force unaligned store, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
   EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
@@ -458,20 +446,8 @@ EIGEN_DEVICE_FUNC inline Packet2cf pselect(const Packet2cf& mask, const Packet2c
   return res;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
-  return psqrt_complex<Packet1cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
-  return psqrt_complex<Packet2cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
-  return plog_complex(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet2cf)
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS_NO_EXP(Packet1cd)
 
 template <>
 EIGEN_STRONG_INLINE Packet1cd pzero<Packet1cd>(const Packet1cd& /* a */) {
diff --git a/Eigen/src/Core/arch/LSX/PacketMath.h b/Eigen/src/Core/arch/LSX/PacketMath.h
index 87232aa29d1..366399f6903 100644
--- a/Eigen/src/Core/arch/LSX/PacketMath.h
+++ b/Eigen/src/Core/arch/LSX/PacketMath.h
@@ -169,10 +169,8 @@ struct packet_traits<int8_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 16,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasCmp = 1,
-    HasBlend = 0
   };
 };
 
@@ -185,11 +183,9 @@ struct packet_traits<int16_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 8,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasCmp = 1,
     HasDiv = 1,
-    HasBlend = 0
   };
 };
 
@@ -202,11 +198,9 @@ struct packet_traits<int32_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasCmp = 1,
     HasDiv = 1,
-    HasBlend = 0
   };
 };
 
@@ -219,11 +213,9 @@ struct packet_traits<int64_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasCmp = 1,
     HasDiv = 1,
-    HasBlend = 0
   };
 };
 
@@ -236,11 +228,9 @@ struct packet_traits<uint8_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 16,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasNegate = 0,
     HasCmp = 1,
-    HasBlend = 0
   };
 };
 
@@ -253,12 +243,10 @@ struct packet_traits<uint16_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 8,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasNegate = 0,
     HasCmp = 1,
     HasDiv = 1,
-    HasBlend = 0
   };
 };
 
@@ -271,12 +259,10 @@ struct packet_traits<uint32_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasNegate = 0,
     HasCmp = 1,
     HasDiv = 1,
-    HasBlend = 0
   };
 };
 
@@ -289,12 +275,10 @@ struct packet_traits<uint64_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
     HasNegate = 0,
     HasCmp = 1,
     HasDiv = 1,
-    HasBlend = 0
   };
 };
 
@@ -307,9 +291,7 @@ struct packet_traits<float> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
-    HasBlend = 0,
     HasSign = 0,
     HasDiv = 1,
     HasExp = 1,
@@ -328,9 +310,7 @@ struct packet_traits<double> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAbs2 = 0,
     HasSetLinear = 0,
-    HasBlend = 0,
     HasSign = 0,
     HasDiv = 1,
     HasSqrt = 1,
@@ -1399,6 +1379,47 @@ EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
   return a;
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16c pabsdiff(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vabsd_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabsdiff(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vabsd_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabsdiff(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vabsd_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pabsdiff(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vabsd_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabsdiff(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vabsd_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabsdiff(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vabsd_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabsdiff(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vabsd_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pabsdiff(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vabsd_du(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabsdiff(const Packet4f& a, const Packet4f& b) {
+  return pabs(psub(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabsdiff(const Packet2d& a, const Packet2d& b) {
+  return pabs(psub(a, b));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
@@ -2687,11 +2708,6 @@ EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {
   return v;
 }
 template <>
-EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  Packet4f v = psub(a, b);
-  return pabs(v);
-}
-template <>
 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
   return pmin<Packet4f>(a, b);
 }
@@ -2753,48 +2769,23 @@ template <>
 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
   return pldexp_generic(a, exponent);
 }
-
-template <>
-EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
-  Packet16c v = psub(a, b);
-  return pabs(v);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
-  Packet8s v = psub(a, b);
-  return pabs(v);
-}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
   return __lsx_vbitsel_v(b, a, mask);
 }
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  Packet4i v = psub(a, b);
-  return pabs(v);
-}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
   return __lsx_vbitsel_v(b, a, mask);
 }
-
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
   return __lsx_vbitsel_v(b, a, mask);
 }
-
 template <>
 EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
   return __lsx_vdiv_bu(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
-  Packet16uc v = psub(a, b);
-  return pabs(v);
-}
-template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
                                                          const Packet16uc& b) {
   return __lsx_vbitsel_v(b, a, mask);
@@ -2811,12 +2802,6 @@ EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
   }
   return res;
 }
-
-template <>
-EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
-  Packet8us v = psub(a, b);
-  return pabs(v);
-}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
   return __lsx_vbitsel_v(b, a, mask);
@@ -2834,11 +2819,6 @@ EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
   return res;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
-  Packet4ui v = psub(a, b);
-  return pabs(v);
-}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
   return __lsx_vbitsel_v(b, a, mask);
diff --git a/Eigen/src/Core/arch/LSX/TypeCasting.h b/Eigen/src/Core/arch/LSX/TypeCasting.h
index cda86806765..0b2906b8b07 100644
--- a/Eigen/src/Core/arch/LSX/TypeCasting.h
+++ b/Eigen/src/Core/arch/LSX/TypeCasting.h
@@ -18,6 +18,192 @@ namespace Eigen {
 
 namespace internal {
 
+//==============================================================================
+// type_casting_traits
+//==============================================================================
+
+// float <-> double
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+
+// float <-> integer types
+template <>
+struct type_casting_traits<float, int8_t> : vectorized_type_casting_traits<float, int8_t> {};
+template <>
+struct type_casting_traits<int8_t, float> : vectorized_type_casting_traits<int8_t, float> {};
+template <>
+struct type_casting_traits<float, uint8_t> : vectorized_type_casting_traits<float, uint8_t> {};
+template <>
+struct type_casting_traits<uint8_t, float> : vectorized_type_casting_traits<uint8_t, float> {};
+template <>
+struct type_casting_traits<float, int16_t> : vectorized_type_casting_traits<float, int16_t> {};
+template <>
+struct type_casting_traits<int16_t, float> : vectorized_type_casting_traits<int16_t, float> {};
+template <>
+struct type_casting_traits<float, uint16_t> : vectorized_type_casting_traits<float, uint16_t> {};
+template <>
+struct type_casting_traits<uint16_t, float> : vectorized_type_casting_traits<uint16_t, float> {};
+template <>
+struct type_casting_traits<float, int32_t> : vectorized_type_casting_traits<float, int32_t> {};
+template <>
+struct type_casting_traits<int32_t, float> : vectorized_type_casting_traits<int32_t, float> {};
+template <>
+struct type_casting_traits<float, uint32_t> : vectorized_type_casting_traits<float, uint32_t> {};
+template <>
+struct type_casting_traits<uint32_t, float> : vectorized_type_casting_traits<uint32_t, float> {};
+template <>
+struct type_casting_traits<float, int64_t> : vectorized_type_casting_traits<float, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, float> : vectorized_type_casting_traits<int64_t, float> {};
+template <>
+struct type_casting_traits<float, uint64_t> : vectorized_type_casting_traits<float, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, float> : vectorized_type_casting_traits<uint64_t, float> {};
+
+// double <-> integer types
+template <>
+struct type_casting_traits<double, int8_t> : vectorized_type_casting_traits<double, int8_t> {};
+template <>
+struct type_casting_traits<int8_t, double> : vectorized_type_casting_traits<int8_t, double> {};
+template <>
+struct type_casting_traits<double, uint8_t> : vectorized_type_casting_traits<double, uint8_t> {};
+template <>
+struct type_casting_traits<uint8_t, double> : vectorized_type_casting_traits<uint8_t, double> {};
+template <>
+struct type_casting_traits<double, int16_t> : vectorized_type_casting_traits<double, int16_t> {};
+template <>
+struct type_casting_traits<int16_t, double> : vectorized_type_casting_traits<int16_t, double> {};
+template <>
+struct type_casting_traits<double, uint16_t> : vectorized_type_casting_traits<double, uint16_t> {};
+template <>
+struct type_casting_traits<uint16_t, double> : vectorized_type_casting_traits<uint16_t, double> {};
+template <>
+struct type_casting_traits<double, int32_t> : vectorized_type_casting_traits<double, int32_t> {};
+template <>
+struct type_casting_traits<int32_t, double> : vectorized_type_casting_traits<int32_t, double> {};
+template <>
+struct type_casting_traits<double, uint32_t> : vectorized_type_casting_traits<double, uint32_t> {};
+template <>
+struct type_casting_traits<uint32_t, double> : vectorized_type_casting_traits<uint32_t, double> {};
+template <>
+struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
+template <>
+struct type_casting_traits<double, uint64_t> : vectorized_type_casting_traits<double, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, double> : vectorized_type_casting_traits<uint64_t, double> {};
+
+// int8_t <-> wider integer types
+template <>
+struct type_casting_traits<int8_t, int16_t> : vectorized_type_casting_traits<int8_t, int16_t> {};
+template <>
+struct type_casting_traits<int16_t, int8_t> : vectorized_type_casting_traits<int16_t, int8_t> {};
+template <>
+struct type_casting_traits<int8_t, uint16_t> : vectorized_type_casting_traits<int8_t, uint16_t> {};
+template <>
+struct type_casting_traits<uint16_t, int8_t> : vectorized_type_casting_traits<uint16_t, int8_t> {};
+template <>
+struct type_casting_traits<int8_t, int32_t> : vectorized_type_casting_traits<int8_t, int32_t> {};
+template <>
+struct type_casting_traits<int32_t, int8_t> : vectorized_type_casting_traits<int32_t, int8_t> {};
+template <>
+struct type_casting_traits<int8_t, uint32_t> : vectorized_type_casting_traits<int8_t, uint32_t> {};
+template <>
+struct type_casting_traits<uint32_t, int8_t> : vectorized_type_casting_traits<uint32_t, int8_t> {};
+template <>
+struct type_casting_traits<int8_t, int64_t> : vectorized_type_casting_traits<int8_t, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, int8_t> : vectorized_type_casting_traits<int64_t, int8_t> {};
+template <>
+struct type_casting_traits<int8_t, uint64_t> : vectorized_type_casting_traits<int8_t, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, int8_t> : vectorized_type_casting_traits<uint64_t, int8_t> {};
+
+// uint8_t <-> wider integer types
+template <>
+struct type_casting_traits<uint8_t, int16_t> : vectorized_type_casting_traits<uint8_t, int16_t> {};
+template <>
+struct type_casting_traits<int16_t, uint8_t> : vectorized_type_casting_traits<int16_t, uint8_t> {};
+template <>
+struct type_casting_traits<uint8_t, uint16_t> : vectorized_type_casting_traits<uint8_t, uint16_t> {};
+template <>
+struct type_casting_traits<uint16_t, uint8_t> : vectorized_type_casting_traits<uint16_t, uint8_t> {};
+template <>
+struct type_casting_traits<uint8_t, int32_t> : vectorized_type_casting_traits<uint8_t, int32_t> {};
+template <>
+struct type_casting_traits<int32_t, uint8_t> : vectorized_type_casting_traits<int32_t, uint8_t> {};
+template <>
+struct type_casting_traits<uint8_t, uint32_t> : vectorized_type_casting_traits<uint8_t, uint32_t> {};
+template <>
+struct type_casting_traits<uint32_t, uint8_t> : vectorized_type_casting_traits<uint32_t, uint8_t> {};
+template <>
+struct type_casting_traits<uint8_t, int64_t> : vectorized_type_casting_traits<uint8_t, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, uint8_t> : vectorized_type_casting_traits<int64_t, uint8_t> {};
+template <>
+struct type_casting_traits<uint8_t, uint64_t> : vectorized_type_casting_traits<uint8_t, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, uint8_t> : vectorized_type_casting_traits<uint64_t, uint8_t> {};
+
+// int16_t <-> wider integer types
+template <>
+struct type_casting_traits<int16_t, int32_t> : vectorized_type_casting_traits<int16_t, int32_t> {};
+template <>
+struct type_casting_traits<int32_t, int16_t> : vectorized_type_casting_traits<int32_t, int16_t> {};
+template <>
+struct type_casting_traits<int16_t, uint32_t> : vectorized_type_casting_traits<int16_t, uint32_t> {};
+template <>
+struct type_casting_traits<uint32_t, int16_t> : vectorized_type_casting_traits<uint32_t, int16_t> {};
+template <>
+struct type_casting_traits<int16_t, int64_t> : vectorized_type_casting_traits<int16_t, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, int16_t> : vectorized_type_casting_traits<int64_t, int16_t> {};
+template <>
+struct type_casting_traits<int16_t, uint64_t> : vectorized_type_casting_traits<int16_t, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, int16_t> : vectorized_type_casting_traits<uint64_t, int16_t> {};
+
+// uint16_t <-> wider integer types
+template <>
+struct type_casting_traits<uint16_t, int32_t> : vectorized_type_casting_traits<uint16_t, int32_t> {};
+template <>
+struct type_casting_traits<int32_t, uint16_t> : vectorized_type_casting_traits<int32_t, uint16_t> {};
+template <>
+struct type_casting_traits<uint16_t, uint32_t> : vectorized_type_casting_traits<uint16_t, uint32_t> {};
+template <>
+struct type_casting_traits<uint32_t, uint16_t> : vectorized_type_casting_traits<uint32_t, uint16_t> {};
+template <>
+struct type_casting_traits<uint16_t, int64_t> : vectorized_type_casting_traits<uint16_t, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, uint16_t> : vectorized_type_casting_traits<int64_t, uint16_t> {};
+template <>
+struct type_casting_traits<uint16_t, uint64_t> : vectorized_type_casting_traits<uint16_t, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, uint16_t> : vectorized_type_casting_traits<uint64_t, uint16_t> {};
+
+// int32_t <-> 64-bit integer types
+template <>
+struct type_casting_traits<int32_t, int64_t> : vectorized_type_casting_traits<int32_t, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, int32_t> : vectorized_type_casting_traits<int64_t, int32_t> {};
+template <>
+struct type_casting_traits<int32_t, uint64_t> : vectorized_type_casting_traits<int32_t, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, int32_t> : vectorized_type_casting_traits<uint64_t, int32_t> {};
+
+// uint32_t <-> 64-bit integer types
+template <>
+struct type_casting_traits<uint32_t, int64_t> : vectorized_type_casting_traits<uint32_t, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, uint32_t> : vectorized_type_casting_traits<int64_t, uint32_t> {};
+template <>
+struct type_casting_traits<uint32_t, uint64_t> : vectorized_type_casting_traits<uint32_t, uint64_t> {};
+template <>
+struct type_casting_traits<uint64_t, uint32_t> : vectorized_type_casting_traits<uint64_t, uint32_t> {};
+
 //==============================================================================
 // preinterpret
 //==============================================================================
@@ -93,42 +279,42 @@ EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l&
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
   Packet2d tmp = __lsx_vfcvtl_d_s(a);
-  return __lsx_vftint_l_d(tmp);
+  return __lsx_vftintrz_l_d(tmp);  // rz variant: rounds toward zero, like a C++ float->int cast
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
   Packet2d tmp = __lsx_vfcvtl_d_s(a);
-  return __lsx_vftint_lu_d(tmp);
+  return __lsx_vftintrz_lu_d(tmp);  // rz variant: rounds toward zero, like a C++ float->int cast
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
-  return __lsx_vftint_w_s(a);
+  return __lsx_vftintrz_w_s(a);  // rz variant: rounds toward zero, like a C++ float->int cast
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
-  return __lsx_vftint_wu_s(a);
+  return __lsx_vftintrz_wu_s(a);  // rz variant: rounds toward zero, like a C++ float->int cast
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet4f, Packet8s>(const Packet4f& a, const Packet4f& b) {
-  return __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0);
+  return __lsx_vpickev_h(__lsx_vftintrz_w_s(b), __lsx_vftintrz_w_s(a));  // truncate, keep low 16 bits; a first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet4f, Packet8us>(const Packet4f& a, const Packet4f& b) {
-  return __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0);
+  return __lsx_vpickev_h(__lsx_vftintrz_wu_s(b), __lsx_vftintrz_wu_s(a));  // truncate, keep low 16 bits; a first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet4f, Packet16c>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
                                                          const Packet4f& d) {
-  Packet8s tmp1 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0);
-  Packet8s tmp2 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(c), __lsx_vftint_w_s(d), 0);
-  return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet8s tmp1 = __lsx_vpickev_h(__lsx_vftintrz_w_s(b), __lsx_vftintrz_w_s(a));  // a, b -> 16-bit lanes
+  Packet8s tmp2 = __lsx_vpickev_h(__lsx_vftintrz_w_s(d), __lsx_vftintrz_w_s(c));  // c, d -> 16-bit lanes
+  return __lsx_vpickev_b((__m128i)tmp2, (__m128i)tmp1);  // keep low bytes; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet4f, Packet16uc>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
                                                            const Packet4f& d) {
-  Packet8us tmp1 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0);
-  Packet8us tmp2 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(c), __lsx_vftint_wu_s(d), 0);
-  return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet8us tmp1 = __lsx_vpickev_h(__lsx_vftintrz_wu_s(b), __lsx_vftintrz_wu_s(a));  // a, b -> 16-bit lanes
+  Packet8us tmp2 = __lsx_vpickev_h(__lsx_vftintrz_wu_s(d), __lsx_vftintrz_wu_s(c));  // c, d -> 16-bit lanes
+  return __lsx_vpickev_b((__m128i)tmp2, (__m128i)tmp1);  // keep low bytes; tmp1 (a, b) first
 }
 
 template <>
@@ -230,11 +416,11 @@ EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet8s, Packet16c>(const Packet8s& a, const Packet8s& b) {
-  return __lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0);
+  return __lsx_vpickev_b((__m128i)b, (__m128i)a);  // low byte of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet8s, Packet16uc>(const Packet8s& a, const Packet8s& b) {
-  return (Packet16uc)__lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0);
+  return (Packet16uc)__lsx_vpickev_b((__m128i)b, (__m128i)a);  // low byte of each lane; a's lanes first
 }
 
 template <>
@@ -262,11 +448,11 @@ EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet8us, Packet16uc>(const Packet8us& a, const Packet8us& b) {
-  return __lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0);
+  return __lsx_vpickev_b((__m128i)b, (__m128i)a);  // low byte of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
-  return (Packet16c)__lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0);
+  return (Packet16c)__lsx_vpickev_b((__m128i)b, (__m128i)a);  // low byte of each lane; a's lanes first
 }
 
 template <>
@@ -283,25 +469,25 @@ EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet4i, Packet8s>(const Packet4i& a, const Packet4i& b) {
-  return __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+  return __lsx_vpickev_h((__m128i)b, (__m128i)a);  // low 16 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet4i, Packet8us>(const Packet4i& a, const Packet4i& b) {
-  return (Packet8us)__lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+  return (Packet8us)__lsx_vpickev_h((__m128i)b, (__m128i)a);  // low 16 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet4i, Packet16c>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
                                                          const Packet4i& d) {
-  Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
-  Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0);
-  return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet8s tmp1 = __lsx_vpickev_h((__m128i)b, (__m128i)a);  // a, b -> 16-bit lanes
+  Packet8s tmp2 = __lsx_vpickev_h((__m128i)d, (__m128i)c);  // c, d -> 16-bit lanes
+  return __lsx_vpickev_b((__m128i)tmp2, (__m128i)tmp1);  // keep low bytes; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet4i, Packet16uc>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
                                                            const Packet4i& d) {
-  Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
-  Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0);
-  return (Packet16uc)__lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet8s tmp1 = __lsx_vpickev_h((__m128i)b, (__m128i)a);  // a, b -> 16-bit lanes
+  Packet8s tmp2 = __lsx_vpickev_h((__m128i)d, (__m128i)c);  // c, d -> 16-bit lanes
+  return (Packet16uc)__lsx_vpickev_b((__m128i)tmp2, (__m128i)tmp1);  // keep low bytes; tmp1 (a, b) first
 }
 
 template <>
@@ -318,52 +504,52 @@ EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet4ui, Packet8us>(const Packet4ui& a, const Packet4ui& b) {
-  return __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+  return __lsx_vpickev_h((__m128i)b, (__m128i)a);  // low 16 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
-  return (Packet8s)__lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+  return (Packet8s)__lsx_vpickev_h((__m128i)b, (__m128i)a);  // low 16 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet4ui, Packet16uc>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
                                                             const Packet4ui& d) {
-  Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
-  Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0);
-  return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet8us tmp1 = __lsx_vpickev_h((__m128i)b, (__m128i)a);  // a, b -> 16-bit lanes
+  Packet8us tmp2 = __lsx_vpickev_h((__m128i)d, (__m128i)c);  // c, d -> 16-bit lanes
+  return __lsx_vpickev_b((__m128i)tmp2, (__m128i)tmp1);  // keep low bytes; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
                                                           const Packet4ui& d) {
-  Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
-  Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0);
-  return (Packet16c)__lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet8us tmp1 = __lsx_vpickev_h((__m128i)b, (__m128i)a);  // a, b -> 16-bit lanes
+  Packet8us tmp2 = __lsx_vpickev_h((__m128i)d, (__m128i)c);  // c, d -> 16-bit lanes
+  return (Packet16c)__lsx_vpickev_b((__m128i)tmp2, (__m128i)tmp1);  // keep low bytes; tmp1 (a, b) first
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4f pcast<Packet2l, Packet4f>(const Packet2l& a, const Packet2l& b) {
-  return __lsx_vffint_s_w(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0));
+  return __lsx_vfcvt_s_d(__lsx_vffint_d_l(b), __lsx_vffint_d_l(a));  // int64 -> double -> float; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet2l, Packet4i>(const Packet2l& a, const Packet2l& b) {
-  return __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+  return __lsx_vpickev_w((__m128i)b, (__m128i)a);  // low 32 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet2l, Packet4ui>(const Packet2l& a, const Packet2l& b) {
-  return (Packet4ui)__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+  return (Packet4ui)__lsx_vpickev_w((__m128i)b, (__m128i)a);  // low 32 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet2l, Packet8s>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
                                                        const Packet2l& d) {
-  Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
-  Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
-  return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet4i tmp1 = __lsx_vpickev_w((__m128i)b, (__m128i)a);  // a, b -> 32-bit lanes
+  Packet4i tmp2 = __lsx_vpickev_w((__m128i)d, (__m128i)c);  // c, d -> 32-bit lanes
+  return __lsx_vpickev_h((__m128i)tmp2, (__m128i)tmp1);  // keep low 16 bits; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet2l, Packet8us>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
                                                          const Packet2l& d) {
-  Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
-  Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
-  return (Packet8us)__lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet4i tmp1 = __lsx_vpickev_w((__m128i)b, (__m128i)a);  // a, b -> 32-bit lanes
+  Packet4i tmp2 = __lsx_vpickev_w((__m128i)d, (__m128i)c);  // c, d -> 32-bit lanes
+  return (Packet8us)__lsx_vpickev_h((__m128i)tmp2, (__m128i)tmp1);  // keep low 16 bits; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
@@ -371,7 +557,7 @@ EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, cons
                                                          const Packet2l& g, const Packet2l& h) {
   const Packet8s abcd = pcast<Packet2l, Packet8s>(a, b, c, d);
   const Packet8s efgh = pcast<Packet2l, Packet8s>(e, f, g, h);
-  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+  return __lsx_vpickev_b((__m128i)efgh, (__m128i)abcd);
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
@@ -379,34 +565,34 @@ EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, co
                                                            const Packet2l& g, const Packet2l& h) {
   const Packet8us abcd = pcast<Packet2l, Packet8us>(a, b, c, d);
   const Packet8us efgh = pcast<Packet2l, Packet8us>(e, f, g, h);
-  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+  return __lsx_vpickev_b((__m128i)efgh, (__m128i)abcd);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4f pcast<Packet2ul, Packet4f>(const Packet2ul& a, const Packet2ul& b) {
-  return __lsx_vffint_s_wu(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0));
+  return __lsx_vfcvt_s_d(__lsx_vffint_d_lu(b), __lsx_vffint_d_lu(a));  // uint64 -> double -> float; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet2ul, Packet4ui>(const Packet2ul& a, const Packet2ul& b) {
-  return __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+  return __lsx_vpickev_w((__m128i)b, (__m128i)a);  // low 32 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
-  return (Packet4i)__lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+  return (Packet4i)__lsx_vpickev_w((__m128i)b, (__m128i)a);  // low 32 bits of each lane; a's lanes first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet2ul, Packet8us>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                           const Packet2ul& d) {
-  Packet4ui tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
-  Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
-  return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet4ui tmp1 = __lsx_vpickev_w((__m128i)b, (__m128i)a);  // a, b -> 32-bit lanes
+  Packet4ui tmp2 = __lsx_vpickev_w((__m128i)d, (__m128i)c);  // c, d -> 32-bit lanes
+  return __lsx_vpickev_h((__m128i)tmp2, (__m128i)tmp1);  // keep low 16 bits; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
                                                         const Packet2ul& d) {
-  Packet4ui tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
-  Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
-  return (Packet8s)__lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet4ui tmp1 = __lsx_vpickev_w((__m128i)b, (__m128i)a);  // a, b -> 32-bit lanes
+  Packet4ui tmp2 = __lsx_vpickev_w((__m128i)d, (__m128i)c);  // c, d -> 32-bit lanes
+  return (Packet8s)__lsx_vpickev_h((__m128i)tmp2, (__m128i)tmp1);  // keep low 16 bits; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
@@ -414,7 +600,7 @@ EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a,
                                                             const Packet2ul& g, const Packet2ul& h) {
   const Packet8s abcd = pcast<Packet2ul, Packet8s>(a, b, c, d);
   const Packet8s efgh = pcast<Packet2ul, Packet8s>(e, f, g, h);
-  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+  return __lsx_vpickev_b((__m128i)efgh, (__m128i)abcd);
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
@@ -422,7 +608,7 @@ EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, co
                                                           const Packet2ul& g, const Packet2ul& h) {
   const Packet8us abcd = pcast<Packet2ul, Packet8us>(a, b, c, d);
   const Packet8us efgh = pcast<Packet2ul, Packet8us>(e, f, g, h);
-  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+  return __lsx_vpickev_b((__m128i)efgh, (__m128i)abcd);
 }
 
 template <>
@@ -431,33 +617,33 @@ EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const
 }
 template <>
 EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
-  return __lsx_vftint_l_d(a);
+  return __lsx_vftintrz_l_d(a);  // rz variant: rounds toward zero, like a C++ double->int cast
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d, Packet2ul>(const Packet2d& a) {
-  return __lsx_vftint_lu_d(a);
+  return __lsx_vftintrz_lu_d(a);  // rz variant: rounds toward zero, like a C++ double->int cast
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
-  return __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0);
+  return __lsx_vpickev_w(__lsx_vftintrz_l_d(b), __lsx_vftintrz_l_d(a));  // truncate, keep low 32 bits; a first
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ui pcast<Packet2d, Packet4ui>(const Packet2d& a, const Packet2d& b) {
-  return __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0);
+  return __lsx_vpickev_w(__lsx_vftintrz_lu_d(b), __lsx_vftintrz_lu_d(a));  // truncate, keep low 32 bits; a first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pcast<Packet2d, Packet8s>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
                                                        const Packet2d& d) {
-  Packet4i tmp1 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0);
-  Packet4i tmp2 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(c), __lsx_vftint_l_d(d), 0);
-  return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet4i tmp1 = __lsx_vpickev_w(__lsx_vftintrz_l_d(b), __lsx_vftintrz_l_d(a));  // a, b -> 32-bit lanes
+  Packet4i tmp2 = __lsx_vpickev_w(__lsx_vftintrz_l_d(d), __lsx_vftintrz_l_d(c));  // c, d -> 32-bit lanes
+  return __lsx_vpickev_h((__m128i)tmp2, (__m128i)tmp1);  // keep low 16 bits; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pcast<Packet2d, Packet8us>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
                                                          const Packet2d& d) {
-  Packet4ui tmp1 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0);
-  Packet4ui tmp2 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(c), __lsx_vftint_lu_d(d), 0);
-  return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+  Packet4ui tmp1 = __lsx_vpickev_w(__lsx_vftintrz_lu_d(b), __lsx_vftintrz_lu_d(a));  // a, b -> 32-bit lanes
+  Packet4ui tmp2 = __lsx_vpickev_w(__lsx_vftintrz_lu_d(d), __lsx_vftintrz_lu_d(c));  // c, d -> 32-bit lanes
+  return __lsx_vpickev_h((__m128i)tmp2, (__m128i)tmp1);  // keep low 16 bits; tmp1 (a, b) first
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
@@ -465,7 +651,7 @@ EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, cons
                                                          const Packet2d& g, const Packet2d& h) {
   const Packet8s abcd = pcast<Packet2d, Packet8s>(a, b, c, d);
   const Packet8s efgh = pcast<Packet2d, Packet8s>(e, f, g, h);
-  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+  return __lsx_vpickev_b((__m128i)efgh, (__m128i)abcd);
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
@@ -473,7 +659,7 @@ EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, co
                                                            const Packet2d& g, const Packet2d& h) {
   const Packet8us abcd = pcast<Packet2d, Packet8us>(a, b, c, d);
   const Packet8us efgh = pcast<Packet2d, Packet8us>(e, f, g, h);
-  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+  return __lsx_vpickev_b((__m128i)efgh, (__m128i)abcd);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
index 2d2fbbca469..a0771a19884 100644
--- a/Eigen/src/Core/arch/MSA/Complex.h
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -100,12 +100,14 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
     HasMul = 1,
     HasDiv = 1,
     HasNegate = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasExp = 1,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
     HasMax = 0,
     HasSetLinear = 0,
-    HasBlend = 1
   };
 };
 
@@ -314,12 +316,6 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   kernel.packet[1].v = tmp;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
-                                     const Packet2cf& elsePacket) {
-  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v, (Packet2d)elsePacket.v);
-}
-
 //---------- double ----------
 
 struct Packet1cd {
@@ -400,6 +396,8 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
     HasMul = 1,
     HasDiv = 1,
     HasNegate = 1,
+    HasSqrt = 1,
+    HasLog = 1,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -613,6 +611,9 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
   kernel.packet[1].v = v2;
 }
 
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet2cf)
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS_NO_EXP(Packet1cd)  // complex<double> traits gain HasSqrt/HasLog, no HasExp
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
index 81da24f8dd5..40e4ccdc265 100644
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -35,18 +35,7 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #endif
 
-#if 0
-#define EIGEN_MSA_DEBUG                                                             \
-  static bool firstTime = true;                                                     \
-  do {                                                                              \
-    if (firstTime) {                                                                \
-      std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
-      firstTime = false;                                                            \
-    }                                                                               \
-  } while (0)
-#else
 #define EIGEN_MSA_DEBUG
-#endif
 
 #define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
 
@@ -81,7 +70,7 @@ struct packet_traits<float> : default_packet_traits {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    // FIXME check the Has*
+    // FIXME: verify the Has* flags.
     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
@@ -91,7 +80,6 @@ struct packet_traits<float> : default_packet_traits {
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasBlend = 1
   };
 };
 
@@ -103,9 +91,8 @@ struct packet_traits<int32_t> : default_packet_traits {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    // FIXME check the Has*
+    // FIXME: verify the Has* flags.
     HasDiv = 1,
-    HasBlend = 1
   };
 };
 
@@ -803,19 +790,36 @@ EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
-                                    const Packet4f& elsePacket) {
-  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
-  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
-  return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  // frint.w uses the current rounding mode (default: round to nearest, ties to even).
+  Packet4f v = a;
+  asm volatile("frint.w %w[v], %w[v]\n" : [v] "+f"(v));
+  return v;
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
-                                    const Packet4i& elsePacket) {
-  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
-  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
-  return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 2\n"  // 1 = round toward zero.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__builtin_msa_fcult_w(a, b);
 }
 
 //---------- double ----------
@@ -851,12 +855,11 @@ struct packet_traits<double> : default_packet_traits {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    // FIXME check the Has*
+    // FIXME: verify the Has* flags.
     HasDiv = 1,
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasBlend = 1
   };
 };
 
@@ -1223,11 +1226,36 @@ EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
-                                    const Packet2d& elsePacket) {
-  Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
-  Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
-  return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  // frint.d uses the current rounding mode (default: round to nearest, ties to even).
+  Packet2d v = a;
+  asm volatile("frint.d %w[v], %w[v]\n" : [v] "+f"(v));
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 2\n"  // 1 = round toward zero.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__builtin_msa_fcult_d(a, b);
 }
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 4190d1bd1ac..48ac0cfcfdd 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -48,7 +48,7 @@ struct Packet2cf {
 };
 
 template <>
-struct packet_traits<std::complex<float> > : default_packet_traits {
+struct packet_traits<std::complex<float>> : default_packet_traits {
   typedef Packet2cf type;
   typedef Packet1cf half;
   enum {
@@ -73,30 +73,13 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet1cf> {
-  typedef std::complex<float> type;
-  typedef Packet1cf half;
-  typedef Packet2f as_real;
-  enum {
-    size = 1,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet1cf> : neon_unpacket_default<Packet1cf, std::complex<float>> {
+  using as_real = Packet2f;
 };
 template <>
-struct unpacket_traits<Packet2cf> {
-  typedef std::complex<float> type;
-  typedef Packet1cf half;
-  typedef Packet4f as_real;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet2cf> : neon_unpacket_default<Packet2cf, std::complex<float>> {
+  using half = Packet1cf;
+  using as_real = Packet4f;
 };
 
 template <>
@@ -297,11 +280,13 @@ EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(
+      pload<Packet2f>(assume_aligned<unpacket_traits<Packet1cf>::alignment>(reinterpret_cast<const float*>(from))));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from)));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(
+      pload<Packet4f>(assume_aligned<unpacket_traits<Packet2cf>::alignment>(reinterpret_cast<const float*>(from))));
 }
 
 template <>
@@ -323,20 +308,22 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* fro
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
+EIGEN_STRONG_INLINE void pstore<std::complex<float>>(std::complex<float>* to, const Packet1cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(assume_aligned<unpacket_traits<Packet1cf>::alignment>(reinterpret_cast<float*>(to)),
+                                   from.v);
 }
 template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v);
+EIGEN_STRONG_INLINE void pstore<std::complex<float>>(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(assume_aligned<unpacket_traits<Packet2cf>::alignment>(reinterpret_cast<float*>(to)),
+                                   from.v);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float>>(std::complex<float>* to, const Packet1cf& from) {
   EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v);
 }
 template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float>>(std::complex<float>* to, const Packet2cf& from) {
   EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v);
 }
 
@@ -369,7 +356,7 @@ EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::comp
 }
 
 template <>
-EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+EIGEN_STRONG_INLINE void prefetch<std::complex<float>>(const std::complex<float>* addr) {
   EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr));
 }
 
@@ -469,35 +456,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   kernel.packet[1].v = tmp;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
-  return psqrt_complex<Packet1cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
-  return psqrt_complex<Packet2cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cf plog<Packet1cf>(const Packet1cf& a) {
-  return plog_complex(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
-  return plog_complex(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cf pexp<Packet1cf>(const Packet1cf& a) {
-  return pexp_complex(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
-  return pexp_complex(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet1cf)
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet2cf)
 
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
@@ -514,7 +474,7 @@ struct Packet1cd {
 };
 
 template <>
-struct packet_traits<std::complex<double> > : default_packet_traits {
+struct packet_traits<std::complex<double>> : default_packet_traits {
   typedef Packet1cd type;
   typedef Packet1cd half;
   enum {
@@ -529,6 +489,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
     HasNegate = 1,
     HasSqrt = 1,
     HasLog = 1,
+    HasExp = 1,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -538,22 +499,14 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet1cd> {
-  typedef std::complex<double> type;
-  typedef Packet1cd half;
-  typedef Packet2d as_real;
-  enum {
-    size = 1,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet1cd> : neon_unpacket_default<Packet1cd, std::complex<double>> {
+  using as_real = Packet2d;
 };
 
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from)));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(
+      pload<Packet2d>(assume_aligned<unpacket_traits<Packet1cd>::alignment>(reinterpret_cast<const double*>(from))));
 }
 
 template <>
@@ -665,17 +618,18 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* fr
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v);
+EIGEN_STRONG_INLINE void pstore<std::complex<double>>(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(assume_aligned<unpacket_traits<Packet1cd>::alignment>(reinterpret_cast<double*>(to)),
+                                   from.v);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double>>(std::complex<double>* to, const Packet1cd& from) {
   EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v);
 }
 
 template <>
-EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+EIGEN_STRONG_INLINE void prefetch<std::complex<double>>(const std::complex<double>* addr) {
   EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr));
 }
 
@@ -697,7 +651,7 @@ EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::com
 template <>
 EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
   EIGEN_ALIGN16 std::complex<double> res;
-  pstore<std::complex<double> >(&res, a);
+  pstore<std::complex<double>>(&res, a);
   return res;
 }
 
@@ -733,15 +687,7 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
   kernel.packet[1].v = tmp;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
-  return psqrt_complex<Packet1cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
-  return plog_complex(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet1cd)
 
 #endif  // EIGEN_ARCH_ARM64
 
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 0046e01efb8..5c48db80f42 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -33,12 +33,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet8hf ptanh<Packet8hf>(const Packet8hf
 }
 #endif  // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 
-BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
-BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
-BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
-BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
-BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp2)
-BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_BF16(Packet4f, Packet4bf)
 
 template <>
 EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) {
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 9364cffca9f..fb222b846de 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -189,22 +189,29 @@ struct packet_traits<float> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
     HasATanh = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
     HasLog = 1,
+    HasLog10 = 1,
     HasExp = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasPow = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasCbrt = 1,
@@ -234,12 +241,10 @@ struct packet_traits<int8_t> : default_packet_traits {
     HasAbs = 1,
     HasAbsDiff = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0
   };
 };
 
@@ -261,12 +266,10 @@ struct packet_traits<uint8_t> : default_packet_traits {
     HasAbs = 1,
     HasAbsDiff = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
 
     HasSqrt = 1
   };
@@ -290,12 +293,10 @@ struct packet_traits<int16_t> : default_packet_traits {
     HasAbs = 1,
     HasAbsDiff = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0
   };
 };
 
@@ -317,12 +318,10 @@ struct packet_traits<uint16_t> : default_packet_traits {
     HasAbs = 1,
     HasAbsDiff = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
     HasSqrt = 1
   };
 };
@@ -344,13 +343,11 @@ struct packet_traits<int32_t> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0
   };
 };
 
@@ -371,13 +368,11 @@ struct packet_traits<uint32_t> : default_packet_traits {
     HasNegate = 0,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
 
     HasSqrt = 1
   };
@@ -400,13 +395,11 @@ struct packet_traits<int64_t> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0
   };
 };
 
@@ -427,234 +420,82 @@ struct packet_traits<uint64_t> : default_packet_traits {
     HasNegate = 0,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0
   };
 };
 
-template <>
-struct unpacket_traits<Packet2f> {
-  typedef float type;
-  typedef Packet2f half;
-  typedef Packet2i integer_packet;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+template <typename Packet, typename Scalar>
+struct neon_unpacket_default {
+  using type = Scalar;
+  using half = Packet;
+  static constexpr int size = sizeof(Packet) / sizeof(Scalar);
+  static constexpr int alignment = sizeof(Packet);
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
 };
+
 template <>
-struct unpacket_traits<Packet4f> {
-  typedef float type;
-  typedef Packet2f half;
-  typedef Packet4i integer_packet;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
+  using integer_packet = Packet2i;
 };
 template <>
-struct unpacket_traits<Packet4c> {
-  typedef int8_t type;
-  typedef Packet4c half;
-  enum {
-    size = 4,
-    alignment = Unaligned,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
+  using half = Packet2f;
+  using integer_packet = Packet4i;
 };
 template <>
-struct unpacket_traits<Packet8c> {
-  typedef int8_t type;
-  typedef Packet4c half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
 template <>
-struct unpacket_traits<Packet16c> {
-  typedef int8_t type;
-  typedef Packet8c half;
-  enum {
-    size = 16,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
+  using half = Packet4c;
 };
 template <>
-struct unpacket_traits<Packet4uc> {
-  typedef uint8_t type;
-  typedef Packet4uc half;
-  enum {
-    size = 4,
-    alignment = Unaligned,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
+  using half = Packet8c;
 };
 template <>
-struct unpacket_traits<Packet8uc> {
-  typedef uint8_t type;
-  typedef Packet4uc half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
 template <>
-struct unpacket_traits<Packet16uc> {
-  typedef uint8_t type;
-  typedef Packet8uc half;
-  enum {
-    size = 16,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
+  using half = Packet4uc;
 };
 template <>
-struct unpacket_traits<Packet4s> {
-  typedef int16_t type;
-  typedef Packet4s half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
+  using half = Packet8uc;
 };
 template <>
-struct unpacket_traits<Packet8s> {
-  typedef int16_t type;
-  typedef Packet4s half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
 template <>
-struct unpacket_traits<Packet4us> {
-  typedef uint16_t type;
-  typedef Packet4us half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
+  using half = Packet4s;
 };
 template <>
-struct unpacket_traits<Packet8us> {
-  typedef uint16_t type;
-  typedef Packet4us half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
 template <>
-struct unpacket_traits<Packet2i> {
-  typedef int32_t type;
-  typedef Packet2i half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
+  using half = Packet4us;
 };
 template <>
-struct unpacket_traits<Packet4i> {
-  typedef int32_t type;
-  typedef Packet2i half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
 template <>
-struct unpacket_traits<Packet2ui> {
-  typedef uint32_t type;
-  typedef Packet2ui half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
+  using half = Packet2i;
 };
 template <>
-struct unpacket_traits<Packet4ui> {
-  typedef uint32_t type;
-  typedef Packet2ui half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
 template <>
-struct unpacket_traits<Packet2l> {
-  typedef int64_t type;
-  typedef Packet2l half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
+  using half = Packet2ui;
 };
 template <>
-struct unpacket_traits<Packet2ul> {
-  typedef uint64_t type;
-  typedef Packet2ul half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
+template <>
+struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
 
 template <>
 EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) {
@@ -2417,11 +2258,11 @@ EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
@@ -2431,11 +2272,11 @@ EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
@@ -2445,51 +2286,51 @@ EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(from));
 }
 
 template <>
@@ -2660,38 +2501,60 @@ template <>
 EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
   return vld1q_dup_f32(from);
 }
+
+// WORKAROUND: Apple Clang 17.0.0 (and Homebrew Clang 21.1.8) at -O0 optimization
+// generate incorrect code for vld1_dup_[su]8, ignoring the pointer offset.
+// We use vdup_n_s8(*from) to force a safe scalar load before broadcast.
+EIGEN_ALWAYS_INLINE int8x8_t eigen_vld1_dup_s8(const int8_t* ptr) {
+#if EIGEN_COMP_CLANGAPPLE && EIGEN_ARCH_ARM64
+  return vdup_n_s8(*ptr);
+#else
+  return vld1_dup_s8(ptr);
+#endif
+}
+
+EIGEN_ALWAYS_INLINE uint8x8_t eigen_vld1_dup_u8(const uint8_t* ptr) {
+#if EIGEN_COMP_CLANGAPPLE && EIGEN_ARCH_ARM64
+  return vdup_n_u8(*ptr);
+#else
+  return vld1_dup_u8(ptr);
+#endif
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from) {
-  return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
+  return vget_lane_s32(vreinterpret_s32_s8(eigen_vld1_dup_s8(from)), 0);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from) {
   return vreinterpret_s8_u32(
-      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+      vzip_u32(vreinterpret_u32_s8(eigen_vld1_dup_s8(from)), vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 1))).val[0]);
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
   const int8x8_t a = vreinterpret_s8_u32(
-      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+      vzip_u32(vreinterpret_u32_s8(eigen_vld1_dup_s8(from)), vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 1))).val[0]);
   const int8x8_t b = vreinterpret_s8_u32(
-      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
+      vzip_u32(vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 2)), vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 3)))
+          .val[0]);
   return vcombine_s8(a, b);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from) {
-  return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
+  return vget_lane_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from)), 0);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from) {
   return vreinterpret_u8_u32(
-      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+      vzip_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from)), vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 1))).val[0]);
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
   const uint8x8_t a = vreinterpret_u8_u32(
-      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+      vzip_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from)), vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 1))).val[0]);
   const uint8x8_t b = vreinterpret_u8_u32(
-      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
+      vzip_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 2)), vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 3)))
+          .val[0]);
   return vcombine_u8(a, b);
 }
 template <>
@@ -2713,11 +2576,11 @@ EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
 
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
@@ -2725,11 +2588,11 @@ EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
@@ -2737,51 +2600,51 @@ EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(to), from);
 }
 
 template <>
@@ -3665,27 +3528,27 @@ EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
 #endif
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half(const Packet8c& a) {
   return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half(const Packet16c& a) {
   return vadd_s8(vget_high_s8(a), vget_low_s8(a));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half(const Packet8uc& a) {
   return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half(const Packet16uc& a) {
   return vadd_u8(vget_high_u8(a), vget_low_u8(a));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half(const Packet8s& a) {
   return vadd_s16(vget_high_s16(a), vget_low_s16(a));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half(const Packet8us& a) {
   return vadd_u16(vget_high_u16(a), vget_low_u16(a));
 }
 
@@ -4130,8 +3993,16 @@ EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
 
 template <>
 EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
-  uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
-  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
+  uint32x4_t u = vreinterpretq_u32_f32(x);
+#if EIGEN_ARCH_ARM64
+  return vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(u)), 0);
+#else
+  uint32x2_t tmp = vorr_u32(vget_low_u32(u), vget_high_u32(u));
+  uint32_t a, b;
+  // GCC and Clang refuse to emit this instruction.
+  asm("vmov %0, %1, %P2" : "=r"(a), "=r"(b) : "w"(tmp));
+  return a | b;
+#endif
 }
 
 // Helpers for ptranspose.
@@ -4780,13 +4651,11 @@ struct packet_traits<bfloat16> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
@@ -4801,17 +4670,7 @@ struct packet_traits<bfloat16> : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet4bf> {
-  typedef bfloat16 type;
-  typedef Packet4bf half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
 
 namespace detail {
 template <>
@@ -4866,7 +4725,8 @@ EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
 
 template <>
 EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
-  return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
+  return Packet4bf(
+      pload<Packet4us>(reinterpret_cast<const uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(from))));
 }
 
 template <>
@@ -4876,7 +4736,8 @@ EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
 
 template <>
 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(
+      reinterpret_cast<uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(to)), from);
 }
 
 template <>
@@ -5173,24 +5034,31 @@ struct packet_traits<double> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
 
     HasDiv = 1,
 
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
     HasExp = 1,
     HasLog = 1,
+    HasLog10 = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasPow = 1,
     HasATan = 1,
     HasATanh = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
 #endif
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasCbrt = 1,
@@ -5201,17 +5069,8 @@ struct packet_traits<double> : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet2d> {
-  typedef double type;
-  typedef Packet2d half;
-  typedef Packet2l integer_packet;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
+  using integer_packet = Packet2l;
 };
 
 template <>
@@ -5373,7 +5232,7 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
 
 template <>
 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(from));
 }
 
 template <>
@@ -5387,7 +5246,7 @@ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(to), from);
 }
 
 template <>
@@ -5555,13 +5414,11 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasAbsDiff = 0,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 1,
-    HasBlend = 0,
     HasInsert = 1,
     HasReduxp = 1,
     HasDiv = 1,
@@ -5579,33 +5436,14 @@ struct packet_traits<Eigen::half> : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet4hf> {
-  typedef Eigen::half type;
-  typedef Packet4hf half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
+struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
 template <>
-struct unpacket_traits<Packet8hf> {
-  typedef Eigen::half type;
-  typedef Packet4hf half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
+  using half = Packet4hf;
 };
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half<Packet8hf>(const Packet8hf& a) {
   return vadd_f16(vget_low_f16(a), vget_high_f16(a));
 }
 
@@ -5934,12 +5772,14 @@ EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(
+      reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(from)));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(
+      reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(from)));
 }
 
 template <>
@@ -6014,12 +5854,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a,
 
 template <>
 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(
+      reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(to)), from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(
+      reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(to)), from);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index 58d7b8cc980..748d701555a 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@gmail.com>
 // Copyright (C) 2020 Antonio Sanchez <cantonios@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
diff --git a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h
new file mode 100644
index 00000000000..026e1dbc130
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva <kseniya.zaytseva@syntacore.com>
+// Copyright (C) 2025 Chip Kerchner <ckerchner@tenstorrent.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H
+#define EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/********************************* real ************************************/
+
+template <>
+struct gebp_traits<float, float, false, false, Architecture::RVV10, GEBPPacketFull>
+    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
+  typedef float RhsPacket;
+  typedef QuadPacket<float> RhsPacketx4;
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+    c = __riscv_vfmadd_vf_f32m2(a, b, c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+    c = __riscv_vfmadd_vf_f32m4(a, b, c, unpacket_traits<AccPacket>::size);
+#endif
+  }
+
+#if EIGEN_RISCV64_DEFAULT_LMUL >= 2
+  EIGEN_STRONG_INLINE void madd(const Packet1Xf& a, const RhsPacket& b, Packet1Xf& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+  }
+#endif
+#if EIGEN_RISCV64_DEFAULT_LMUL == 4
+  EIGEN_STRONG_INLINE void madd(const Packet2Xf& a, const RhsPacket& b, Packet2Xf& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = __riscv_vfmadd_vf_f32m2(a, b, c, unpacket_traits<Packet2Xf>::size);
+  }
+#endif
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const LaneIdType& lane) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = __riscv_vfmadd_vf_f32m1(a, b.get(lane), c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+    c = __riscv_vfmadd_vf_f32m2(a, b.get(lane), c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+    c = __riscv_vfmadd_vf_f32m4(a, b.get(lane), c, unpacket_traits<AccPacket>::size);
+#endif
+  }
+};
+
+template <>
+struct gebp_traits<double, double, false, false, Architecture::RVV10, GEBPPacketFull>
+    : gebp_traits<double, double, false, false, Architecture::Generic, GEBPPacketFull> {
+  typedef double RhsPacket;
+  typedef QuadPacket<double> RhsPacketx4;
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+    c = __riscv_vfmadd_vf_f64m2(a, b, c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+    c = __riscv_vfmadd_vf_f64m4(a, b, c, unpacket_traits<AccPacket>::size);
+#endif
+  }
+
+#if EIGEN_RISCV64_DEFAULT_LMUL >= 2
+  EIGEN_STRONG_INLINE void madd(const Packet1Xd& a, const RhsPacket& b, Packet1Xd& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+  }
+#endif
+#if EIGEN_RISCV64_DEFAULT_LMUL == 4
+  EIGEN_STRONG_INLINE void madd(const Packet2Xd& a, const RhsPacket& b, Packet2Xd& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = __riscv_vfmadd_vf_f64m2(a, b, c, unpacket_traits<Packet2Xd>::size);
+  }
+#endif
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const LaneIdType& lane) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = __riscv_vfmadd_vf_f64m1(a, b.get(lane), c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+    c = __riscv_vfmadd_vf_f64m2(a, b.get(lane), c, unpacket_traits<AccPacket>::size);
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+    c = __riscv_vfmadd_vf_f64m4(a, b.get(lane), c, unpacket_traits<AccPacket>::size);
+#endif
+  }
+};
+
+#if defined(EIGEN_VECTORIZE_RVV10FP16)
+
+template <>
+struct gebp_traits<half, half, false, false, Architecture::RVV10>
+    : gebp_traits<half, half, false, false, Architecture::Generic> {
+  typedef half RhsPacket;
+  typedef PacketXh LhsPacket;
+  typedef PacketXh AccPacket;
+  typedef QuadPacket<half> RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload<RhsPacket>(b); }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = __riscv_vfmadd_vf_f16m1(a, numext::bit_cast<_Float16>(b), c, unpacket_traits<AccPacket>::size);
+#else
+    c = __riscv_vfmadd_vf_f16m2(a, numext::bit_cast<_Float16>(b), c, unpacket_traits<AccPacket>::size);
+#endif
+  }
+
+#if EIGEN_RISCV64_DEFAULT_LMUL >= 2
+  EIGEN_STRONG_INLINE void madd(const Packet1Xh& a, const RhsPacket& b, Packet1Xh& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = __riscv_vfmadd_vf_f16m1(a, numext::bit_cast<_Float16>(b), c, unpacket_traits<Packet1Xh>::size);
+  }
+#endif
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const LaneIdType& lane) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = __riscv_vfmadd_vf_f16m1(a, numext::bit_cast<_Float16>(b.get(lane)), c, unpacket_traits<AccPacket>::size);
+#else
+    c = __riscv_vfmadd_vf_f16m2(a, numext::bit_cast<_Float16>(b.get(lane)), c, unpacket_traits<AccPacket>::size);
+#endif
+  }
+};
+
+#endif
+
+#if defined(EIGEN_VECTORIZE_RVV10BF16)
+
+template <>
+struct gebp_traits<bfloat16, bfloat16, false, false, Architecture::RVV10>
+    : gebp_traits<bfloat16, bfloat16, false, false, Architecture::Generic> {
+  typedef bfloat16 RhsPacket;
+  typedef PacketXbf LhsPacket;
+  typedef PacketXbf AccPacket;
+  typedef QuadPacket<bfloat16> RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload<RhsPacket>(b); }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = F32ToBf16(
+        __riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(c), numext::bit_cast<__bf16>(b), a, unpacket_traits<AccPacket>::size));
+#else
+    c = F32ToBf16(
+        __riscv_vfwmaccbf16_vf_f32m4(Bf16ToF32(c), numext::bit_cast<__bf16>(b), a, unpacket_traits<AccPacket>::size));
+#endif
+  }
+
+#if EIGEN_RISCV64_DEFAULT_LMUL >= 2
+  EIGEN_STRONG_INLINE void madd(const Packet1Xbf& a, const RhsPacket& b, Packet1Xbf& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = F32ToBf16(
+        __riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(c), numext::bit_cast<__bf16>(b), a, unpacket_traits<Packet1Xbf>::size));
+  }
+#endif
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const LaneIdType& lane) const {
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+    c = F32ToBf16(__riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(c), numext::bit_cast<__bf16>(b.get(lane)), a,
+                                               unpacket_traits<AccPacket>::size));
+#else
+    c = F32ToBf16(__riscv_vfwmaccbf16_vf_f32m4(Bf16ToF32(c), numext::bit_cast<__bf16>(b.get(lane)), a,
+                                               unpacket_traits<AccPacket>::size));
+#endif
+  }
+};
+
+#endif
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H
diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h
new file mode 100644
index 00000000000..10a70c446ce
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h
@@ -0,0 +1,30 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva <kseniya.zaytseva@syntacore.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_RVV10_H
+#define EIGEN_MATH_FUNCTIONS_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet1Xf)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet2Xf)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet4Xf)
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet1Xd)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet2Xd)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet4Xd)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
new file mode 100644
index 00000000000..679d5c1fc2c
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -0,0 +1,2442 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva <kseniya.zaytseva@syntacore.com>
+// Copyright (C) 2025 Chip Kerchner <ckerchner@tenstorrent.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_RVV10_H
+#define EIGEN_PACKET_MATH_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+
+template <typename Scalar, std::size_t VectorLength, std::size_t VectorLMul>
+struct rvv_packet_size_selector {
+  enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) };
+};
+
+template <std::size_t VectorLength, std::size_t VectorLMul>
+struct rvv_packet_alignment_selector {
+  enum {
+    alignment =
+        (VectorLength * VectorLMul) >= 1024
+            ? Aligned128
+            : ((VectorLength * VectorLMul) >= 512 ? Aligned64
+                                                  : ((VectorLength * VectorLMul) >= 256 ? Aligned32 : Aligned16))
+  };
+};
+
+typedef vbool64_t PacketMask64;
+typedef vbool32_t PacketMask32;
+typedef vbool16_t PacketMask16;
+typedef vbool8_t PacketMask8;
+typedef vbool4_t PacketMask4;
+
+/********************************* int32 **************************************/
+typedef eigen_packet_wrapper<vint32m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 0> Packet1Xi;
+typedef eigen_packet_wrapper<vuint32m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 1> Packet1Xu;
+
+typedef eigen_packet_wrapper<vint32m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 2> Packet2Xi;
+typedef eigen_packet_wrapper<vuint32m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 3> Packet2Xu;
+
+typedef eigen_packet_wrapper<vint32m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 4> Packet4Xi;
+typedef eigen_packet_wrapper<vuint32m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 5> Packet4Xu;
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xi PacketXi;
+typedef Packet1Xu PacketXu;
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef Packet1Xi type;
+  typedef Packet1Xi half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xi PacketXi;
+typedef Packet2Xu PacketXu;
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef Packet2Xi type;
+  typedef Packet1Xi half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xi PacketXi;
+typedef Packet4Xu PacketXu;
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef Packet4Xi type;
+  typedef Packet2Xi half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet1Xi> {
+  typedef numext::int32_t type;
+  typedef Packet1Xi half;  // Half not yet implemented
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2Xi> {
+  typedef numext::int32_t type;
+  typedef Packet1Xi half;
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>  // Packet -> scalar traits for the LMUL-4 int32 packet.
+struct unpacket_traits<Packet4Xi> {
+  typedef numext::int32_t type;
+  typedef Packet2Xi half;  // half of an LMUL-4 packet is the LMUL-2 packet
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,  // neither masked loads nor masked stores are declared for this packet
+    masked_store_available = false
+  };
+};
+
+template <>  // Prefetch hint for int32 data at addr.
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr) {
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  __builtin_prefetch(addr);
+#endif  // when neither condition holds the body is empty and the call does nothing
+}
+
+/********************************* Packet1Xi ************************************/
+
+template <>  // pset1: packet with `from` in every lane.
+EIGEN_STRONG_INLINE Packet1Xi pset1<Packet1Xi>(const numext::int32_t& from) {
+  return __riscv_vmv_v_x_i32m1(from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // plset: lane i holds a + i.
+EIGEN_STRONG_INLINE Packet1Xi plset<Packet1Xi>(const numext::int32_t& a) {
+  Packet1Xi idx = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size));  // 0, 1, 2, ...
+  return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pzero: all-zero int32 packet; the argument is unused.
+EIGEN_STRONG_INLINE Packet1Xi pzero<Packet1Xi>(const Packet1Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // padd: lane-wise a + b.
+EIGEN_STRONG_INLINE Packet1Xi padd<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vadd_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // psub: lane-wise a - b.
+EIGEN_STRONG_INLINE Packet1Xi psub<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pnegate: lane-wise -a.
+EIGEN_STRONG_INLINE Packet1Xi pnegate(const Packet1Xi& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pconj: returns the input unchanged for this integer packet.
+EIGEN_STRONG_INLINE Packet1Xi pconj(const Packet1Xi& a) {
+  return a;
+}
+
+template <>  // pmul: lane-wise a * b.
+EIGEN_STRONG_INLINE Packet1Xi pmul<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pdiv: lane-wise signed integer division a / b.
+EIGEN_STRONG_INLINE Packet1Xi pdiv<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pmadd: a * b + c per lane, via vmadd.
+EIGEN_STRONG_INLINE Packet1Xi pmadd(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pmsub: a * b - c per lane.
+EIGEN_STRONG_INLINE Packet1Xi pmsub(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet1Xi>::size);  // multiply-add with c negated first
+}
+
+template <>  // pnmadd: c - a * b per lane, via vnmsub.
+EIGEN_STRONG_INLINE Packet1Xi pnmadd(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pnmsub: -(a * b) - c per lane.
+EIGEN_STRONG_INLINE Packet1Xi pnmsub(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits<Packet1Xi>::size);  // vnmsub with c negated first
+}
+
+template <>  // pmin: lane-wise signed minimum.
+EIGEN_STRONG_INLINE Packet1Xi pmin<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pmax: lane-wise signed maximum.
+EIGEN_STRONG_INLINE Packet1Xi pmax<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pcmp_le: lanes with a <= b become 0xffffffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet1Xi pcmp_le<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<Packet1Xi>::size);  // expand mask bits to full lanes
+}
+
+template <>  // pcmp_lt: lanes with a < b become 0xffffffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet1Xi pcmp_lt<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<Packet1Xi>::size);  // expand mask bits to full lanes
+}
+
+template <>  // pcmp_eq: lanes with a == b become 0xffffffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet1Xi pcmp_eq<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<Packet1Xi>::size);  // expand mask bits to full lanes
+}
+
+template <>  // ptrue: every lane set to 0xffffffff; the argument is unused.
+EIGEN_STRONG_INLINE Packet1Xi ptrue<Packet1Xi>(const Packet1Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pand: bitwise a & b.
+EIGEN_STRONG_INLINE Packet1Xi pand<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vand_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // por: bitwise a | b.
+EIGEN_STRONG_INLINE Packet1Xi por<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vor_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pxor: bitwise a ^ b.
+EIGEN_STRONG_INLINE Packet1Xi pxor<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vxor_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pandnot: bitwise a & ~b.
+EIGEN_STRONG_INLINE Packet1Xi pandnot<Packet1Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits<Packet1Xi>::size),  // b is complemented first
+                               unpacket_traits<Packet1Xi>::size);
+}
+
+template <int N>  // Arithmetic (vsra) right shift of every lane by the compile-time amount N.
+EIGEN_STRONG_INLINE Packet1Xi parithmetic_shift_right(Packet1Xi a) {
+  return __riscv_vsra_vx_i32m1(a, N, unpacket_traits<Packet1Xi>::size);
+}
+
+template <int N>  // Logical right shift by N: done on the unsigned reinterpretation (vsrl), then cast back.
+EIGEN_STRONG_INLINE Packet1Xi plogical_shift_right(Packet1Xi a) {
+  return __riscv_vreinterpret_i32m1(
+      __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits<Packet1Xi>::size));
+}
+
+template <int N>  // Left shift of every lane by the compile-time amount N.
+EIGEN_STRONG_INLINE Packet1Xi plogical_shift_left(Packet1Xi a) {
+  return __riscv_vsll_vx_i32m1(a, N, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pload: unit-stride load of `size` int32 values, tagged with the aligned-load debug hook.
+EIGEN_STRONG_INLINE Packet1Xi pload<Packet1Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // ploadu: same vle32 load as pload; only the debug hook macro differs.
+EIGEN_STRONG_INLINE Packet1Xi ploadu<Packet1Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // ploaddup: {from[0], from[0], from[1], from[1], ...}
+EIGEN_STRONG_INLINE Packet1Xi ploaddup<Packet1Xi>(const numext::int32_t* from) {  // reads only size / 2 scalars
+  EIGEN_DEBUG_ALIGNED_LOAD Packet1Xu data = __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vle32_v_i32m1(from, unpacket_traits<Packet1Xi>::size / 2));
+  return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vlmul_trunc_v_u64m2_u64m1(
+      __riscv_vwmaccu_vx_u64m2(__riscv_vwaddu_vv_u64m2(data, data, unpacket_traits<Packet1Xi>::size), 0xffffffffu, data,
+                               unpacket_traits<Packet1Xi>::size))));  // data * (2^32 + 1): two copies per 64-bit lane
+}
+
+template <>  // ploadquad: {from[0] x4, from[1] x4, ...}
+EIGEN_STRONG_INLINE Packet1Xi ploadquad<Packet1Xi>(const numext::int32_t* from) {  // reads only ceil(size / 4) scalars
+  EIGEN_DEBUG_ALIGNED_LOAD Packet1Xi src = __riscv_vle32_v_i32m1(from, (unpacket_traits<Packet1Xi>::size + 3) / 4);
+  Packet1Xu idx = __riscv_vsrl_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size), 2, unpacket_traits<Packet1Xi>::size);  // i / 4
+  return __riscv_vrgather_vv_i32m1(src, idx, unpacket_traits<Packet1Xi>::size);  // idx never reaches a lane that was not loaded
+}
+
+template <>  // pstore: unit-stride store of `size` int32 lanes, tagged with the aligned-store debug hook.
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const Packet1Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pstoreu: same vse32 store as pstore; only the debug hook macro differs.
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const Packet1Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pgather: strided load, lane i reads from[i * stride].
+EIGEN_DEVICE_FUNC inline Packet1Xi pgather<numext::int32_t, Packet1Xi>(const numext::int32_t* from, Index stride) {
+  return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits<Packet1Xi>::size);  // stride given in bytes
+}
+
+template <>  // pscatter: strided store, lane i is written to to[i * stride].
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, Packet1Xi>(numext::int32_t* to, const Packet1Xi& from,
+                                                                   Index stride) {
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<Packet1Xi>::size);  // stride given in bytes
+}
+
+template <>  // pfirst: lane 0 as a scalar.
+EIGEN_STRONG_INLINE numext::int32_t pfirst<Packet1Xi>(const Packet1Xi& a) {
+  return __riscv_vmv_x_s_i32m1_i32(a);
+}
+
+template <>  // preverse: lane i of the result is lane size - 1 - i of a.
+EIGEN_STRONG_INLINE Packet1Xi preverse(const Packet1Xi& a) {
+  Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size),
+                                         unpacket_traits<Packet1Xi>::size - 1, unpacket_traits<Packet1Xi>::size);  // (size - 1) - i
+  return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // pabs: lane-wise absolute value computed as (a ^ mask) - mask.
+EIGEN_STRONG_INLINE Packet1Xi pabs(const Packet1Xi& a) {
+  Packet1Xi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits<Packet1Xi>::size);  // sign bit smeared: 0 or -1 per lane
+  return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits<Packet1Xi>::size), mask,
+                               unpacket_traits<Packet1Xi>::size);
+}
+
+template <>  // predux: sum of all lanes, using a vredsum reduction seeded with 0.
+EIGEN_STRONG_INLINE numext::int32_t predux<Packet1Xi>(const Packet1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet1Xi>::size),
+                                                      unpacket_traits<Packet1Xi>::size));
+}
+
+template <>  // predux_mul: product of all lanes of a.
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<Packet1Xi>(const Packet1Xi& a) {
+  // Seed with a * reverse(a): lane i then carries a[i] * a[size - 1 - i], so the
+  // lower half of the lanes already accounts for every input element once.
+  Packet1Xi acc = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits<Packet1Xi>::size);
+
+  // Fold the remaining lanes pairwise: each step multiplies lane i by lane i + k.
+  // Which steps are taken depends only on the compile-time vector length.
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    acc = __riscv_vmul_vv_i32m1(acc, __riscv_vslidedown_vx_i32m1(acc, 8, unpacket_traits<Packet1Xi>::size),
+                                unpacket_traits<Packet1Xi>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    acc = __riscv_vmul_vv_i32m1(acc, __riscv_vslidedown_vx_i32m1(acc, 4, unpacket_traits<Packet1Xi>::size),
+                                unpacket_traits<Packet1Xi>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    acc = __riscv_vmul_vv_i32m1(acc, __riscv_vslidedown_vx_i32m1(acc, 2, unpacket_traits<Packet1Xi>::size),
+                                unpacket_traits<Packet1Xi>::size);
+  }
+  // Final fold of lane 1 into lane 0; lane 0 then holds the full product.
+  acc = __riscv_vmul_vv_i32m1(acc, __riscv_vslidedown_vx_i32m1(acc, 1, unpacket_traits<Packet1Xi>::size),
+                              unpacket_traits<Packet1Xi>::size);
+  return pfirst(acc);
+}
+
+template <>  // predux_min: minimum over all lanes; the reduction is seeded with the largest int32.
+EIGEN_STRONG_INLINE numext::int32_t predux_min<Packet1Xi>(const Packet1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<Packet1Xi>::size),
+      unpacket_traits<Packet1Xi>::size));
+}
+
+template <>  // predux_max: maximum over all lanes; the reduction is seeded with the smallest int32.
+EIGEN_STRONG_INLINE numext::int32_t predux_max<Packet1Xi>(const Packet1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<Packet1Xi>::size),
+      unpacket_traits<Packet1Xi>::size));
+}
+
+template <int N>  // ptranspose: reshuffles the N packets of `kernel` through a zero-initialised scratch buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xi, N>& kernel) {
+  numext::int32_t scratch[unpacket_traits<Packet1Xi>::size * N] = {0};
+  // Strided store: lane j of packet p lands at scratch[j * N + p].
+  for (int p = 0; p < N; ++p) {
+    __riscv_vsse32(scratch + p, N * sizeof(numext::int32_t), kernel.packet[p], unpacket_traits<Packet1Xi>::size);
+  }
+  // Unit-stride reload: packet p takes the p-th run of `size` consecutive scalars.
+  for (int p = 0; p < N; ++p) {
+    kernel.packet[p] =
+        __riscv_vle32_v_i32m1(scratch + p * unpacket_traits<Packet1Xi>::size, unpacket_traits<Packet1Xi>::size);
+  }
+}
+
+/********************************* float32 ************************************/
+
+typedef eigen_packet_wrapper<vfloat32m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 6> Packet1Xf;
+typedef eigen_packet_wrapper<vfloat32m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 7>
+    Packet2Xf;  // LMUL 2: fixed vector width is 2 * EIGEN_RISCV64_RVV_VL bits
+typedef eigen_packet_wrapper<vfloat32m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 8>
+    Packet4Xf;  // LMUL 4: 4 * EIGEN_RISCV64_RVV_VL bits; the trailing integer differs per wrapper (presumably a type tag)
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xf PacketXf;  // default float packet when EIGEN_RISCV64_DEFAULT_LMUL == 1
+
+template <>  // float scalar -> packet mapping for the LMUL == 1 configuration
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet1Xf type;
+  typedef Packet1Xf half;  // no smaller packet is used here: half aliases the full packet
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,  // Has* entries: 1 enables the op for this packet, 0 leaves it off (Eigen trait convention)
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,  // sin/cos/tan/tanh/erf entries take their value from EIGEN_FAST_MATH
+    HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xf PacketXf;  // default float packet when EIGEN_RISCV64_DEFAULT_LMUL == 2
+
+template <>  // float scalar -> packet mapping for the LMUL == 2 configuration
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet2Xf type;
+  typedef Packet1Xf half;  // the half packet is the LMUL-1 float packet
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,  // Has* entries: 1 enables the op for this packet, 0 leaves it off (Eigen trait convention)
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,  // sin/cos/tanh/erf entries take their value from EIGEN_FAST_MATH
+    HasCos = EIGEN_FAST_MATH,  // NOTE(review): no HasTan entry here, unlike the LMUL == 1 table; confirm this is intended
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xf PacketXf;  // default float packet when EIGEN_RISCV64_DEFAULT_LMUL == 4
+
+template <>  // float scalar -> packet mapping for the LMUL == 4 configuration
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4Xf type;
+  typedef Packet2Xf half;  // the half packet is the LMUL-2 float packet
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,  // Has* entries: 1 enables the op for this packet, 0 leaves it off (Eigen trait convention)
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,  // sin/cos/tanh/erf entries take their value from EIGEN_FAST_MATH
+    HasCos = EIGEN_FAST_MATH,  // NOTE(review): no HasTan entry here, unlike the LMUL == 1 table; confirm this is intended
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+#endif
+
+template <>  // Packet -> scalar traits for the LMUL-1 float packet.
+struct unpacket_traits<Packet1Xf> {
+  typedef float type;
+  typedef Packet1Xf half;  // Half not yet implemented, so this aliases the packet itself
+  typedef Packet1Xi integer_packet;  // int32 packet with the same LMUL
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask32 packet_mask;  // mask type used by the Packet1Xf comparison helpers
+
+  enum {
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,  // neither masked loads nor masked stores are declared for this packet
+    masked_store_available = false
+  };
+};
+
+template <>  // Packet -> scalar traits for the LMUL-2 float packet.
+struct unpacket_traits<Packet2Xf> {
+  typedef float type;
+  typedef Packet1Xf half;  // half of an LMUL-2 packet is the LMUL-1 packet
+  typedef Packet2Xi integer_packet;  // int32 packet with the same LMUL
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask16 packet_mask;  // mask type paired with this packet
+
+  enum {
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,  // neither masked loads nor masked stores are declared for this packet
+    masked_store_available = false
+  };
+};
+
+template <>  // Packet -> scalar traits for the LMUL-4 float packet.
+struct unpacket_traits<Packet4Xf> {
+  typedef float type;
+  typedef Packet2Xf half;  // half of an LMUL-4 packet is the LMUL-2 packet
+  typedef Packet4Xi integer_packet;  // int32 packet with the same LMUL
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask8 packet_mask;  // mask type paired with this packet
+
+  enum {
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,  // neither masked loads nor masked stores are declared for this packet
+    masked_store_available = false
+  };
+};
+
+/********************************* Packet1Xf ************************************/
+
+template <>  // ptrue: every lane has all 32 bits set (built as u32 and reinterpreted); the argument is unused.
+EIGEN_STRONG_INLINE Packet1Xf ptrue<Packet1Xf>(const Packet1Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits<Packet1Xf>::size));
+}
+
+template <>  // pzero: packet of 0.0f; the argument is unused.
+EIGEN_STRONG_INLINE Packet1Xf pzero<Packet1Xf>(const Packet1Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pabs: lane-wise absolute value.
+EIGEN_STRONG_INLINE Packet1Xf pabs(const Packet1Xf& a) {
+  return __riscv_vfabs_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pabsdiff: lane-wise |a - b|.
+EIGEN_STRONG_INLINE Packet1Xf pabsdiff(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfabs_v_f32m1(__riscv_vfsub_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size),
+                               unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pset1: packet with `from` in every lane.
+EIGEN_STRONG_INLINE Packet1Xf pset1<Packet1Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m1(from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pset1frombits: every lane carries the raw 32-bit pattern `from`, reinterpreted as float.
+EIGEN_STRONG_INLINE Packet1Xf pset1frombits<Packet1Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits<Packet1Xf>::size));
+}
+
+template <>  // plset: lane i holds a + i.
+EIGEN_STRONG_INLINE Packet1Xf plset<Packet1Xf>(const float& a) {
+  Packet1Xf idx = __riscv_vfcvt_f_x_v_f32m1(  // lane indices 0, 1, 2, ... converted from int32 to float
+      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size)),
+      unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pbroadcast4: a0..a3 are each filled with one of a[0]..a[3].
+EIGEN_STRONG_INLINE void pbroadcast4<Packet1Xf>(const float* a, Packet1Xf& a0, Packet1Xf& a1, Packet1Xf& a2,
+                                                Packet1Xf& a3) {
+  vfloat32m1_t aa = __riscv_vle32_v_f32m1(a, 4);  // load exactly four scalars
+  a0 = __riscv_vrgather_vx_f32m1(aa, 0, unpacket_traits<Packet1Xf>::size);  // splat lane 0 across the packet
+  a1 = __riscv_vrgather_vx_f32m1(aa, 1, unpacket_traits<Packet1Xf>::size);
+  a2 = __riscv_vrgather_vx_f32m1(aa, 2, unpacket_traits<Packet1Xf>::size);
+  a3 = __riscv_vrgather_vx_f32m1(aa, 3, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // padd: lane-wise a + b.
+EIGEN_STRONG_INLINE Packet1Xf padd<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // psub: lane-wise a - b.
+EIGEN_STRONG_INLINE Packet1Xf psub<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pnegate: lane-wise -a.
+EIGEN_STRONG_INLINE Packet1Xf pnegate(const Packet1Xf& a) {
+  return __riscv_vfneg_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // psignbit: the sign bit of each lane is replicated into all 32 bits of that lane.
+EIGEN_STRONG_INLINE Packet1Xf psignbit(const Packet1Xf& a) {
+  return __riscv_vreinterpret_v_i32m1_f32m1(  // arithmetic shift by 31 on the int32 view, then back to float
+      __riscv_vsra_vx_i32m1(__riscv_vreinterpret_v_f32m1_i32m1(a), 31, unpacket_traits<Packet1Xi>::size));
+}
+
+template <>  // pconj: returns the input unchanged for this real-valued packet.
+EIGEN_STRONG_INLINE Packet1Xf pconj(const Packet1Xf& a) {
+  return a;
+}
+
+template <>  // pmul: lane-wise a * b.
+EIGEN_STRONG_INLINE Packet1Xf pmul<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pdiv: lane-wise a / b.
+EIGEN_STRONG_INLINE Packet1Xf pdiv<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pmadd: a * b + c per lane, via vfmadd.
+EIGEN_STRONG_INLINE Packet1Xf pmadd(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pmsub: a * b - c per lane, via vfmsub.
+EIGEN_STRONG_INLINE Packet1Xf pmsub(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pnmadd: c - a * b per lane, via vfnmsub.
+EIGEN_STRONG_INLINE Packet1Xf pnmadd(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pnmsub: -(a * b) - c per lane, via vfnmadd.
+EIGEN_STRONG_INLINE Packet1Xf pnmsub(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pmin: lane-wise minimum; a lane is NaN whenever either input lane is NaN.
+EIGEN_STRONG_INLINE Packet1Xf pmin<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  Packet1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);  // a == a: a is not NaN
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<Packet1Xf>::size);  // b == b: b is not NaN
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<Packet1Xf>::size);
+
+  return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xf>::size);  // masked-off lanes keep the NaN
+}
+
+template <>  // PropagateNaN variant: forwards to the default pmin<Packet1Xf>.
+EIGEN_STRONG_INLINE Packet1Xf pmin<PropagateNaN, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return pmin<Packet1Xf>(a, b);
+}
+
+template <>  // PropagateNumbers variant: plain vfmin with no extra NaN masking around it.
+EIGEN_STRONG_INLINE Packet1Xf pmin<PropagateNumbers, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pmax: lane-wise maximum; a lane is NaN whenever either input lane is NaN.
+EIGEN_STRONG_INLINE Packet1Xf pmax<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  Packet1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);  // a == a: a is not NaN
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<Packet1Xf>::size);  // b == b: b is not NaN
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<Packet1Xf>::size);
+
+  return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xf>::size);  // masked-off lanes keep the NaN
+}
+
+template <>  // PropagateNaN variant: forwards to the default pmax<Packet1Xf>.
+EIGEN_STRONG_INLINE Packet1Xf pmax<PropagateNaN, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return pmax<Packet1Xf>(a, b);
+}
+
+template <>  // PropagateNumbers variant: plain vfmax with no extra NaN masking around it.
+EIGEN_STRONG_INLINE Packet1Xf pmax<PropagateNumbers, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pcmp_le: lanes with a <= b become all-ones bits, all other lanes 0.0f.
+EIGEN_STRONG_INLINE Packet1Xf pcmp_le<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero<Packet1Xf>(a), ptrue<Packet1Xf>(a), mask, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pcmp_lt: lanes with a < b become all-ones bits, all other lanes 0.0f.
+EIGEN_STRONG_INLINE Packet1Xf pcmp_lt<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero<Packet1Xf>(a), ptrue<Packet1Xf>(a), mask, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pcmp_eq: lanes with a == b become all-ones bits, all other lanes 0.0f.
+EIGEN_STRONG_INLINE Packet1Xf pcmp_eq<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero<Packet1Xf>(a), ptrue<Packet1Xf>(a), mask, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pcmp_lt_or_nan: computed as the complement of (a >= b).
+EIGEN_STRONG_INLINE Packet1Xf pcmp_lt_or_nan<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);  // set where a >= b
+  return __riscv_vfmerge_vfm_f32m1(ptrue<Packet1Xf>(a), 0.0f, mask, unpacket_traits<Packet1Xf>::size);  // 0.0f there, all-ones elsewhere
+}
+
+// Bitwise logical ops are applied on the u32 reinterpretation of the float lanes and the result is cast back.
+template <>  // pand: bitwise a & b.
+EIGEN_STRONG_INLINE Packet1Xf pand<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>  // por: bitwise a | b on the u32 view of the float lanes.
+EIGEN_STRONG_INLINE Packet1Xf por<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>  // pxor: bitwise a ^ b on the u32 view of the float lanes.
+EIGEN_STRONG_INLINE Packet1Xf pxor<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>  // pandnot: bitwise a & ~b on the u32 view of the float lanes.
+EIGEN_STRONG_INLINE Packet1Xf pandnot<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a),
+      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size),  // ~b
+      unpacket_traits<Packet1Xf>::size));
+}
+
+template <>  // pload: unit-stride load of `size` floats, tagged with the aligned-load debug hook.
+EIGEN_STRONG_INLINE Packet1Xf pload<Packet1Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // ploadu: same vle32 load as pload; only the debug hook macro differs.
+EIGEN_STRONG_INLINE Packet1Xf ploadu<Packet1Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // ploaddup: {from[0], from[0], from[1], from[1], ...}
+EIGEN_STRONG_INLINE Packet1Xf ploaddup<Packet1Xf>(const float* from) {  // reads only size / 2 floats
+  EIGEN_DEBUG_ALIGNED_LOAD Packet1Xu data = __riscv_vreinterpret_v_f32m1_u32m1(__riscv_vle32_v_f32m1(from, unpacket_traits<Packet1Xf>::size / 2));
+  return __riscv_vreinterpret_v_i32m1_f32m1(
+      __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vlmul_trunc_v_u64m2_u64m1(
+          __riscv_vwmaccu_vx_u64m2(__riscv_vwaddu_vv_u64m2(data, data, unpacket_traits<Packet1Xi>::size), 0xffffffffu,
+                                   data, unpacket_traits<Packet1Xi>::size)))));  // data * (2^32 + 1): two copies per 64-bit lane
+}
+
+template <>  // ploadquad: {from[0] x4, from[1] x4, ...}
+EIGEN_STRONG_INLINE Packet1Xf ploadquad<Packet1Xf>(const float* from) {  // reads only ceil(size / 4) floats
+  EIGEN_DEBUG_ALIGNED_LOAD Packet1Xf src = __riscv_vle32_v_f32m1(from, (unpacket_traits<Packet1Xf>::size + 3) / 4);
+  Packet1Xu idx = __riscv_vsrl_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size), 2, unpacket_traits<Packet1Xf>::size);  // i / 4
+  return __riscv_vrgather_vv_f32m1(src, idx, unpacket_traits<Packet1Xf>::size);  // idx never reaches a lane that was not loaded
+}
+
+template <>  // pstore: unit-stride store of `size` float lanes, tagged with the aligned-store debug hook.
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet1Xf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pstoreu: same vse32 store as pstore; only the debug hook macro differs.
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet1Xf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pgather: strided load, lane i reads from[i * stride].
+EIGEN_DEVICE_FUNC inline Packet1Xf pgather<float, Packet1Xf>(const float* from, Index stride) {
+  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<Packet1Xf>::size);  // stride given in bytes
+}
+
+template <>  // pscatter: strided store, lane i is written to to[i * stride].
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet1Xf>(float* to, const Packet1Xf& from, Index stride) {
+  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<Packet1Xf>::size);  // stride given in bytes
+}
+
+template <>  // pfirst: lane 0 as a scalar.
+EIGEN_STRONG_INLINE float pfirst<Packet1Xf>(const Packet1Xf& a) {
+  return __riscv_vfmv_f_s_f32m1_f32(a);
+}
+
+template <>  // psqrt: lane-wise square root.
+EIGEN_STRONG_INLINE Packet1Xf psqrt(const Packet1Xf& a) {
+  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // print: rounds each lane to an integral value through a float -> int32 -> float round trip.
+EIGEN_STRONG_INLINE Packet1Xf print<Packet1Xf>(const Packet1Xf& a) {
+  const Packet1Xf limit = pset1<Packet1Xf>(static_cast<float>(1 << 23));  // 2^23: lanes at or above this magnitude are left alone
+  const Packet1Xf abs_a = pabs(a);
+
+  PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);  // a != a: NaN lanes
+  const Packet1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits<Packet1Xf>::size);  // a + a in NaN lanes, a elsewhere
+  const Packet1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits<Packet1Xf>::size),
+                                                    unpacket_traits<Packet1Xf>::size);
+
+  mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits<Packet1Xf>::size);  // lanes with |a| < 2^23
+  Packet1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits<Packet1Xf>::size);  // rounded magnitude with x's sign
+  return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits<Packet1Xf>::size);  // rounded value only where |a| < 2^23
+}
+
+template <>  // pfloor: starts from print(a) and corrects the lanes that were rounded upwards.
+EIGEN_STRONG_INLINE Packet1Xf pfloor<Packet1Xf>(const Packet1Xf& a) {
+  Packet1Xf tmp = print<Packet1Xf>(a);
+  // Where the rounded value exceeds a, subtract one; all other lanes keep tmp.
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // preverse: lane i of the result is lane size - 1 - i of a.
+EIGEN_STRONG_INLINE Packet1Xf preverse(const Packet1Xf& a) {
+  Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size),
+                                         unpacket_traits<Packet1Xf>::size - 1, unpacket_traits<Packet1Xf>::size);  // (size - 1) - i
+  return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pfrexp: delegates to pfrexp_generic, which also fills `exponent`.
+EIGEN_STRONG_INLINE Packet1Xf pfrexp<Packet1Xf>(const Packet1Xf& a, Packet1Xf& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>  // predux: sum of all lanes, using the unordered vfredusum reduction seeded with 0.
+EIGEN_STRONG_INLINE float predux<Packet1Xf>(const Packet1Xf& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(
+      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<Packet1Xf>::size), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>  // predux_mul: product of all lanes of a.
+EIGEN_STRONG_INLINE float predux_mul<Packet1Xf>(const Packet1Xf& a) {
+  // Seed with a * reverse(a): lane i then carries a[i] * a[size - 1 - i], so the
+  // lower half of the lanes already accounts for every input element once.
+  Packet1Xf acc = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits<Packet1Xf>::size);
+
+  // Fold the remaining lanes pairwise: each step multiplies lane i by lane i + k.
+  // Which steps are taken depends only on the compile-time vector length.
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    acc = __riscv_vfmul_vv_f32m1(acc, __riscv_vslidedown_vx_f32m1(acc, 8, unpacket_traits<Packet1Xf>::size),
+                                 unpacket_traits<Packet1Xf>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    acc = __riscv_vfmul_vv_f32m1(acc, __riscv_vslidedown_vx_f32m1(acc, 4, unpacket_traits<Packet1Xf>::size),
+                                 unpacket_traits<Packet1Xf>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    acc = __riscv_vfmul_vv_f32m1(acc, __riscv_vslidedown_vx_f32m1(acc, 2, unpacket_traits<Packet1Xf>::size),
+                                 unpacket_traits<Packet1Xf>::size);
+  }
+  // Final fold of lane 1 into lane 0; lane 0 then holds the full product.
+  acc = __riscv_vfmul_vv_f32m1(acc, __riscv_vslidedown_vx_f32m1(acc, 1, unpacket_traits<Packet1Xf>::size),
+                               unpacket_traits<Packet1Xf>::size);
+  return pfirst(acc);
+}
+
+template <>  // predux_min: vfredmin over all lanes, then capped at the largest finite float.
+EIGEN_STRONG_INLINE float predux_min<Packet1Xf>(const Packet1Xf& a) {
+  return (std::min)(
+      __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(
+          a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size),  // NaN seed
+          unpacket_traits<Packet1Xf>::size)),
+      (std::numeric_limits<float>::max)());
+}
+
+template <>  // predux_max: vfredmax over all lanes, then floored at the most negative finite float.
+EIGEN_STRONG_INLINE float predux_max<Packet1Xf>(const Packet1Xf& a) {
+  return (std::max)(
+      __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(
+          a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size),  // NaN seed
+          unpacket_traits<Packet1Xf>::size)),
+      -(std::numeric_limits<float>::max)());
+}
+
+template <int N>  // ptranspose: reshuffles the N packets of `kernel` through a scratch buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xf, N>& kernel) {
+  float scratch[unpacket_traits<Packet1Xf>::size * N];
+
+  // Strided store: lane j of packet p lands at scratch[j * N + p].
+  for (int p = 0; p < N; ++p) {
+    __riscv_vsse32(scratch + p, N * sizeof(float), kernel.packet[p], unpacket_traits<Packet1Xf>::size);
+  }
+  // Unit-stride reload: packet p takes the p-th run of `size` consecutive scalars.
+  for (int p = 0; p < N; ++p) {
+    kernel.packet[p] =
+        __riscv_vle32_v_f32m1(scratch + p * unpacket_traits<Packet1Xf>::size, unpacket_traits<Packet1Xf>::size);
+  }
+}
+
+template <>  // pldexp: delegates to pldexp_generic.
+EIGEN_STRONG_INLINE Packet1Xf pldexp<Packet1Xf>(const Packet1Xf& a, const Packet1Xf& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>  // por on mask registers: a | b over the first `size` mask bits.
+EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) {
+  return __riscv_vmor_mm_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>  // pand on mask registers: a & b over the first `size` mask bits.
+EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) {
+  return __riscv_vmand_mm_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const Packet1Xf& a, const Packet1Xf& b) {  // a == b as a mask register
+  return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const Packet1Xf& a, const Packet1Xf& b) {  // a < b as a mask register
+  return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+EIGEN_STRONG_INLINE Packet1Xf pselect(const PacketMask32& mask, const Packet1Xf& a, const Packet1Xf& b) {  // mask ? a : b
+  return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits<Packet1Xf>::size);  // vmerge picks its second operand where the mask is set
+}
+
+EIGEN_STRONG_INLINE Packet1Xf pselect(const Packet1Xf& mask, const Packet1Xf& a, const Packet1Xf& b) {  // packet-valued mask
+  PacketMask32 mask2 =  // a lane selects `a` when any bit of its mask lane is non-zero
+      __riscv_vmsne_vx_i32m1_b32(__riscv_vreinterpret_v_f32m1_i32m1(mask), 0, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(b, a, mask2, unpacket_traits<Packet1Xf>::size);
+}
+
+/********************************* int64 **************************************/
+
+typedef eigen_packet_wrapper<vint64m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 9> Packet1Xl;  // Fixed-length signed 64-bit packet, LMUL = 1.
+typedef eigen_packet_wrapper<vuint64m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 10> Packet1Xul;  // Unsigned counterpart, LMUL = 1.
+
+typedef eigen_packet_wrapper<vint64m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 11> Packet2Xl;  // LMUL = 2: twice the register bits.
+typedef eigen_packet_wrapper<vuint64m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 12>
+    Packet2Xul;
+
+typedef eigen_packet_wrapper<vint64m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 13> Packet4Xl;  // LMUL = 4. NOTE(review): the integer tags (9..14) presumably just make each wrapper type distinct - confirm.
+typedef eigen_packet_wrapper<vuint64m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 14>
+    Packet4Xul;
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1  // Picks PacketXl/PacketXul and packet_traits<int64_t> for the configured default LMUL (1, 2 or 4 only).
+typedef Packet1Xl PacketXl;
+typedef Packet1Xul PacketXul;
+
+template <>
+struct packet_traits<numext::int64_t> : default_packet_traits {
+  typedef Packet1Xl type;
+  typedef Packet1Xl half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,  // Capability flags; the three LMUL variants below use the same values.
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xl PacketXl;
+typedef Packet2Xul PacketXul;
+
+template <>
+struct packet_traits<numext::int64_t> : default_packet_traits {
+  typedef Packet2Xl type;
+  typedef Packet1Xl half;  // Half packet is the LMUL = 1 type.
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xl PacketXl;
+typedef Packet4Xul PacketXul;
+
+template <>
+struct packet_traits<numext::int64_t> : default_packet_traits {
+  typedef Packet4Xl type;
+  typedef Packet2Xl half;  // Half packet is the LMUL = 2 type.
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+#endif
+
+template <>  // Reverse traits for Packet1Xl: scalar type, half packet, lane count and alignment.
+struct unpacket_traits<Packet1Xl> {
+  typedef numext::int64_t type;
+  typedef Packet1Xl half;  // Half not yet implemented
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>  // Reverse traits for Packet2Xl (LMUL = 2); its half packet is Packet1Xl.
+struct unpacket_traits<Packet2Xl> {
+  typedef numext::int64_t type;
+  typedef Packet1Xl half;
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>  // Reverse traits for Packet4Xl (LMUL = 4); its half packet is Packet2Xl.
+struct unpacket_traits<Packet4Xl> {
+  typedef numext::int64_t type;
+  typedef Packet2Xl half;
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>  // Prefetch hint for int64 data; compiles to nothing when __builtin_prefetch is unavailable.
+EIGEN_STRONG_INLINE void prefetch<numext::int64_t>(const numext::int64_t* addr) {
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  __builtin_prefetch(addr);
+#endif
+}
+
+/********************************* Packet1Xl ************************************/
+
+template <>  // Broadcasts the scalar `from` into every lane.
+EIGEN_STRONG_INLINE Packet1Xl pset1<Packet1Xl>(const numext::int64_t& from) {
+  return __riscv_vmv_v_x_i64m1(from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Returns {a, a + 1, a + 2, ...}.
+EIGEN_STRONG_INLINE Packet1Xl plset<Packet1Xl>(const numext::int64_t& a) {
+  Packet1Xl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size));  // Lane indices 0, 1, 2, ...
+  return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // All-zero packet; the argument only selects the overload.
+EIGEN_STRONG_INLINE Packet1Xl pzero<Packet1Xl>(const Packet1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise a + b.
+EIGEN_STRONG_INLINE Packet1Xl padd<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vadd_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise a - b (type-overloaded vsub intrinsic).
+EIGEN_STRONG_INLINE Packet1Xl psub<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise -a.
+EIGEN_STRONG_INLINE Packet1Xl pnegate(const Packet1Xl& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Conjugate of a real integer packet: returns `a` unchanged.
+EIGEN_STRONG_INLINE Packet1Xl pconj(const Packet1Xl& a) {
+  return a;
+}
+
+template <>  // Lane-wise a * b.
+EIGEN_STRONG_INLINE Packet1Xl pmul<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise signed division a / b using vdiv.
+EIGEN_STRONG_INLINE Packet1Xl pdiv<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise a * b + c.
+EIGEN_STRONG_INLINE Packet1Xl pmadd(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise a * b - c, computed as a multiply-add with the negated addend.
+EIGEN_STRONG_INLINE Packet1Xl pmsub(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise c - a * b.
+EIGEN_STRONG_INLINE Packet1Xl pnmadd(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise -(a * b) - c, computed as vnmsub with the negated addend.
+EIGEN_STRONG_INLINE Packet1Xl pnmsub(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise signed minimum.
+EIGEN_STRONG_INLINE Packet1Xl pmin<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise signed maximum.
+EIGEN_STRONG_INLINE Packet1Xl pmax<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise a <= b: all-ones where true, zero where false.
+EIGEN_STRONG_INLINE Packet1Xl pcmp_le<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);  // Start from zeros, write all-ones on set lanes.
+}
+
+template <>  // Lane-wise a < b: all-ones where true, zero where false.
+EIGEN_STRONG_INLINE Packet1Xl pcmp_lt<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);  // Start from zeros, write all-ones on set lanes.
+}
+
+template <>  // Lane-wise a == b: all-ones where true, zero where false.
+EIGEN_STRONG_INLINE Packet1Xl pcmp_eq<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);  // Start from zeros, write all-ones on set lanes.
+}
+
+template <>  // Packet with every bit set; the argument only selects the overload.
+EIGEN_STRONG_INLINE Packet1Xl ptrue<Packet1Xl>(const Packet1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Bitwise a & b.
+EIGEN_STRONG_INLINE Packet1Xl pand<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Bitwise a | b.
+EIGEN_STRONG_INLINE Packet1Xl por<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vor_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Bitwise a ^ b.
+EIGEN_STRONG_INLINE Packet1Xl pxor<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vxor_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Bitwise a & ~b.
+EIGEN_STRONG_INLINE Packet1Xl pandnot<Packet1Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits<Packet1Xl>::size),
+                               unpacket_traits<Packet1Xl>::size);
+}
+
+template <int N>  // Shifts every lane right by the compile-time count N, replicating the sign bit (vsra).
+EIGEN_STRONG_INLINE Packet1Xl parithmetic_shift_right(Packet1Xl a) {
+  return __riscv_vsra_vx_i64m1(a, N, unpacket_traits<Packet1Xl>::size);
+}
+
+template <int N>  // Shifts every lane right by N with zero fill: the lanes are shifted as unsigned, then viewed as signed again.
+EIGEN_STRONG_INLINE Packet1Xl plogical_shift_right(Packet1Xl a) {
+  return __riscv_vreinterpret_i64m1(
+      __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits<Packet1Xl>::size));
+}
+
+template <int N>  // Shifts every lane left by the compile-time count N (vsll).
+EIGEN_STRONG_INLINE Packet1Xl plogical_shift_left(Packet1Xl a) {
+  return __riscv_vsll_vx_i64m1(a, N, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Loads a full packet of consecutive int64 values from `from` (aligned-load debug hook).
+EIGEN_STRONG_INLINE Packet1Xl pload<Packet1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Loads a full packet from `from`; same vle64 as pload, only the debug hook differs.
+EIGEN_STRONG_INLINE Packet1Xl ploadu<Packet1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Returns {from[0], from[0], from[1], from[1], ...}: result lane i holds from[i >> 1].
+EIGEN_STRONG_INLINE Packet1Xl ploaddup<Packet1Xl>(const numext::int64_t* from) {
+  constexpr std::size_t lanes = unpacket_traits<Packet1Xl>::size;  // Only ceil(lanes / 2) scalars are loaded below, so nothing past them is read.
+  Packet1Xul idx = __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(lanes), 1, lanes);  // idx[i] = i >> 1, always < ceil(lanes / 2).
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vrgather_vv_i64m1(__riscv_vle64_v_i64m1(from, (lanes + 1) / 2), idx, lanes);
+}
+
+template <>  // Returns {from[0] x4, from[1] x4, ...}: result lane i holds from[i >> 2].
+EIGEN_STRONG_INLINE Packet1Xl ploadquad<Packet1Xl>(const numext::int64_t* from) {
+  constexpr std::size_t lanes = unpacket_traits<Packet1Xl>::size;  // Only ceil(lanes / 4) scalars are loaded below, so nothing past them is read.
+  Packet1Xul idx = __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(lanes), 2, lanes);  // idx[i] = i >> 2, always < ceil(lanes / 4).
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vrgather_vv_i64m1(__riscv_vle64_v_i64m1(from, (lanes + 3) / 4), idx, lanes);
+}
+
+template <>  // Stores every lane of `from` to consecutive int64 slots at `to` (aligned-store debug hook).
+EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const Packet1Xl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Stores every lane of `from` at `to`; same vse64 as pstore, only the debug hook differs.
+EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const Packet1Xl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Strided load: lane i is read from from[i * stride]; `stride` counts elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline Packet1Xl pgather<numext::int64_t, Packet1Xl>(const numext::int64_t* from, Index stride) {
+  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Strided store: lane i is written to to[i * stride]; `stride` counts elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, Packet1Xl>(numext::int64_t* to, const Packet1Xl& from,
+                                                                   Index stride) {
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Returns lane 0 of `a` as a scalar.
+EIGEN_STRONG_INLINE numext::int64_t pfirst<Packet1Xl>(const Packet1Xl& a) {
+  return __riscv_vmv_x_s_i64m1_i64(a);
+}
+
+template <>  // Reverses the lane order of `a`.
+EIGEN_STRONG_INLINE Packet1Xl preverse(const Packet1Xl& a) {
+  Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size),  // idx[i] = (size - 1) - i
+                                          unpacket_traits<Packet1Xl>::size - 1, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Lane-wise absolute value without a branch: (a ^ mask) - mask.
+EIGEN_STRONG_INLINE Packet1Xl pabs(const Packet1Xl& a) {
+  Packet1Xl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits<Packet1Xl>::size);  // All-ones for negative lanes, zero otherwise.
+  return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits<Packet1Xl>::size), mask,
+                               unpacket_traits<Packet1Xl>::size);
+}
+
+template <>  // Sum of all lanes of `a`, reduced onto a zero seed and returned as a scalar.
+EIGEN_STRONG_INLINE numext::int64_t predux<Packet1Xl>(const Packet1Xl& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet1Xl>::size),
+                                                      unpacket_traits<Packet1Xl>::size));
+}
+
+template <>  // Product of all lanes of `a`, returned as a scalar.
+EIGEN_STRONG_INLINE numext::int64_t predux_mul<Packet1Xl>(const Packet1Xl& a) {
+  // Multiply the vector by its reverse
+  Packet1Xl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits<Packet1Xl>::size);  // prod[i] = a[size - 1 - i] * a[i]
+  Packet1Xl half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {  // Fold lanes 4.. onto lanes 0..; NOTE(review): assumes VL / 64 lanes per packet - confirm rvv_packet_size_selector.
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {  // Fold lanes 2.. onto lanes 0..
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {  // Fold lane 1 onto lane 0.
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
+  }
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>  // Minimum over all lanes of `a`; the reduction is seeded with INT64 max so the seed never wins.
+EIGEN_STRONG_INLINE numext::int64_t predux_min<Packet1Xl>(const Packet1Xl& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1(
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<Packet1Xl>::size),
+      unpacket_traits<Packet1Xl>::size));
+}
+
+template <>  // Maximum over all lanes of `a`; the reduction is seeded with INT64 min so the seed never wins.
+EIGEN_STRONG_INLINE numext::int64_t predux_max<Packet1Xl>(const Packet1Xl& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1(
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<Packet1Xl>::size),
+      unpacket_traits<Packet1Xl>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xl, N>& kernel) {
+  constexpr int kLanes = unpacket_traits<Packet1Xl>::size;
+  numext::int64_t scratch[kLanes * N] = {0};  // Zero-initialised staging area for the rearrangement.
+  for (int row = 0; row < N; ++row) {
+    // Lane j of packet `row` is written to scratch[j * N + row].
+    __riscv_vsse64(&scratch[row], N * sizeof(numext::int64_t), kernel.packet[row], kLanes);
+  }
+  for (int row = 0; row < N; ++row) {
+    // Packet `row` is reloaded from the row-th run of kLanes consecutive scratch elements.
+    kernel.packet[row] = __riscv_vle64_v_i64m1(&scratch[row * kLanes], kLanes);
+  }
+}
+
+/********************************* double ************************************/
+
+typedef eigen_packet_wrapper<vfloat64m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 15> Packet1Xd;  // Fixed-length double packet, LMUL = 1.
+typedef eigen_packet_wrapper<vfloat64m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 16>
+    Packet2Xd;  // LMUL = 2: twice the register bits.
+typedef eigen_packet_wrapper<vfloat64m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 17>
+    Packet4Xd;  // LMUL = 4. NOTE(review): the integer tags (15..17) presumably just make each wrapper type distinct - confirm.
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1  // Picks PacketXd and packet_traits<double> for the configured default LMUL (1, 2 or 4 only).
+typedef Packet1Xd PacketXd;
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet1Xd type;
+  typedef Packet1Xd half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,  // Capability flags; the three LMUL variants below use the same values.
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xd PacketXd;
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2Xd type;
+  typedef Packet1Xd half;  // Half packet is the LMUL = 1 type.
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xd PacketXd;
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet4Xd type;
+  typedef Packet2Xd half;  // Half packet is the LMUL = 2 type.
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1
+  };
+};
+#endif
+
+template <>  // Reverse traits for Packet1Xd: scalar type, matching integer packet, mask type, lane count and alignment.
+struct unpacket_traits<Packet1Xd> {
+  typedef double type;
+  typedef Packet1Xd half;  // Half not yet implemented
+  typedef Packet1Xl integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask64 packet_mask;
+
+  enum {
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>  // Reverse traits for Packet2Xd (LMUL = 2); half packet is Packet1Xd, integer packet is Packet2Xl.
+struct unpacket_traits<Packet2Xd> {
+  typedef double type;
+  typedef Packet1Xd half;
+  typedef Packet2Xl integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask32 packet_mask;  // 64-bit elements at LMUL = 2 use the b32 mask type (SEW / LMUL = 32).
+
+  enum {
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>  // Reverse traits for Packet4Xd (LMUL = 4); half packet is Packet2Xd, integer packet is Packet4Xl.
+struct unpacket_traits<Packet4Xd> {
+  typedef double type;
+  typedef Packet2Xd half;
+  typedef Packet4Xl integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask16 packet_mask;  // 64-bit elements at LMUL = 4 use the b16 mask type (SEW / LMUL = 16).
+
+  enum {
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+/********************************* Packet1Xd ************************************/
+
+template <>  // Double packet whose lanes have every bit set (built as uint64 all-ones, then reinterpreted).
+EIGEN_STRONG_INLINE Packet1Xd ptrue<Packet1Xd>(const Packet1Xd& /*a*/) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // Packet of 0.0 in every lane; the argument only selects the overload.
+EIGEN_STRONG_INLINE Packet1Xd pzero<Packet1Xd>(const Packet1Xd& /*a*/) {
+  return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise |a|.
+EIGEN_STRONG_INLINE Packet1Xd pabs(const Packet1Xd& a) {
+  return __riscv_vfabs_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise |a - b|.
+EIGEN_STRONG_INLINE Packet1Xd pabsdiff(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfabs_v_f64m1(__riscv_vfsub_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size),
+                               unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Broadcasts the scalar `from` into every lane.
+EIGEN_STRONG_INLINE Packet1Xd pset1<Packet1Xd>(const double& from) {
+  return __riscv_vfmv_v_f_f64m1(from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Broadcasts the 64-bit pattern `from` into every lane and reinterprets the lanes as doubles.
+EIGEN_STRONG_INLINE Packet1Xd pset1frombits<Packet1Xd>(numext::uint64_t from) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // Returns {a, a + 1, a + 2, ...}.
+EIGEN_STRONG_INLINE Packet1Xd plset<Packet1Xd>(const double& a) {
+  Packet1Xd idx = __riscv_vfcvt_f_x_v_f64m1(  // Lane indices 0, 1, 2, ... converted from int64 to double.
+      __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size)),
+      unpacket_traits<Packet1Xd>::size);
+  return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Reads a[0..3] and fills a0..a3 with the broadcast of a[0], a[1], a[2], a[3] respectively.
+EIGEN_STRONG_INLINE void pbroadcast4<Packet1Xd>(const double* a, Packet1Xd& a0, Packet1Xd& a1, Packet1Xd& a2,
+                                                Packet1Xd& a3) {
+  if (EIGEN_RISCV64_RVV_VL >= 256) {  // One 4-element load, then splat each element.
+    vfloat64m1_t aa = __riscv_vle64_v_f64m1(a, 4);
+    a0 = __riscv_vrgather_vx_f64m1(aa, 0, unpacket_traits<Packet1Xd>::size);
+    a1 = __riscv_vrgather_vx_f64m1(aa, 1, unpacket_traits<Packet1Xd>::size);
+    a2 = __riscv_vrgather_vx_f64m1(aa, 2, unpacket_traits<Packet1Xd>::size);
+    a3 = __riscv_vrgather_vx_f64m1(aa, 3, unpacket_traits<Packet1Xd>::size);
+  } else {  // Two 2-element loads; NOTE(review): presumably because a narrower register holds only 2 doubles - confirm.
+    vfloat64m1_t aa0 = __riscv_vle64_v_f64m1(a + 0, 2);
+    vfloat64m1_t aa1 = __riscv_vle64_v_f64m1(a + 2, 2);
+    a0 = __riscv_vrgather_vx_f64m1(aa0, 0, unpacket_traits<Packet1Xd>::size);
+    a1 = __riscv_vrgather_vx_f64m1(aa0, 1, unpacket_traits<Packet1Xd>::size);
+    a2 = __riscv_vrgather_vx_f64m1(aa1, 0, unpacket_traits<Packet1Xd>::size);
+    a3 = __riscv_vrgather_vx_f64m1(aa1, 1, unpacket_traits<Packet1Xd>::size);
+  }
+}
+
+template <>  // Lane-wise a + b.
+EIGEN_STRONG_INLINE Packet1Xd padd<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise a - b.
+EIGEN_STRONG_INLINE Packet1Xd psub<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise -a.
+EIGEN_STRONG_INLINE Packet1Xd pnegate(const Packet1Xd& a) {
+  return __riscv_vfneg_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Per lane: all bits set when the sign bit of `a` is set, all bits clear otherwise.
+EIGEN_STRONG_INLINE Packet1Xd psignbit(const Packet1Xd& a) {
+  return __riscv_vreinterpret_v_i64m1_f64m1(  // Arithmetic shift by 63 smears the sign bit across the whole lane.
+      __riscv_vsra_vx_i64m1(__riscv_vreinterpret_v_f64m1_i64m1(a), 63, unpacket_traits<Packet1Xl>::size));
+}
+
+template <>  // Conjugate of a real packet: returns `a` unchanged.
+EIGEN_STRONG_INLINE Packet1Xd pconj(const Packet1Xd& a) {
+  return a;
+}
+
+template <>  // Lane-wise a * b.
+EIGEN_STRONG_INLINE Packet1Xd pmul<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise a / b.
+EIGEN_STRONG_INLINE Packet1Xd pdiv<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Fused lane-wise a * b + c.
+EIGEN_STRONG_INLINE Packet1Xd pmadd(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Fused lane-wise a * b - c.
+EIGEN_STRONG_INLINE Packet1Xd pmsub(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Fused lane-wise c - a * b.
+EIGEN_STRONG_INLINE Packet1Xd pnmadd(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Fused lane-wise -(a * b) - c.
+EIGEN_STRONG_INLINE Packet1Xd pnmsub(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise minimum that yields NaN whenever either input lane is NaN.
+EIGEN_STRONG_INLINE Packet1Xd pmin<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  Packet1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);  // a == a: lane of a is not NaN.
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<Packet1Xd>::size);  // b == b: lane of b is not NaN.
+  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<Packet1Xd>::size);
+
+  return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xd>::size);  // Masked-off lanes keep the NaN fill.
+}
+
+template <>  // PropagateNaN variant: delegates to the default pmin<Packet1Xd>.
+EIGEN_STRONG_INLINE Packet1Xd pmin<PropagateNaN, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return pmin<Packet1Xd>(a, b);
+}
+
+template <>  // PropagateNumbers variant: calls vfmin directly, without any NaN masking.
+EIGEN_STRONG_INLINE Packet1Xd pmin<PropagateNumbers, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise maximum that yields NaN whenever either input lane is NaN.
+EIGEN_STRONG_INLINE Packet1Xd pmax<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  Packet1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);  // a == a: lane of a is not NaN.
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<Packet1Xd>::size);  // b == b: lane of b is not NaN.
+  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<Packet1Xd>::size);
+
+  return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xd>::size);  // Masked-off lanes keep the NaN fill.
+}
+
+template <>  // PropagateNaN variant: delegates to the default pmax<Packet1Xd>.
+EIGEN_STRONG_INLINE Packet1Xd pmax<PropagateNaN, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return pmax<Packet1Xd>(a, b);
+}
+
+template <>  // PropagateNumbers variant: calls vfmax directly, without any NaN masking.
+EIGEN_STRONG_INLINE Packet1Xd pmax<PropagateNumbers, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise a <= b: all bits set where true, 0.0 where false.
+EIGEN_STRONG_INLINE Packet1Xd pcmp_le<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero<Packet1Xd>(a), ptrue<Packet1Xd>(a), mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise a < b: all bits set where true, 0.0 where false.
+EIGEN_STRONG_INLINE Packet1Xd pcmp_lt<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero<Packet1Xd>(a), ptrue<Packet1Xd>(a), mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Lane-wise a == b: all bits set where true, 0.0 where false.
+EIGEN_STRONG_INLINE Packet1Xd pcmp_eq<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero<Packet1Xd>(a), ptrue<Packet1Xd>(a), mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // All bits set where !(a >= b), 0.0 where a >= b.
+EIGEN_STRONG_INLINE Packet1Xd pcmp_lt_or_nan<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);  // Inverse condition: a >= b.
+  return __riscv_vfmerge_vfm_f64m1(ptrue<Packet1Xd>(a), 0.0, mask, unpacket_traits<Packet1Xd>::size);  // Start from all-ones, write 0.0 where a >= b.
+}
+
+// Logical Operations are not supported for double, so reinterpret casts
+template <>  // Bitwise a & b on the raw 64-bit lane patterns.
+EIGEN_STRONG_INLINE Packet1Xd pand<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // Bitwise a | b on the raw 64-bit lane patterns.
+EIGEN_STRONG_INLINE Packet1Xd por<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // Bitwise a ^ b on the raw 64-bit lane patterns.
+EIGEN_STRONG_INLINE Packet1Xd pxor<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // Bitwise a & ~b on the raw 64-bit lane patterns.
+EIGEN_STRONG_INLINE Packet1Xd pandnot<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a),
+      __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size),
+      unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // Loads a full packet of consecutive doubles from `from` (aligned-load debug hook).
+EIGEN_STRONG_INLINE Packet1Xd pload<Packet1Xd>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Loads a full packet from `from`; same vle64 as pload, only the debug hook differs.
+EIGEN_STRONG_INLINE Packet1Xd ploadu<Packet1Xd>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Returns {from[0], from[0], from[1], from[1], ...}: result lane i holds from[i >> 1].
+EIGEN_STRONG_INLINE Packet1Xd ploaddup<Packet1Xd>(const double* from) {
+  constexpr std::size_t lanes = unpacket_traits<Packet1Xd>::size;  // Only ceil(lanes / 2) scalars are loaded below, so nothing past them is read.
+  Packet1Xul idx = __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(lanes), 1, lanes);  // idx[i] = i >> 1, always < ceil(lanes / 2).
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vrgather_vv_f64m1(__riscv_vle64_v_f64m1(from, (lanes + 1) / 2), idx, lanes);
+}
+
+template <>  // Returns {from[0] x4, from[1] x4, ...}: result lane i holds from[i >> 2].
+EIGEN_STRONG_INLINE Packet1Xd ploadquad<Packet1Xd>(const double* from) {
+  constexpr std::size_t lanes = unpacket_traits<Packet1Xd>::size;  // Only ceil(lanes / 4) scalars are loaded below, so nothing past them is read.
+  Packet1Xul idx = __riscv_vsrl_vx_u64m1(__riscv_vid_v_u64m1(lanes), 2, lanes);  // idx[i] = i >> 2, always < ceil(lanes / 4).
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vrgather_vv_f64m1(__riscv_vle64_v_f64m1(from, (lanes + 3) / 4), idx, lanes);
+}
+
+template <>  // Stores every lane of `from` to consecutive doubles at `to` (aligned-store debug hook).
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet1Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Stores every lane of `from` at `to`; same vse64 as pstore, only the debug hook differs.
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet1Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Strided load: lane i is read from from[i * stride]; `stride` counts elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline Packet1Xd pgather<double, Packet1Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Strided store: lane i is written to to[i * stride]; `stride` counts elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet1Xd>(double* to, const Packet1Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Returns lane 0 of `a` as a scalar.
+EIGEN_STRONG_INLINE double pfirst<Packet1Xd>(const Packet1Xd& a) {
+  return __riscv_vfmv_f_s_f64m1_f64(a);
+}
+
+template <>  // Lane-wise square root.
+EIGEN_STRONG_INLINE Packet1Xd psqrt(const Packet1Xd& a) {
+  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Rounds each lane to an integral value through a double -> int64 -> double conversion round trip.
+EIGEN_STRONG_INLINE Packet1Xd print<Packet1Xd>(const Packet1Xd& a) {
+  const Packet1Xd limit = pset1<Packet1Xd>(static_cast<double>(1ull << 52));  // 2^52: only smaller magnitudes take the converted value.
+  const Packet1Xd abs_a = pabs(a);
+
+  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);  // a != a: NaN lanes.
+  const Packet1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits<Packet1Xd>::size);  // NaN lanes become a + a; others stay a.
+  const Packet1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits<Packet1Xd>::size),
+                                                    unpacket_traits<Packet1Xd>::size);
+
+  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits<Packet1Xd>::size);  // |a| < 2^52 (false for NaN lanes).
+  Packet1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits<Packet1Xd>::size);  // Magnitude of new_x with the sign of x.
+  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits<Packet1Xd>::size);  // Rounded value where |a| < 2^52, otherwise x.
+}
+
+template <>  // Lane-wise floor: round with print, then subtract 1 on lanes where the rounded value exceeds the input.
+EIGEN_STRONG_INLINE Packet1Xd pfloor<Packet1Xd>(const Packet1Xd& a) {
+  Packet1Xd tmp = print<Packet1Xd>(a);
+  // If greater, subtract one.
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits<Packet1Xd>::size);  // Masked-off lanes keep tmp.
+}
+
+template <>  // Reverses the lane order of `a`.
+EIGEN_STRONG_INLINE Packet1Xd preverse(const Packet1Xd& a) {
+  Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xd>::size),  // idx[i] = (size - 1) - i
+                                          unpacket_traits<Packet1Xd>::size - 1, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // Packet1Xd specialization of pfrexp: forwards both arguments to pfrexp_generic.
+EIGEN_STRONG_INLINE Packet1Xd pfrexp<Packet1Xd>(const Packet1Xd& a, Packet1Xd& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>  // predux: sum of all lanes via vfredusum, seeded with 0.0.
+EIGEN_STRONG_INLINE double predux<Packet1Xd>(const Packet1Xd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet1Xd>::size), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // predux_mul: product of all lanes by reverse-multiply plus slide-down folding.
+EIGEN_STRONG_INLINE double predux_mul<Packet1Xd>(const Packet1Xd& a) {
+  // Multiply the vector by its reverse
+  Packet1Xd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits<Packet1Xd>::size);
+  Packet1Xd half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {  // fold lanes 4 apart
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits<Packet1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<Packet1Xd>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {  // fold lanes 2 apart
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits<Packet1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<Packet1Xd>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {  // fold adjacent lanes
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits<Packet1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<Packet1Xd>::size);
+  }
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>  // predux_min: vfredmin seeded with quiet NaN, then std::min against double max.
+EIGEN_STRONG_INLINE double predux_min<Packet1Xd>(const Packet1Xd& a) {
+  return (std::min)(
+      __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
+          a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size),
+          unpacket_traits<Packet1Xd>::size)),
+      (std::numeric_limits<double>::max)());  // NOTE(review): caps the result at DBL_MAX - confirm intent
+}
+
+template <>  // predux_max: vfredmax seeded with quiet NaN, then std::max against -double max.
+EIGEN_STRONG_INLINE double predux_max<Packet1Xd>(const Packet1Xd& a) {
+  return (std::max)(
+      __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
+          a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size),
+          unpacket_traits<Packet1Xd>::size)),
+      -(std::numeric_limits<double>::max)());  // NOTE(review): floors the result at -DBL_MAX - confirm intent
+}
+
+template <int N>  // ptranspose: transposes the N packets of `kernel` in place through a scratch buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xd, N>& kernel) {
+  double buffer[unpacket_traits<Packet1Xd>::size * N];  // fully overwritten by the N strided stores below
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // packet i goes to buffer[i], buffer[i + N], ... (stride of N doubles)
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<Packet1Xd>::size);
+  }
+
+  for (i = 0; i < N; i++) {  // reload consecutive chunks of `size` doubles
+    kernel.packet[i] =
+        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits<Packet1Xd>::size], unpacket_traits<Packet1Xd>::size);
+  }
+}
+
+template <>  // pldexp: forwards to the generic implementation.
+EIGEN_STRONG_INLINE Packet1Xd pldexp<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>  // por on mask registers: a | b via vmor.mm.
+EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>  // pandnot on mask registers: a & ~b, the same contract as the integer pandnot overloads.
+EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmandn_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);  // vmandn.mm: vs2 & ~vs1
+}
+
+template <>  // pand on mask registers: a & b via vmand.mm.
+EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmand_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);  // mask of lanes where a == b
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);  // mask of lanes where a < b
+}
+
+EIGEN_STRONG_INLINE Packet1Xd pselect(const PacketMask64& mask, const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits<Packet1Xd>::size);  // mask ? a : b per lane
+}
+
+EIGEN_STRONG_INLINE Packet1Xd pselect(const Packet1Xd& mask, const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask2 =  // a lane selects `a` when any bit of its 64-bit pattern in `mask` is set
+      __riscv_vmsne_vx_i64m1_b64(__riscv_vreinterpret_v_f64m1_i64m1(mask), 0, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(b, a, mask2, unpacket_traits<Packet1Xd>::size);  // mask2 ? a : b per lane
+}
+
+/********************************* short **************************************/
+
+typedef eigen_packet_wrapper<vint16m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 18> Packet1Xs;
+typedef eigen_packet_wrapper<vuint16m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 19> Packet1Xsu;
+
+typedef eigen_packet_wrapper<vint16m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 20> Packet2Xs;
+typedef eigen_packet_wrapper<vuint16m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 21>
+    Packet2Xsu;  // m2 variants: fixed width of 2 * EIGEN_RISCV64_RVV_VL bits
+
+typedef eigen_packet_wrapper<vint16m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 22> Packet4Xs;
+typedef eigen_packet_wrapper<vuint16m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 23>
+    Packet4Xsu;  // m4 variants; each wrapper above carries a distinct integer tag (18..23)
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xs PacketXs;  // default int16 packet when EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xsu PacketXsu;
+
+template <>  // int16 packet traits for the LMUL = 1 default.
+struct packet_traits<numext::int16_t> : default_packet_traits {
+  typedef Packet1Xs type;
+  typedef Packet1Xs half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,  // enumerators here shadow the same-named ones inherited from default_packet_traits
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xs PacketXs;  // default int16 packet when EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xsu PacketXsu;
+
+template <>  // int16 packet traits for the LMUL = 2 default.
+struct packet_traits<numext::int16_t> : default_packet_traits {
+  typedef Packet2Xs type;
+  typedef Packet1Xs half;  // half packet is the LMUL = 1 type
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,  // enumerators here shadow the same-named ones inherited from default_packet_traits
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xs PacketXs;  // default int16 packet when EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xsu PacketXsu;
+
+template <>  // int16 packet traits for the LMUL = 4 default.
+struct packet_traits<numext::int16_t> : default_packet_traits {
+  typedef Packet4Xs type;
+  typedef Packet2Xs half;  // half packet is the LMUL = 2 type
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,  // enumerators here shadow the same-named ones inherited from default_packet_traits
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+#endif
+
+template <>  // Reverse traits for the LMUL = 1 int16 packet.
+struct unpacket_traits<Packet1Xs> {
+  typedef numext::int16_t type;
+  typedef Packet1Xs half;  // Half not yet implemented
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,  // no masked load/store is advertised for this packet
+    masked_store_available = false
+  };
+};
+
+template <>  // Reverse traits for the LMUL = 2 int16 packet.
+struct unpacket_traits<Packet2Xs> {
+  typedef numext::int16_t type;
+  typedef Packet1Xs half;  // half packet is the LMUL = 1 type
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,  // no masked load/store is advertised for this packet
+    masked_store_available = false
+  };
+};
+
+template <>  // Reverse traits for the LMUL = 4 int16 packet.
+struct unpacket_traits<Packet4Xs> {
+  typedef numext::int16_t type;
+  typedef Packet2Xs half;  // half packet is the LMUL = 2 type
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int16_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,  // no masked load/store is advertised for this packet
+    masked_store_available = false
+  };
+};
+
+template <>  // prefetch: calls __builtin_prefetch when the guard below holds; otherwise the body is empty.
+EIGEN_STRONG_INLINE void prefetch<numext::int16_t>(const numext::int16_t* addr) {
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  __builtin_prefetch(addr);
+#endif
+}
+
+/********************************* Packet1Xs ************************************/
+
+template <>  // pset1: broadcasts `from` to every lane.
+EIGEN_STRONG_INLINE Packet1Xs pset1<Packet1Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m1(from, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // plset: lane index (vid) plus `a`, i.e. {a, a + 1, a + 2, ...}.
+EIGEN_STRONG_INLINE Packet1Xs plset<Packet1Xs>(const numext::int16_t& a) {
+  Packet1Xs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size));
+  return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pzero: all lanes 0; the argument only selects the overload.
+EIGEN_STRONG_INLINE Packet1Xs pzero<Packet1Xs>(const Packet1Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // padd: lane-wise a + b.
+EIGEN_STRONG_INLINE Packet1Xs padd<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vadd_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // psub: lane-wise a - b (overloaded vsub intrinsic).
+EIGEN_STRONG_INLINE Packet1Xs psub<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pnegate: lane-wise -a.
+EIGEN_STRONG_INLINE Packet1Xs pnegate(const Packet1Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pconj: returns the input unchanged.
+EIGEN_STRONG_INLINE Packet1Xs pconj(const Packet1Xs& a) {
+  return a;
+}
+
+template <>  // pmul: lane-wise a * b (overloaded vmul intrinsic).
+EIGEN_STRONG_INLINE Packet1Xs pmul<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pdiv: lane-wise a / b (overloaded vdiv intrinsic on signed lanes).
+EIGEN_STRONG_INLINE Packet1Xs pdiv<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pmadd: a * b + c via vmadd.
+EIGEN_STRONG_INLINE Packet1Xs pmadd(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pmsub: a * b - c, expressed as vmadd with the addend negated.
+EIGEN_STRONG_INLINE Packet1Xs pmsub(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pnmadd: c - a * b via vnmsub.
+EIGEN_STRONG_INLINE Packet1Xs pnmadd(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pnmsub: -(a * b) - c, expressed as vnmsub with the addend negated.
+EIGEN_STRONG_INLINE Packet1Xs pnmsub(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pmin: lane-wise signed minimum.
+EIGEN_STRONG_INLINE Packet1Xs pmin<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pmax: lane-wise signed maximum.
+EIGEN_STRONG_INLINE Packet1Xs pmax<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pcmp_le: lanes with a <= b become 0xffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet1Xs pcmp_le<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pcmp_lt: lanes with a < b become 0xffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet1Xs pcmp_lt<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pcmp_eq: lanes with a == b become 0xffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet1Xs pcmp_eq<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // ptrue: broadcasts the 16-bit pattern 0xffff to every lane.
+EIGEN_STRONG_INLINE Packet1Xs ptrue<Packet1Xs>(const Packet1Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m1(static_cast<unsigned short>(0xffffu), unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pand: bitwise a & b.
+EIGEN_STRONG_INLINE Packet1Xs pand<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vand_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // por: bitwise a | b.
+EIGEN_STRONG_INLINE Packet1Xs por<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vor_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pxor: bitwise a ^ b.
+EIGEN_STRONG_INLINE Packet1Xs pxor<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vxor_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pandnot: bitwise a & ~b (vnot on b, then vand).
+EIGEN_STRONG_INLINE Packet1Xs pandnot<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits<Packet1Xs>::size),
+                               unpacket_traits<Packet1Xs>::size);
+}
+
+template <int N>  // Arithmetic right shift (vsra) of each signed lane by N bits.
+EIGEN_STRONG_INLINE Packet1Xs parithmetic_shift_right(Packet1Xs a) {
+  return __riscv_vsra_vx_i16m1(a, N, unpacket_traits<Packet1Xs>::size);
+}
+
+template <int N>  // Logical right shift by N: vsrl on the unsigned reinterpretation of `a`.
+EIGEN_STRONG_INLINE Packet1Xs plogical_shift_right(Packet1Xs a) {
+  return __riscv_vreinterpret_i16m1(
+      __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits<Packet1Xs>::size));
+}
+
+template <int N>  // Left shift (vsll) of each lane by N bits.
+EIGEN_STRONG_INLINE Packet1Xs plogical_shift_left(Packet1Xs a) {
+  return __riscv_vsll_vx_i16m1(a, N, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pload: unit-stride load of a full packet (vle16); same intrinsic as ploadu.
+EIGEN_STRONG_INLINE Packet1Xs pload<Packet1Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // ploadu: unit-stride load of a full packet (vle16); same intrinsic as pload.
+EIGEN_STRONG_INLINE Packet1Xs ploadu<Packet1Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // ploaddup: each loaded element appears twice in a row in the result.
+EIGEN_STRONG_INLINE Packet1Xs ploaddup<Packet1Xs>(const numext::int16_t* from) {
+  Packet1Xsu data = __riscv_vreinterpret_v_i16m1_u16m1(pload<Packet1Xs>(from));  // NOTE: loads a full packet
+  return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vlmul_trunc_v_u32m2_u32m1(
+      __riscv_vwmaccu_vx_u32m2(__riscv_vwaddu_vv_u32m2(data, data, unpacket_traits<Packet1Xs>::size), 0xffffu, data,
+                               unpacket_traits<Packet1Xs>::size))));  // 2*d + 0xffff*d = 0x10001*d per 32-bit lane
+}
+
+template <>  // ploadquad: each loaded element appears four times in a row (gather index = lane >> 2).
+EIGEN_STRONG_INLINE Packet1Xs ploadquad<Packet1Xs>(const numext::int16_t* from) {
+  Packet1Xsu idx =
+      __riscv_vsrl_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size), 2, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vrgather_vv_i16m1(pload<Packet1Xs>(from), idx, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pstore: unit-stride store of a full packet (vse16); same intrinsic as pstoreu.
+EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const Packet1Xs& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pstoreu: unit-stride store of a full packet (vse16); same intrinsic as pstore.
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const Packet1Xs& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pgather: strided load; `stride` is in elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline Packet1Xs pgather<numext::int16_t, Packet1Xs>(const numext::int16_t* from, Index stride) {
+  return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pscatter: strided store; `stride` is in elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, Packet1Xs>(numext::int16_t* to, const Packet1Xs& from,
+                                                                   Index stride) {
+  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pfirst: scalar read of the packet through vmv.x.s (first lane).
+EIGEN_STRONG_INLINE numext::int16_t pfirst<Packet1Xs>(const Packet1Xs& a) {
+  return __riscv_vmv_x_s_i16m1_i16(a);
+}
+
+template <>  // preverse: gathers lanes using index (size - 1) - lane_id.
+EIGEN_STRONG_INLINE Packet1Xs preverse(const Packet1Xs& a) {
+  Packet1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size),
+                                          unpacket_traits<Packet1Xs>::size - 1, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // pabs: branch-free absolute value, (a ^ mask) - mask.
+EIGEN_STRONG_INLINE Packet1Xs pabs(const Packet1Xs& a) {
+  Packet1Xs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits<Packet1Xs>::size);  // sign bit spread over the lane
+  return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits<Packet1Xs>::size), mask,
+                               unpacket_traits<Packet1Xs>::size);
+}
+
+template <>  // predux: sum of all lanes via vredsum, seeded with 0.
+EIGEN_STRONG_INLINE numext::int16_t predux<Packet1Xs>(const Packet1Xs& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet1Xs>::size),
+                                                      unpacket_traits<Packet1Xs>::size));
+}
+
+template <>  // predux_mul: product of all lanes by reverse-multiply plus slide-down folding.
+EIGEN_STRONG_INLINE numext::int16_t predux_mul<Packet1Xs>(const Packet1Xs& a) {
+  // Multiply the vector by its reverse
+  Packet1Xs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits<Packet1Xs>::size);
+  Packet1Xs half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {  // fold lanes 16 apart
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits<Packet1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {  // fold lanes 8 apart
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits<Packet1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {  // fold lanes 4 apart
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits<Packet1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
+  }
+  // Last reduction
+  half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits<Packet1Xs>::size);
+  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
+
+  half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits<Packet1Xs>::size);
+  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>  // predux_min: minimum over all lanes via vredmin, seeded with the int16 maximum.
+EIGEN_STRONG_INLINE numext::int16_t predux_min<Packet1Xs>(const Packet1Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<Packet1Xs>::size),
+      unpacket_traits<Packet1Xs>::size));
+}
+
+template <>  // predux_max: maximum over all lanes via vredmax, seeded with the int16 minimum.
+EIGEN_STRONG_INLINE numext::int16_t predux_max<Packet1Xs>(const Packet1Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<Packet1Xs>::size),
+      unpacket_traits<Packet1Xs>::size));
+}
+
+template <int N>  // ptranspose: transposes the N packets of `kernel` in place through a scratch buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<Packet1Xs>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // packet i goes to buffer[i], buffer[i + N], ... (stride of N elements)
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<Packet1Xs>::size);
+  }
+  for (i = 0; i < N; i++) {  // reload consecutive chunks of `size` elements
+    kernel.packet[i] =
+        __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits<Packet1Xs>::size], unpacket_traits<Packet1Xs>::size);
+  }
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath2.h b/Eigen/src/Core/arch/RVV10/PacketMath2.h
new file mode 100644
index 00000000000..ccf496b9abf
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMath2.h
@@ -0,0 +1,1527 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva <kseniya.zaytseva@syntacore.com>
+// Copyright (C) 2025 Chip Kerchner <ckerchner@tenstorrent.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET2_MATH_RVV10_H
+#define EIGEN_PACKET2_MATH_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/********************************* Packet2Xi ************************************/
+
+template <>  // pset1: broadcasts `from` to every lane.
+EIGEN_STRONG_INLINE Packet2Xi pset1<Packet2Xi>(const numext::int32_t& from) {
+  return __riscv_vmv_v_x_i32m2(from, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // plset: lane index (vid) plus `a`, i.e. {a, a + 1, a + 2, ...}.
+EIGEN_STRONG_INLINE Packet2Xi plset<Packet2Xi>(const numext::int32_t& a) {
+  Packet2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<Packet2Xi>::size));
+  return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pzero: all lanes 0; the argument only selects the overload.
+EIGEN_STRONG_INLINE Packet2Xi pzero<Packet2Xi>(const Packet2Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m2(0, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // padd: lane-wise a + b.
+EIGEN_STRONG_INLINE Packet2Xi padd<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vadd_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // psub: lane-wise a - b (overloaded vsub intrinsic).
+EIGEN_STRONG_INLINE Packet2Xi psub<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pnegate: lane-wise -a.
+EIGEN_STRONG_INLINE Packet2Xi pnegate(const Packet2Xi& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pconj: returns the input unchanged.
+EIGEN_STRONG_INLINE Packet2Xi pconj(const Packet2Xi& a) {
+  return a;
+}
+
+template <>  // pmul: lane-wise a * b (overloaded vmul intrinsic).
+EIGEN_STRONG_INLINE Packet2Xi pmul<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pdiv: lane-wise a / b (overloaded vdiv intrinsic on signed lanes).
+EIGEN_STRONG_INLINE Packet2Xi pdiv<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pmadd: a * b + c via vmadd.
+EIGEN_STRONG_INLINE Packet2Xi pmadd(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pmsub: a * b - c, expressed as vmadd with the addend negated.
+EIGEN_STRONG_INLINE Packet2Xi pmsub(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pnmadd: c - a * b via vnmsub.
+EIGEN_STRONG_INLINE Packet2Xi pnmadd(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pnmsub: -(a * b) - c, expressed as vnmsub with the addend negated.
+EIGEN_STRONG_INLINE Packet2Xi pnmsub(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pmin: lane-wise signed minimum.
+EIGEN_STRONG_INLINE Packet2Xi pmin<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pmax: lane-wise signed maximum.
+EIGEN_STRONG_INLINE Packet2Xi pmax<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pcmp_le: lanes with a <= b become 0xffffffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet2Xi pcmp_le<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pcmp_lt: lanes with a < b become 0xffffffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet2Xi pcmp_lt<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pcmp_eq: lanes with a == b become 0xffffffff, all other lanes 0.
+EIGEN_STRONG_INLINE Packet2Xi pcmp_eq<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // ptrue: broadcasts the 32-bit pattern 0xffffffff to every lane.
+EIGEN_STRONG_INLINE Packet2Xi ptrue<Packet2Xi>(const Packet2Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pand: bitwise a & b.
+EIGEN_STRONG_INLINE Packet2Xi pand<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vand_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // por: bitwise a | b.
+EIGEN_STRONG_INLINE Packet2Xi por<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vor_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pxor: bitwise a ^ b.
+EIGEN_STRONG_INLINE Packet2Xi pxor<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vxor_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pandnot: bitwise a & ~b (vnot on b, then vand).
+EIGEN_STRONG_INLINE Packet2Xi pandnot<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits<Packet2Xi>::size),
+                               unpacket_traits<Packet2Xi>::size);
+}
+
+template <int N>  // Arithmetic right shift (vsra) of each signed lane by N bits.
+EIGEN_STRONG_INLINE Packet2Xi parithmetic_shift_right(Packet2Xi a) {
+  return __riscv_vsra_vx_i32m2(a, N, unpacket_traits<Packet2Xi>::size);
+}
+
+template <int N>  // Logical right shift by N: vsrl on the unsigned reinterpretation of `a`.
+EIGEN_STRONG_INLINE Packet2Xi plogical_shift_right(Packet2Xi a) {
+  return __riscv_vreinterpret_i32m2(
+      __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits<Packet2Xi>::size));
+}
+
+template <int N>  // Left shift (vsll) of each lane by N bits.
+EIGEN_STRONG_INLINE Packet2Xi plogical_shift_left(Packet2Xi a) {
+  return __riscv_vsll_vx_i32m2(a, N, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pload: unit-stride load of a full packet (vle32); same intrinsic as ploadu.
+EIGEN_STRONG_INLINE Packet2Xi pload<Packet2Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // ploadu: unit-stride load of a full packet (vle32); same intrinsic as pload.
+EIGEN_STRONG_INLINE Packet2Xi ploadu<Packet2Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // ploaddup: each loaded element appears twice in a row in the result.
+EIGEN_STRONG_INLINE Packet2Xi ploaddup<Packet2Xi>(const numext::int32_t* from) {
+  Packet2Xu data = __riscv_vreinterpret_v_i32m2_u32m2(pload<Packet2Xi>(from));  // NOTE: loads a full packet
+  return __riscv_vreinterpret_v_i64m2_i32m2(__riscv_vreinterpret_v_u64m2_i64m2(__riscv_vlmul_trunc_v_u64m4_u64m2(
+      __riscv_vwmaccu_vx_u64m4(__riscv_vwaddu_vv_u64m4(data, data, unpacket_traits<Packet2Xi>::size), 0xffffffffu, data,
+                               unpacket_traits<Packet2Xi>::size))));  // 2*d + 0xffffffff*d = 0x100000001*d per lane
+}
+
+template <>  // ploadquad: each loaded element appears four times in a row (gather index = lane >> 2).
+EIGEN_STRONG_INLINE Packet2Xi ploadquad<Packet2Xi>(const numext::int32_t* from) {
+  Packet2Xu idx =
+      __riscv_vsrl_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits<Packet2Xi>::size), 2, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vrgather_vv_i32m2(pload<Packet2Xi>(from), idx, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pstore: unit-stride store of a full packet (vse32); same intrinsic as pstoreu.
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const Packet2Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pstoreu: unit-stride store of a full packet (vse32); same intrinsic as pstore.
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const Packet2Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pgather: strided load; `stride` is in elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline Packet2Xi pgather<numext::int32_t, Packet2Xi>(const numext::int32_t* from, Index stride) {
+  return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pscatter: strided store; `stride` is in elements and is scaled to bytes here.
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, Packet2Xi>(numext::int32_t* to, const Packet2Xi& from,
+                                                                   Index stride) {
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pfirst: scalar read of the packet through vmv.x.s (first lane).
+EIGEN_STRONG_INLINE numext::int32_t pfirst<Packet2Xi>(const Packet2Xi& a) {
+  return __riscv_vmv_x_s_i32m2_i32(a);
+}
+
+template <>  // preverse: gathers lanes using index (size - 1) - lane_id.
+EIGEN_STRONG_INLINE Packet2Xi preverse(const Packet2Xi& a) {
+  Packet2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits<Packet2Xi>::size),
+                                         unpacket_traits<Packet2Xi>::size - 1, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // pabs: branch-free absolute value, (a ^ mask) - mask.
+EIGEN_STRONG_INLINE Packet2Xi pabs(const Packet2Xi& a) {
+  Packet2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits<Packet2Xi>::size);  // sign bit spread over the lane
+  return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits<Packet2Xi>::size), mask,
+                               unpacket_traits<Packet2Xi>::size);
+}
+
+template <>  // predux: sum of all lanes; the m2 source is reduced into an m1 accumulator seeded with 0.
+EIGEN_STRONG_INLINE numext::int32_t predux<Packet2Xi>(const Packet2Xi& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet2Xi>::size / 2),
+                                                      unpacket_traits<Packet2Xi>::size));
+}
+
+template <>  // predux_mul: multiplies the two m1 halves lane-wise, then reuses the Packet1Xi reduction.
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<Packet2Xi>(const Packet2Xi& a) {
+  return predux_mul<Packet1Xi>(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
+                                                     unpacket_traits<Packet1Xi>::size));
+}
+
+template <>  // predux_min: minimum over all lanes via vredmin, seeded with the int32 maximum.
+EIGEN_STRONG_INLINE numext::int32_t predux_min<Packet2Xi>(const Packet2Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<Packet2Xi>::size / 2),
+      unpacket_traits<Packet2Xi>::size));
+}
+
+template <>  // predux_max: maximum over all lanes via vredmax, seeded with the int32 minimum.
+EIGEN_STRONG_INLINE numext::int32_t predux_max<Packet2Xi>(const Packet2Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<Packet2Xi>::size / 2),
+      unpacket_traits<Packet2Xi>::size));
+}
+
+template <int N>  // ptranspose: transposes the N packets of `kernel` in place through a scratch buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<Packet2Xi>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // packet i goes to buffer[i], buffer[i + N], ... (stride of N elements)
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<Packet2Xi>::size);
+  }
+  for (i = 0; i < N; i++) {  // reload consecutive chunks of `size` elements
+    kernel.packet[i] =
+        __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits<Packet2Xi>::size], unpacket_traits<Packet2Xi>::size);
+  }
+}
+
+template <typename Packet = Packet4Xi>  // predux_half: lane-wise sum of the two m2 halves of an m4 packet.
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet4Xi>::value && (unpacket_traits<Packet4Xi>::size % 8) == 0, Packet2Xi>
+    predux_half(const Packet4Xi& a) {  // overload enabled only when the m4 lane count is a multiple of 8
+  return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1),
+                               unpacket_traits<Packet2Xi>::size);
+}
+
+template <typename Packet = Packet2Xi>  // predux_half: lane-wise sum of the two m1 halves of an m2 packet.
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet2Xi>::value && (unpacket_traits<Packet2Xi>::size % 8) == 0, Packet1Xi>
+    predux_half(const Packet2Xi& a) {  // overload enabled only when the m2 lane count is a multiple of 8
+  return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
+                               unpacket_traits<Packet1Xi>::size);
+}
+
+/********************************* Packet2Xf ************************************/
+
+template <>  // ptrue: every lane holds the bit pattern 0xffffffff, reinterpreted as float.
+EIGEN_STRONG_INLINE Packet2Xf ptrue<Packet2Xf>(const Packet2Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits<Packet2Xf>::size));
+}
+
+template <>  // pzero: all lanes 0.0f; the argument only selects the overload.
+EIGEN_STRONG_INLINE Packet2Xf pzero<Packet2Xf>(const Packet2Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits<Packet2Xf>::size);
+}
+
+template <>  // pabs: lane-wise |a| via vfabs.
+EIGEN_STRONG_INLINE Packet2Xf pabs(const Packet2Xf& a) {
+  return __riscv_vfabs_v_f32m2(a, unpacket_traits<Packet2Xf>::size);
+}
+
+template <>  // pabsdiff: lane-wise |a - b|.
+EIGEN_STRONG_INLINE Packet2Xf pabsdiff(const Packet2Xf& a, const Packet2Xf& b) {
+  return __riscv_vfabs_v_f32m2(__riscv_vfsub_vv_f32m2(a, b, unpacket_traits<Packet2Xf>::size),
+                               unpacket_traits<Packet2Xf>::size);
+}
+
+// Broadcasts the scalar `from` into every lane.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pset1<Packet2Xf>(const float& from) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfmv_v_f_f32m2(from, vl);
+}
+
+// Broadcasts the raw 32-bit pattern `from` into every lane and reinterprets the lanes as float.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pset1frombits<Packet2Xf>(numext::uint32_t from) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xu bits = __riscv_vmv_v_x_u32m2(from, vl);
+  return __riscv_vreinterpret_f32m2(bits);
+}
+
+// Returns {a + 0, a + 1, ..., a + (size - 1)}: lane indices produced by vid are converted to float
+// and offset by `a`.
+// vid is issued with this packet's own lane count. Requesting unpacket_traits<Packet4Xi>::size lanes
+// for an m2 vector asks for more elements than the packet holds, so all three intrinsics now share
+// one vl.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf plset<Packet2Xf>(const float& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xi lane_ids = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(vl));
+  const Packet2Xf idx = __riscv_vfcvt_f_x_v_f32m2(lane_ids, vl);
+  return __riscv_vfadd_vf_f32m2(idx, a, vl);
+}
+
+// Broadcasts a[0..3] into a0..a3: the four scalars are loaded once (vl = 4) and each is splat across
+// all lanes with vrgather.vx.
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2Xf>(const float* a, Packet2Xf& a0, Packet2Xf& a1, Packet2Xf& a2,
+                                                Packet2Xf& a3) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const vfloat32m2_t quad = __riscv_vle32_v_f32m2(a, 4);
+  a0 = __riscv_vrgather_vx_f32m2(quad, 0, vl);
+  a1 = __riscv_vrgather_vx_f32m2(quad, 1, vl);
+  a2 = __riscv_vrgather_vx_f32m2(quad, 2, vl);
+  a3 = __riscv_vrgather_vx_f32m2(quad, 3, vl);
+}
+
+// Lane-wise a + b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf padd<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfadd_vv_f32m2(a, b, vl);
+}
+
+// Lane-wise a - b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf psub<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfsub_vv_f32m2(a, b, vl);
+}
+
+// Lane-wise negation (vfneg).
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pnegate(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfneg_v_f32m2(a, vl);
+}
+
+// Smears the sign bit over each lane: the raw bits are viewed as i32, arithmetically shifted right
+// by 31, and viewed as float again.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf psignbit(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xi>::size;
+  const Packet2Xi raw = __riscv_vreinterpret_v_f32m2_i32m2(a);
+  const Packet2Xi smeared = __riscv_vsra_vx_i32m2(raw, 31, vl);
+  return __riscv_vreinterpret_v_i32m2_f32m2(smeared);
+}
+
+// Conjugate of a float packet: returns the input unchanged.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pconj(const Packet2Xf& a) {
+  return a;
+}
+
+// Lane-wise a * b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmul<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfmul_vv_f32m2(a, b, vl);
+}
+
+// Lane-wise a / b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pdiv<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfdiv_vv_f32m2(a, b, vl);
+}
+
+// Fused multiply-add through vfmadd.vv with operands (a, b, c); per the RVV spec this is a * b + c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmadd(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfmadd_vv_f32m2(a, b, c, vl);
+}
+
+// Fused multiply-subtract through vfmsub.vv with operands (a, b, c); per the RVV spec this is a * b - c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmsub(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfmsub_vv_f32m2(a, b, c, vl);
+}
+
+// Negated multiply-add through vfnmsub.vv with operands (a, b, c); per the RVV spec this is -(a * b) + c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pnmadd(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfnmsub_vv_f32m2(a, b, c, vl);
+}
+
+// Negated multiply-subtract through vfnmadd.vv with operands (a, b, c); per the RVV spec this is
+// -(a * b) - c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pnmsub(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfnmadd_vv_f32m2(a, b, c, vl);
+}
+
+// NaN-propagating minimum. Lanes where both inputs compare equal to themselves (neither is NaN) take
+// vfmin(a, b); every other lane keeps the quiet-NaN fill supplied as the masked-off operand.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmin<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xf nan_fill = __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), vl);
+  const PacketMask16 a_is_number = __riscv_vmfeq_vv_f32m2_b16(a, a, vl);
+  const PacketMask16 b_is_number = __riscv_vmfeq_vv_f32m2_b16(b, b, vl);
+  const PacketMask16 both_numbers = __riscv_vmand_mm_b16(a_is_number, b_is_number, vl);
+  return __riscv_vfmin_vv_f32m2_tumu(both_numbers, nan_fill, a, b, vl);
+}
+
+// PropagateNaN flavour: forwards to the default pmin<Packet2Xf> specialization.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmin<PropagateNaN, Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  return pmin<Packet2Xf>(a, b);
+}
+
+// PropagateNumbers flavour: a plain vfmin.vv with no extra NaN masking.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmin<PropagateNumbers, Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfmin_vv_f32m2(a, b, vl);
+}
+
+// NaN-propagating maximum. Lanes where both inputs compare equal to themselves (neither is NaN) take
+// vfmax(a, b); every other lane keeps the quiet-NaN fill supplied as the masked-off operand.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmax<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xf nan_fill = __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), vl);
+  const PacketMask16 a_is_number = __riscv_vmfeq_vv_f32m2_b16(a, a, vl);
+  const PacketMask16 b_is_number = __riscv_vmfeq_vv_f32m2_b16(b, b, vl);
+  const PacketMask16 both_numbers = __riscv_vmand_mm_b16(a_is_number, b_is_number, vl);
+  return __riscv_vfmax_vv_f32m2_tumu(both_numbers, nan_fill, a, b, vl);
+}
+
+// PropagateNaN flavour: forwards to the default pmax<Packet2Xf> specialization.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmax<PropagateNaN, Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  return pmax<Packet2Xf>(a, b);
+}
+
+// PropagateNumbers flavour: a plain vfmax.vv with no extra NaN masking.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pmax<PropagateNumbers, Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfmax_vv_f32m2(a, b, vl);
+}
+
+// Lane-wise a <= b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pcmp_le<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const PacketMask16 le = __riscv_vmfle_vv_f32m2_b16(a, b, vl);
+  const Packet2Xf zeros = pzero<Packet2Xf>(a);
+  const Packet2Xf ones = ptrue<Packet2Xf>(a);
+  return __riscv_vmerge_vvm_f32m2(zeros, ones, le, vl);
+}
+
+// Lane-wise a < b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pcmp_lt<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const PacketMask16 lt = __riscv_vmflt_vv_f32m2_b16(a, b, vl);
+  const Packet2Xf zeros = pzero<Packet2Xf>(a);
+  const Packet2Xf ones = ptrue<Packet2Xf>(a);
+  return __riscv_vmerge_vvm_f32m2(zeros, ones, lt, vl);
+}
+
+// Lane-wise a == b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pcmp_eq<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const PacketMask16 eq = __riscv_vmfeq_vv_f32m2_b16(a, b, vl);
+  const Packet2Xf zeros = pzero<Packet2Xf>(a);
+  const Packet2Xf ones = ptrue<Packet2Xf>(a);
+  return __riscv_vmerge_vvm_f32m2(zeros, ones, eq, vl);
+}
+
+// "a < b or unordered" expressed as the complement of a >= b: starts from an all-ones packet and
+// writes 0.0f into the lanes where vmfge reports a >= b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pcmp_lt_or_nan<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const PacketMask16 ge = __riscv_vmfge_vv_f32m2_b16(a, b, vl);
+  const Packet2Xf ones = ptrue<Packet2Xf>(a);
+  return __riscv_vfmerge_vfm_f32m2(ones, 0.0f, ge, vl);
+}
+
+// Lane select driven by a hardware mask: lanes with the mask bit set come from `a`, the rest from `b`.
+EIGEN_STRONG_INLINE Packet2Xf pselect(const PacketMask16& mask, const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vmerge_vvm_f32m2(b, a, mask, vl);
+}
+
+// Lane select driven by a packet mask: a lane whose raw bits are non-zero picks `a`, a lane whose
+// bits are all zero picks `b`.
+EIGEN_STRONG_INLINE Packet2Xf pselect(const Packet2Xf& mask, const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xi mask_bits = __riscv_vreinterpret_v_f32m2_i32m2(mask);
+  const PacketMask16 take_a = __riscv_vmsne_vx_i32m2_b16(mask_bits, 0, vl);
+  return __riscv_vmerge_vvm_f32m2(b, a, take_a, vl);
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+// Bitwise AND of the raw lane bits, computed on the u32 view of both operands.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pand<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xu a_bits = __riscv_vreinterpret_v_f32m2_u32m2(a);
+  const Packet2Xu b_bits = __riscv_vreinterpret_v_f32m2_u32m2(b);
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(a_bits, b_bits, vl));
+}
+
+// Bitwise OR of the raw lane bits, computed on the u32 view of both operands.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf por<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xu a_bits = __riscv_vreinterpret_v_f32m2_u32m2(a);
+  const Packet2Xu b_bits = __riscv_vreinterpret_v_f32m2_u32m2(b);
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(a_bits, b_bits, vl));
+}
+
+// Bitwise XOR of the raw lane bits, computed on the u32 view of both operands.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pxor<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xu a_bits = __riscv_vreinterpret_v_f32m2_u32m2(a);
+  const Packet2Xu b_bits = __riscv_vreinterpret_v_f32m2_u32m2(b);
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(a_bits, b_bits, vl));
+}
+
+// Bitwise a & ~b on the raw lane bits: `b` is complemented on its u32 view before the AND.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pandnot<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xu a_bits = __riscv_vreinterpret_v_f32m2_u32m2(a);
+  const Packet2Xu not_b_bits = __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), vl);
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(a_bits, not_b_bits, vl));
+}
+
+// Loads one full packet of floats starting at `from` with a unit-stride vle32.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pload<Packet2Xf>(const float* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, vl);
+}
+
+// Unaligned flavour of pload: issues the same unit-stride vle32 for one full packet.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf ploadu<Packet2Xf>(const float* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, vl);
+}
+
+// Returns {from[0], from[0], from[1], from[1], ...}.
+// Only the ceil(size / 2) scalars that appear in the result are loaded; loading a whole packet here
+// would read past the elements ploaddup consumes.
+// Duplication trick on the raw u32 bits d: (d + d) + 0xffffffff * d == d * (2^32 + 1) in 64-bit
+// arithmetic, which places d in both 32-bit halves of each widened lane. Truncating the u64m4 result
+// to m2 keeps the lanes built from the loaded prefix; lanes past the prefix are discarded.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf ploaddup<Packet2Xf>(const float* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xi>::size;
+  EIGEN_DEBUG_ALIGNED_LOAD const Packet2Xu data =
+      __riscv_vreinterpret_v_f32m2_u32m2(__riscv_vle32_v_f32m2(from, (vl + 1) / 2));
+  const auto doubled = __riscv_vwaddu_vv_u64m4(data, data, vl);
+  const auto paired = __riscv_vwmaccu_vx_u64m4(doubled, 0xffffffffu, data, vl);
+  return __riscv_vreinterpret_v_i32m2_f32m2(__riscv_vreinterpret_v_i64m2_i32m2(
+      __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vlmul_trunc_v_u64m4_u64m2(paired))));
+}
+
+// Returns {from[0] x4, from[1] x4, ...}: lane i takes element i >> 2 of the loaded data.
+// Only the ceil(size / 4) scalars that the gather indices can reach are loaded; loading a whole
+// packet here would read past the elements ploadquad consumes.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf ploadquad<Packet2Xf>(const float* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xu idx = __riscv_vsrl_vx_u32m2(__riscv_vid_v_u32m2(vl), 2, vl);
+  EIGEN_DEBUG_ALIGNED_LOAD const Packet2Xf quarter = __riscv_vle32_v_f32m2(from, (vl + 3) / 4);
+  return __riscv_vrgather_vv_f32m2(quarter, idx, vl);
+}
+
+// Stores one full packet of floats to `to` with a unit-stride vse32.
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2Xf& from) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, vl);
+}
+
+// Unaligned flavour of pstore: issues the same unit-stride vse32 for one full packet.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2Xf& from) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, vl);
+}
+
+// Strided load: lane i reads from[i * stride]; `stride` is given in elements and scaled to bytes here.
+template <>
+EIGEN_DEVICE_FUNC inline Packet2Xf pgather<float, Packet2Xf>(const float* from, Index stride) {
+  const auto byte_stride = stride * sizeof(float);
+  return __riscv_vlse32_v_f32m2(from, byte_stride, unpacket_traits<Packet2Xf>::size);
+}
+
+// Strided store: lane i is written to to[i * stride]; `stride` is given in elements and scaled to
+// bytes here.
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet2Xf>(float* to, const Packet2Xf& from, Index stride) {
+  const auto byte_stride = stride * sizeof(float);
+  __riscv_vsse32(to, byte_stride, from, unpacket_traits<Packet2Xf>::size);
+}
+
+// Extracts lane 0 as a scalar (vfmv.f.s).
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet2Xf>(const Packet2Xf& a) {
+  const float lane0 = __riscv_vfmv_f_s_f32m2_f32(a);
+  return lane0;
+}
+
+// Lane-wise square root (vfsqrt).
+template <>
+EIGEN_STRONG_INLINE Packet2Xf psqrt(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfsqrt_v_f32m2(a, vl);
+}
+
+// Rounds each lane to an integral value by converting to int32 and back.
+// Lanes with |a| >= 2^23 (including NaN lanes, which fail the `<` test) bypass the round trip and
+// return `x` instead.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf print<Packet2Xf>(const Packet2Xf& a) {
+  // 2^23: threshold below which the int32 round trip is applied.
+  const Packet2Xf limit = pset1<Packet2Xf>(static_cast<float>(1 << 23));
+  const Packet2Xf abs_a = pabs(a);
+
+  // mask marks lanes where a != a, i.e. NaN lanes.
+  PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits<Packet2Xf>::size);
+  // x is a + a in NaN lanes and a itself everywhere else (masked-off lanes keep the first operand).
+  const Packet2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits<Packet2Xf>::size);
+  // Round trip float -> int32 -> float. NOTE(review): presumably rounds per the active rounding
+  // mode, since no explicit rounding-mode variant is used - confirm.
+  const Packet2Xf new_x = __riscv_vfcvt_f_x_v_f32m2(__riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits<Packet2Xf>::size),
+                                                    unpacket_traits<Packet2Xf>::size);
+
+  // mask is reused: lanes whose magnitude is below the limit.
+  mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits<Packet2Xf>::size);
+  // Magnitude from the rounded value, sign from x (vfsgnj).
+  Packet2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits<Packet2Xf>::size);
+  // Small lanes take the rounded value, all others keep x.
+  return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits<Packet2Xf>::size);
+}
+
+// Floor built on print: lanes where the rounded value ended up above the input get 1.0f subtracted,
+// the remaining lanes keep the rounded value.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pfloor<Packet2Xf>(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xf rounded = print<Packet2Xf>(a);
+  // Lanes where a < rounded need the correction.
+  const PacketMask16 rounded_up = __riscv_vmflt_vv_f32m2_b16(a, rounded, vl);
+  return __riscv_vfsub_vf_f32m2_tumu(rounded_up, rounded, rounded, 1.0f, vl);
+}
+
+// Reverses lane order: gathers with indices (size - 1) - i computed from vid via vrsub.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf preverse(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const Packet2Xu lane_ids = __riscv_vid_v_u32m2(vl);
+  const Packet2Xu mirrored = __riscv_vrsub_vx_u32m2(lane_ids, vl - 1, vl);
+  return __riscv_vrgather_vv_f32m2(a, mirrored, vl);
+}
+
+// Forwards to pfrexp_generic; `exponent` is passed through as the output argument.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pfrexp<Packet2Xf>(const Packet2Xf& a, Packet2Xf& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// Horizontal sum of all lanes: unordered sum reduction (vfredusum) seeded with a zero m1 vector,
+// then lane 0 of the result is extracted.
+template <>
+EIGEN_STRONG_INLINE float predux<Packet2Xf>(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const auto seed = __riscv_vfmv_v_f_f32m1(0.0, vl / 2);
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1(a, seed, vl));
+}
+
+// Horizontal product: multiplies the two m1 parts lane by lane and delegates the rest of the
+// reduction to predux_mul<Packet1Xf>.
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet2Xf>(const Packet2Xf& a) {
+  const Packet1Xf part0 = __riscv_vget_v_f32m2_f32m1(a, 0);
+  const Packet1Xf part1 = __riscv_vget_v_f32m2_f32m1(a, 1);
+  return predux_mul<Packet1Xf>(__riscv_vfmul_vv_f32m1(part0, part1, unpacket_traits<Packet1Xf>::size));
+}
+
+// Horizontal minimum: vfredmin seeded with a quiet-NaN m1 vector, then the extracted scalar is
+// passed through std::min against the largest finite float.
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet2Xf>(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const auto seed = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), vl / 2);
+  const float reduced = __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1(a, seed, vl));
+  return (std::min)(reduced, (std::numeric_limits<float>::max)());
+}
+
+// Horizontal maximum: vfredmax seeded with a quiet-NaN m1 vector, then the extracted scalar is
+// passed through std::max against the most negative finite float.
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet2Xf>(const Packet2Xf& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  const auto seed = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), vl / 2);
+  const float reduced = __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1(a, seed, vl));
+  return (std::max)(reduced, -(std::numeric_limits<float>::max)());
+}
+
+// Transposes the block through a stack buffer of size * N floats.
+// Pass 1 writes packet r with an element stride of N starting at scratch[r]; pass 2 reloads packet r
+// from the r-th contiguous run of `size` floats.
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xf, N>& kernel) {
+  constexpr auto vl = unpacket_traits<Packet2Xf>::size;
+  float scratch[vl * N];
+
+  for (int r = 0; r < N; ++r) {
+    __riscv_vsse32(&scratch[r], N * sizeof(float), kernel.packet[r], vl);
+  }
+
+  for (int r = 0; r < N; ++r) {
+    kernel.packet[r] = __riscv_vle32_v_f32m2(&scratch[r * vl], vl);
+  }
+}
+
+// Forwards to pldexp_generic with the same (a, exponent) arguments.
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pldexp<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+// Folds a Packet4Xf into a Packet2Xf by adding its two m2 parts (vget indices 0 and 1) lane by lane.
+// Enabled only when the Packet4Xf lane count is a multiple of 8.
+template <typename Packet = Packet4Xf>
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet4Xf>::value && (unpacket_traits<Packet4Xf>::size % 8) == 0, Packet2Xf>
+    predux_half(const Packet4Xf& a) {
+  const Packet2Xf part0 = __riscv_vget_v_f32m4_f32m2(a, 0);
+  const Packet2Xf part1 = __riscv_vget_v_f32m4_f32m2(a, 1);
+  return __riscv_vfadd_vv_f32m2(part0, part1, unpacket_traits<Packet2Xf>::size);
+}
+
+// Folds a Packet2Xf into a Packet1Xf by adding its two m1 parts (vget indices 0 and 1) lane by lane.
+// Enabled only when the Packet2Xf lane count is a multiple of 8.
+template <typename Packet = Packet2Xf>
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet2Xf>::value && (unpacket_traits<Packet2Xf>::size % 8) == 0, Packet1Xf>
+    predux_half(const Packet2Xf& a) {
+  const Packet1Xf part0 = __riscv_vget_v_f32m2_f32m1(a, 0);
+  const Packet1Xf part1 = __riscv_vget_v_f32m2_f32m1(a, 1);
+  return __riscv_vfadd_vv_f32m1(part0, part1, unpacket_traits<Packet1Xf>::size);
+}
+
+/********************************* Packet2Xl ************************************/
+
+// Broadcasts the scalar `from` into every int64 lane.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pset1<Packet2Xl>(const numext::int64_t& from) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vmv_v_x_i64m2(from, vl);
+}
+
+// Returns {a + 0, a + 1, ..., a + (size - 1)}: lane indices from vid, viewed as i64, offset by `a`.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl plset<Packet2Xl>(const numext::int64_t& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xl lane_ids = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(vl));
+  return __riscv_vadd_vx_i64m2(lane_ids, a, vl);
+}
+
+// Packet with 0 in every int64 lane; the argument is unused.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pzero<Packet2Xl>(const Packet2Xl& /*a*/) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vmv_v_x_i64m2(0, vl);
+}
+
+// Lane-wise a + b on int64 lanes.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl padd<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vadd_vv_i64m2(a, b, vl);
+}
+
+// Lane-wise a - b on int64 lanes (overloaded vsub intrinsic).
+template <>
+EIGEN_STRONG_INLINE Packet2Xl psub<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vsub(a, b, vl);
+}
+
+// Lane-wise negation on int64 lanes (overloaded vneg intrinsic).
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pnegate(const Packet2Xl& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vneg(a, vl);
+}
+
+// Conjugate of an int64 packet: returns the input unchanged.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pconj(const Packet2Xl& a) {
+  return a;
+}
+
+// Lane-wise a * b on int64 lanes (overloaded vmul intrinsic).
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pmul<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vmul(a, b, vl);
+}
+
+// Lane-wise signed a / b on int64 lanes (overloaded vdiv intrinsic).
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pdiv<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vdiv(a, b, vl);
+}
+
+// Integer multiply-add through vmadd with operands (a, b, c); per the RVV spec this is a * b + c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pmadd(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vmadd(a, b, c, vl);
+}
+
+// a * b - c, expressed as vmadd(a, b, -c): the addend is negated first.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pmsub(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xl minus_c = pnegate(c);
+  return __riscv_vmadd(a, b, minus_c, vl);
+}
+
+// Negated multiply-add through vnmsub.vv with operands (a, b, c); per the RVV spec this is -(a * b) + c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pnmadd(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vnmsub_vv_i64m2(a, b, c, vl);
+}
+
+// -(a * b) - c, expressed as vnmsub(a, b, -c): the addend is negated first.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pnmsub(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xl minus_c = pnegate(c);
+  return __riscv_vnmsub_vv_i64m2(a, b, minus_c, vl);
+}
+
+// Lane-wise signed minimum on int64 lanes (overloaded vmin intrinsic).
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pmin<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vmin(a, b, vl);
+}
+
+// Lane-wise signed maximum on int64 lanes (overloaded vmax intrinsic).
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pmax<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vmax(a, b, vl);
+}
+
+// Lane-wise signed a <= b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pcmp_le<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const PacketMask32 le = __riscv_vmsle_vv_i64m2_b32(a, b, vl);
+  const Packet2Xl zeros = pzero(a);
+  return __riscv_vmerge_vxm_i64m2(zeros, 0xffffffffffffffff, le, vl);
+}
+
+// Lane-wise signed a < b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pcmp_lt<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const PacketMask32 lt = __riscv_vmslt_vv_i64m2_b32(a, b, vl);
+  const Packet2Xl zeros = pzero(a);
+  return __riscv_vmerge_vxm_i64m2(zeros, 0xffffffffffffffff, lt, vl);
+}
+
+// Lane-wise a == b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pcmp_eq<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const PacketMask32 eq = __riscv_vmseq_vv_i64m2_b32(a, b, vl);
+  const Packet2Xl zeros = pzero(a);
+  return __riscv_vmerge_vxm_i64m2(zeros, 0xffffffffffffffff, eq, vl);
+}
+
+// All bits set in every int64 lane; the argument is unused.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl ptrue<Packet2Xl>(const Packet2Xl& /*a*/) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, vl);
+}
+
+// Lane-wise bitwise AND.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pand<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vand_vv_i64m2(a, b, vl);
+}
+
+// Lane-wise bitwise OR.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl por<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vor_vv_i64m2(a, b, vl);
+}
+
+// Lane-wise bitwise XOR.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pxor<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vxor_vv_i64m2(a, b, vl);
+}
+
+// Lane-wise a & ~b: `b` is complemented with vnot before the AND.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pandnot<Packet2Xl>(const Packet2Xl& a, const Packet2Xl& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xl not_b = __riscv_vnot_v_i64m2(b, vl);
+  return __riscv_vand_vv_i64m2(a, not_b, vl);
+}
+
+// Arithmetic (sign-propagating) right shift of every int64 lane by the compile-time amount N.
+template <int N>
+EIGEN_STRONG_INLINE Packet2Xl parithmetic_shift_right(Packet2Xl a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vsra_vx_i64m2(a, N, vl);
+}
+
+// Logical right shift by the compile-time amount N: the lanes are viewed as u64 for vsrl and the
+// result is viewed as i64 again.
+template <int N>
+EIGEN_STRONG_INLINE Packet2Xl plogical_shift_right(Packet2Xl a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xul shifted = __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, vl);
+  return __riscv_vreinterpret_i64m2(shifted);
+}
+
+// Left shift of every int64 lane by the compile-time amount N.
+template <int N>
+EIGEN_STRONG_INLINE Packet2Xl plogical_shift_left(Packet2Xl a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  return __riscv_vsll_vx_i64m2(a, N, vl);
+}
+
+// Loads one full packet of int64 values starting at `from` with a unit-stride vle64.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pload<Packet2Xl>(const numext::int64_t* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, vl);
+}
+
+// Unaligned flavour of pload: issues the same unit-stride vle64 for one full packet.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl ploadu<Packet2Xl>(const numext::int64_t* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, vl);
+}
+
+// Returns {from[0], from[0], from[1], from[1], ...}: lane i takes element i >> 1 of the loaded data.
+// Only the ceil(size / 2) scalars that the gather indices can reach are loaded; loading a whole
+// packet here would read past the elements ploaddup consumes.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl ploaddup<Packet2Xl>(const numext::int64_t* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xul idx = __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(vl), 1, vl);
+  EIGEN_DEBUG_ALIGNED_LOAD const Packet2Xl half = __riscv_vle64_v_i64m2(from, (vl + 1) / 2);
+  return __riscv_vrgather_vv_i64m2(half, idx, vl);
+}
+
+// Returns {from[0] x4, from[1] x4, ...}: lane i takes element i >> 2 of the loaded data.
+// Only the ceil(size / 4) scalars that the gather indices can reach are loaded; loading a whole
+// packet here would read past the elements ploadquad consumes.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl ploadquad<Packet2Xl>(const numext::int64_t* from) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xul idx = __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(vl), 2, vl);
+  EIGEN_DEBUG_ALIGNED_LOAD const Packet2Xl quarter = __riscv_vle64_v_i64m2(from, (vl + 3) / 4);
+  return __riscv_vrgather_vv_i64m2(quarter, idx, vl);
+}
+
+// Stores one full packet of int64 values to `to` with a unit-stride vse64.
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const Packet2Xl& from) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, vl);
+}
+
+// Unaligned flavour of pstore: issues the same unit-stride vse64 for one full packet.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const Packet2Xl& from) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, vl);
+}
+
+// Strided load: lane i reads from[i * stride]; `stride` is given in elements and scaled to bytes here.
+template <>
+EIGEN_DEVICE_FUNC inline Packet2Xl pgather<numext::int64_t, Packet2Xl>(const numext::int64_t* from, Index stride) {
+  const auto byte_stride = stride * sizeof(numext::int64_t);
+  return __riscv_vlse64_v_i64m2(from, byte_stride, unpacket_traits<Packet2Xl>::size);
+}
+
+// Strided store: lane i is written to to[i * stride]; `stride` is given in elements and scaled to
+// bytes here.
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, Packet2Xl>(numext::int64_t* to, const Packet2Xl& from,
+                                                                   Index stride) {
+  const auto byte_stride = stride * sizeof(numext::int64_t);
+  __riscv_vsse64(to, byte_stride, from, unpacket_traits<Packet2Xl>::size);
+}
+
+// Extracts lane 0 as a scalar (vmv.x.s).
+template <>
+EIGEN_STRONG_INLINE numext::int64_t pfirst<Packet2Xl>(const Packet2Xl& a) {
+  const numext::int64_t lane0 = __riscv_vmv_x_s_i64m2_i64(a);
+  return lane0;
+}
+
+// Reverses lane order: gathers with indices (size - 1) - i computed from vid via vrsub.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl preverse(const Packet2Xl& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xul lane_ids = __riscv_vid_v_u64m2(vl);
+  const Packet2Xul mirrored = __riscv_vrsub_vx_u64m2(lane_ids, vl - 1, vl);
+  return __riscv_vrgather_vv_i64m2(a, mirrored, vl);
+}
+
+// Branch-free absolute value: with s = a >> 63 (arithmetic), the result is (a ^ s) - s.
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pabs(const Packet2Xl& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xl sign = __riscv_vsra_vx_i64m2(a, 63, vl);
+  const Packet2Xl flipped = __riscv_vxor_vv_i64m2(a, sign, vl);
+  return __riscv_vsub_vv_i64m2(flipped, sign, vl);
+}
+
+// Horizontal sum of all lanes: vredsum seeded with a zero m1 vector, then lane 0 of the result is
+// extracted.
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux<Packet2Xl>(const Packet2Xl& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const auto seed = __riscv_vmv_v_x_i64m1(0, vl / 2);
+  return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1(a, seed, vl));
+}
+
+// Horizontal product: multiplies the two m1 parts lane by lane and delegates the rest of the
+// reduction to predux_mul<Packet1Xl>.
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_mul<Packet2Xl>(const Packet2Xl& a) {
+  const Packet1Xl part0 = __riscv_vget_v_i64m2_i64m1(a, 0);
+  const Packet1Xl part1 = __riscv_vget_v_i64m2_i64m1(a, 1);
+  return predux_mul<Packet1Xl>(__riscv_vmul_vv_i64m1(part0, part1, unpacket_traits<Packet1Xl>::size));
+}
+
+// Horizontal signed minimum: vredmin seeded with the largest int64 value, then lane 0 of the result
+// is extracted.
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_min<Packet2Xl>(const Packet2Xl& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const auto seed = __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), vl / 2);
+  return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1(a, seed, vl));
+}
+
+// Horizontal signed maximum: vredmax seeded with the smallest int64 value, then lane 0 of the result
+// is extracted.
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_max<Packet2Xl>(const Packet2Xl& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const auto seed = __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), vl / 2);
+  return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1(a, seed, vl));
+}
+
+// Transposes the block through a zero-initialized stack buffer of size * N int64 values.
+// Pass 1 writes packet r with an element stride of N starting at scratch[r]; pass 2 reloads packet r
+// from the r-th contiguous run of `size` values.
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xl, N>& kernel) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  numext::int64_t scratch[vl * N] = {0};
+
+  for (int r = 0; r < N; ++r) {
+    __riscv_vsse64(&scratch[r], N * sizeof(numext::int64_t), kernel.packet[r], vl);
+  }
+  for (int r = 0; r < N; ++r) {
+    kernel.packet[r] = __riscv_vle64_v_i64m2(&scratch[r * vl], vl);
+  }
+}
+
+// Folds a Packet4Xl into a Packet2Xl by adding its two m2 parts (vget indices 0 and 1) lane by lane.
+// Enabled only when the Packet4Xl lane count is a multiple of 8.
+template <typename Packet = Packet4Xl>
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet4Xl>::value && (unpacket_traits<Packet4Xl>::size % 8) == 0, Packet2Xl>
+    predux_half(const Packet4Xl& a) {
+  const Packet2Xl part0 = __riscv_vget_v_i64m4_i64m2(a, 0);
+  const Packet2Xl part1 = __riscv_vget_v_i64m4_i64m2(a, 1);
+  return __riscv_vadd_vv_i64m2(part0, part1, unpacket_traits<Packet2Xl>::size);
+}
+
+// Folds a Packet2Xl into a Packet1Xl by adding its two m1 parts (vget indices 0 and 1) lane by lane.
+// Enabled only when the Packet2Xl lane count is a multiple of 8.
+template <typename Packet = Packet2Xl>
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet2Xl>::value && (unpacket_traits<Packet2Xl>::size % 8) == 0, Packet1Xl>
+    predux_half(const Packet2Xl& a) {
+  const Packet1Xl part0 = __riscv_vget_v_i64m2_i64m1(a, 0);
+  const Packet1Xl part1 = __riscv_vget_v_i64m2_i64m1(a, 1);
+  return __riscv_vadd_vv_i64m1(part0, part1, unpacket_traits<Packet1Xl>::size);
+}
+
+/********************************* Packet2Xd ************************************/
+
+// All bits set in every lane: broadcasts 0xffffffffffffffff as u64 and reinterprets the vector as f64.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd ptrue<Packet2Xd>(const Packet2Xd& /*a*/) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const Packet2Xul all_ones = __riscv_vmv_v_x_u64m2(0xffffffffffffffffu, vl);
+  return __riscv_vreinterpret_f64m2(all_ones);
+}
+
+// Packet with 0.0 in every lane; the argument is unused.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pzero<Packet2Xd>(const Packet2Xd& /*a*/) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfmv_v_f_f64m2(0.0, vl);
+}
+
+// Lane-wise absolute value (vfabs).
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pabs(const Packet2Xd& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfabs_v_f64m2(a, vl);
+}
+
+// Lane-wise |a - b|: subtract first, then take the absolute value of the difference.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pabsdiff(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const Packet2Xd diff = __riscv_vfsub_vv_f64m2(a, b, vl);
+  return __riscv_vfabs_v_f64m2(diff, vl);
+}
+
+// Broadcasts the scalar `from` into every lane.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pset1<Packet2Xd>(const double& from) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfmv_v_f_f64m2(from, vl);
+}
+
+// Broadcasts the raw 64-bit pattern `from` into every lane and reinterprets the lanes as double.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pset1frombits<Packet2Xd>(numext::uint64_t from) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const Packet2Xul bits = __riscv_vmv_v_x_u64m2(from, vl);
+  return __riscv_vreinterpret_f64m2(bits);
+}
+
+// Returns {a + 0, a + 1, ..., a + (size - 1)}: lane indices produced by vid are converted to double
+// and offset by `a`.
+// vid is issued with this packet's own lane count. unpacket_traits<Packet4Xi>::size describes a
+// 32-bit m4 packet and is not the number of 64-bit lanes in an m2 vector, so all three intrinsics
+// now share one vl.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd plset<Packet2Xd>(const double& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const Packet2Xl lane_ids = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(vl));
+  const Packet2Xd idx = __riscv_vfcvt_f_x_v_f64m2(lane_ids, vl);
+  return __riscv_vfadd_vf_f64m2(idx, a, vl);
+}
+
+// Broadcasts a[0..3] into a0..a3: the scalars are loaded once with a requested vl of 4 and each is
+// splat across all lanes with vrgather.vx.
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2Xd>(const double* a, Packet2Xd& a0, Packet2Xd& a1, Packet2Xd& a2,
+                                                Packet2Xd& a3) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const vfloat64m2_t quad = __riscv_vle64_v_f64m2(a, 4);
+  a0 = __riscv_vrgather_vx_f64m2(quad, 0, vl);
+  a1 = __riscv_vrgather_vx_f64m2(quad, 1, vl);
+  a2 = __riscv_vrgather_vx_f64m2(quad, 2, vl);
+  a3 = __riscv_vrgather_vx_f64m2(quad, 3, vl);
+}
+
+// Lane-wise a + b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd padd<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfadd_vv_f64m2(a, b, vl);
+}
+
+// Lane-wise a - b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd psub<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfsub_vv_f64m2(a, b, vl);
+}
+
+// Lane-wise negation (vfneg).
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pnegate(const Packet2Xd& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfneg_v_f64m2(a, vl);
+}
+
+// Smears the sign bit over each lane: the raw bits are viewed as i64, arithmetically shifted right
+// by 63, and viewed as double again.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd psignbit(const Packet2Xd& a) {
+  constexpr auto vl = unpacket_traits<Packet2Xl>::size;
+  const Packet2Xl raw = __riscv_vreinterpret_v_f64m2_i64m2(a);
+  const Packet2Xl smeared = __riscv_vsra_vx_i64m2(raw, 63, vl);
+  return __riscv_vreinterpret_v_i64m2_f64m2(smeared);
+}
+
+// Conjugate of a double packet: returns the input unchanged.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pconj(const Packet2Xd& a) {
+  return a;
+}
+
+// Lane-wise a * b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmul<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfmul_vv_f64m2(a, b, vl);
+}
+
+// Lane-wise a / b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pdiv<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfdiv_vv_f64m2(a, b, vl);
+}
+
+// Fused multiply-add through vfmadd.vv with operands (a, b, c); per the RVV spec this is a * b + c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmadd(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfmadd_vv_f64m2(a, b, c, vl);
+}
+
+// Fused multiply-subtract through vfmsub.vv with operands (a, b, c); per the RVV spec this is a * b - c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmsub(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfmsub_vv_f64m2(a, b, c, vl);
+}
+
+// Negated multiply-add through vfnmsub.vv with operands (a, b, c); per the RVV spec this is -(a * b) + c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pnmadd(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfnmsub_vv_f64m2(a, b, c, vl);
+}
+
+// Negated multiply-subtract through vfnmadd.vv with operands (a, b, c); per the RVV spec this is
+// -(a * b) - c.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pnmsub(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfnmadd_vv_f64m2(a, b, c, vl);
+}
+
+// NaN-propagating minimum. Lanes where both inputs compare equal to themselves (neither is NaN) take
+// vfmin(a, b); every other lane keeps the quiet-NaN fill supplied as the masked-off operand.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmin<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const Packet2Xd nan_fill = __riscv_vfmv_v_f_f64m2((std::numeric_limits<double>::quiet_NaN)(), vl);
+  const PacketMask32 a_is_number = __riscv_vmfeq_vv_f64m2_b32(a, a, vl);
+  const PacketMask32 b_is_number = __riscv_vmfeq_vv_f64m2_b32(b, b, vl);
+  const PacketMask32 both_numbers = __riscv_vmand_mm_b32(a_is_number, b_is_number, vl);
+  return __riscv_vfmin_vv_f64m2_tumu(both_numbers, nan_fill, a, b, vl);
+}
+
+// PropagateNaN flavour: forwards to the default pmin<Packet2Xd> specialization.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmin<PropagateNaN, Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  return pmin<Packet2Xd>(a, b);
+}
+
+// PropagateNumbers flavour: a plain vfmin.vv with no extra NaN masking.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmin<PropagateNumbers, Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfmin_vv_f64m2(a, b, vl);
+}
+
+// NaN-propagating maximum. Lanes where both inputs compare equal to themselves (neither is NaN) take
+// vfmax(a, b); every other lane keeps the quiet-NaN fill supplied as the masked-off operand.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmax<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const Packet2Xd nan_fill = __riscv_vfmv_v_f_f64m2((std::numeric_limits<double>::quiet_NaN)(), vl);
+  const PacketMask32 a_is_number = __riscv_vmfeq_vv_f64m2_b32(a, a, vl);
+  const PacketMask32 b_is_number = __riscv_vmfeq_vv_f64m2_b32(b, b, vl);
+  const PacketMask32 both_numbers = __riscv_vmand_mm_b32(a_is_number, b_is_number, vl);
+  return __riscv_vfmax_vv_f64m2_tumu(both_numbers, nan_fill, a, b, vl);
+}
+
+// PropagateNaN flavour: forwards to the default pmax<Packet2Xd> specialization.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmax<PropagateNaN, Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  return pmax<Packet2Xd>(a, b);
+}
+
+// PropagateNumbers flavour: a plain vfmax.vv with no extra NaN masking.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pmax<PropagateNumbers, Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  return __riscv_vfmax_vv_f64m2(a, b, vl);
+}
+
+// Lane-wise a <= b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pcmp_le<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const PacketMask32 le = __riscv_vmfle_vv_f64m2_b32(a, b, vl);
+  const Packet2Xd zeros = pzero<Packet2Xd>(a);
+  const Packet2Xd ones = ptrue<Packet2Xd>(a);
+  return __riscv_vmerge_vvm_f64m2(zeros, ones, le, vl);
+}
+
+// Lane-wise a < b as a bit mask packet: all-ones where the comparison holds, zero elsewhere.
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pcmp_lt<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  constexpr auto vl = unpacket_traits<Packet2Xd>::size;
+  const PacketMask32 lt = __riscv_vmflt_vv_f64m2_b32(a, b, vl);
+  const Packet2Xd zeros = pzero<Packet2Xd>(a);
+  const Packet2Xd ones = ptrue<Packet2Xd>(a);
+  return __riscv_vmerge_vvm_f64m2(zeros, ones, lt, vl);
+}
+
+template <>  // Lane-wise a == b: ptrue where the comparison holds, pzero elsewhere.
+EIGEN_STRONG_INLINE Packet2Xd pcmp_eq<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits<Packet2Xd>::size);
+  return __riscv_vmerge_vvm_f64m2(pzero<Packet2Xd>(a), ptrue<Packet2Xd>(a), mask, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Computed as !(a >= b): 0.0 where a >= b holds, ptrue in all other lanes (a < b or a NaN operand).
+EIGEN_STRONG_INLINE Packet2Xd pcmp_lt_or_nan<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits<Packet2Xd>::size);  // inverted test
+  return __riscv_vfmerge_vfm_f64m2(ptrue<Packet2Xd>(a), 0.0, mask, unpacket_traits<Packet2Xd>::size);
+}
+
+EIGEN_STRONG_INLINE Packet2Xd pselect(const PacketMask32& mask, const Packet2Xd& a, const Packet2Xd& b) {
+  return __riscv_vmerge_vvm_f64m2(b, a, mask, unpacket_traits<Packet2Xd>::size);  // a where mask is set, else b
+}
+
+EIGEN_STRONG_INLINE Packet2Xd pselect(const Packet2Xd& mask, const Packet2Xd& a, const Packet2Xd& b) {
+  PacketMask32 mask2 =  // A lane selects `a` when its 64-bit pattern in `mask` is non-zero.
+      __riscv_vmsne_vx_i64m2_b32(__riscv_vreinterpret_v_f64m2_i64m2(mask), 0, unpacket_traits<Packet2Xd>::size);
+  return __riscv_vmerge_vvm_f64m2(b, a, mask2, unpacket_traits<Packet2Xd>::size);  // a where set, else b
+}
+
+// Bitwise logical operations are not available on f64 vectors, so operands are reinterpreted as u64.
+template <>  // Bitwise a & b on the raw 64-bit lane patterns.
+EIGEN_STRONG_INLINE Packet2Xd pand<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(
+      __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<Packet2Xd>::size));
+}
+
+template <>  // Bitwise a | b on the raw 64-bit lane patterns (via u64 reinterpretation).
+EIGEN_STRONG_INLINE Packet2Xd por<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(
+      __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<Packet2Xd>::size));
+}
+
+template <>  // Bitwise a ^ b on the raw 64-bit lane patterns (via u64 reinterpretation).
+EIGEN_STRONG_INLINE Packet2Xd pxor<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(
+      __riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<Packet2Xd>::size));
+}
+
+template <>  // Bitwise a & ~b on the raw 64-bit lane patterns (via u64 reinterpretation).
+EIGEN_STRONG_INLINE Packet2Xd pandnot<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& b) {
+  return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(
+      __riscv_vreinterpret_v_f64m2_u64m2(a),
+      __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits<Packet2Xd>::size),  // ~b
+      unpacket_traits<Packet2Xd>::size));
+}
+
+template <>  // Unit-stride load of a full packet; differs from ploadu only in the debug hook macro used.
+EIGEN_STRONG_INLINE Packet2Xd pload<Packet2Xd>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Unit-stride load of a full packet; same vle64 instruction as pload, unaligned debug hook.
+EIGEN_STRONG_INLINE Packet2Xd ploadu<Packet2Xd>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Returns {f0, f0, f1, f1, ...}: each of the first size/2 scalars at `from`, twice.
+EIGEN_STRONG_INLINE Packet2Xd ploaddup<Packet2Xd>(const double* from) {
+  constexpr std::size_t vl = unpacket_traits<Packet2Xd>::size;
+  EIGEN_DEBUG_ALIGNED_LOAD Packet2Xd src = __riscv_vle64_v_f64m2(from, vl / 2);  // touch only the vl/2 inputs
+  return __riscv_vrgather_vv_f64m2(src, __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(vl), 1, vl), vl);  // src[i >> 1]
+}
+
+template <>  // Returns {f0, f0, f0, f0, f1, ...}: each of the first size/4 scalars at `from`, four times.
+EIGEN_STRONG_INLINE Packet2Xd ploadquad<Packet2Xd>(const double* from) {
+  constexpr std::size_t vl = unpacket_traits<Packet2Xd>::size;
+  EIGEN_DEBUG_ALIGNED_LOAD Packet2Xd src = __riscv_vle64_v_f64m2(from, vl / 4);  // touch only the vl/4 inputs
+  return __riscv_vrgather_vv_f64m2(src, __riscv_vsrl_vx_u64m2(__riscv_vid_v_u64m2(vl), 2, vl), vl);  // src[i >> 2]
+}
+
+template <>  // Unit-stride store of a full packet; differs from pstoreu only in the debug hook macro used.
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Unit-stride store of a full packet; same vse64 instruction as pstore, unaligned debug hook.
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Strided load: `stride` is given in elements and scaled to a byte stride for vlse64.
+EIGEN_DEVICE_FUNC inline Packet2Xd pgather<double, Packet2Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Strided store: `stride` is given in elements and scaled to a byte stride for vsse64.
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2Xd>(double* to, const Packet2Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Extracts lane 0 of the packet as a scalar double.
+EIGEN_STRONG_INLINE double pfirst<Packet2Xd>(const Packet2Xd& a) {
+  return __riscv_vfmv_f_s_f64m2_f64(a);
+}
+
+template <>  // Lane-wise square root (vfsqrt).
+EIGEN_STRONG_INLINE Packet2Xd psqrt(const Packet2Xd& a) {
+  return __riscv_vfsqrt_v_f64m2(a, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Rounds each lane to an integral value by converting to int64 and back where |a| < 2^52.
+EIGEN_STRONG_INLINE Packet2Xd print<Packet2Xd>(const Packet2Xd& a) {
+  const Packet2Xd limit = pset1<Packet2Xd>(static_cast<double>(1ull << 52));  // 2^52
+  const Packet2Xd abs_a = pabs(a);
+
+  PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits<Packet2Xd>::size);  // NaN lanes
+  const Packet2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits<Packet2Xd>::size);  // a + a if NaN
+  const Packet2Xd new_x = __riscv_vfcvt_f_x_v_f64m2(__riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits<Packet2Xd>::size),
+                                                    unpacket_traits<Packet2Xd>::size);
+
+  mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits<Packet2Xd>::size);  // |a| < 2^52
+  Packet2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits<Packet2Xd>::size);  // sign taken from x
+  return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits<Packet2Xd>::size);  // else pass x through
+}
+
+template <>  // Floor built on print(): lanes whose rounded value exceeds the input are decremented by 1.0.
+EIGEN_STRONG_INLINE Packet2Xd pfloor<Packet2Xd>(const Packet2Xd& a) {
+  Packet2Xd tmp = print<Packet2Xd>(a);
+  // If greater, subtract one.
+  PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits<Packet2Xd>::size);  // a < tmp
+  return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits<Packet2Xd>::size);  // else keep tmp
+}
+
+template <>  // Reverses lane order by gathering with idx[i] = size - 1 - i.
+EIGEN_STRONG_INLINE Packet2Xd preverse(const Packet2Xd& a) {
+  Packet2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits<Packet2Xd>::size),
+                                          unpacket_traits<Packet2Xd>::size - 1, unpacket_traits<Packet2Xd>::size);
+  return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>  // Delegates to the generic pfrexp_generic helper; `exponent` is passed through as its output.
+EIGEN_STRONG_INLINE Packet2Xd pfrexp<Packet2Xd>(const Packet2Xd& a, Packet2Xd& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>  // Sum of all lanes: vfredusum over the f64m2 packet, seeded with 0.0, result read from lane 0.
+EIGEN_STRONG_INLINE double predux<Packet2Xd>(const Packet2Xd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet2Xd>::size / 2), unpacket_traits<Packet2Xd>::size));
+}
+
+template <>  // Multiplies the two f64m1 halves lane-wise, then finishes with predux_mul<Packet1Xd>.
+EIGEN_STRONG_INLINE double predux_mul<Packet2Xd>(const Packet2Xd& a) {
+  return predux_mul<Packet1Xd>(__riscv_vfmul_vv_f64m1(
+      __riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>  // Minimum over all lanes: vfredmin seeded with quiet NaN, then std::min against DBL_MAX.
+EIGEN_STRONG_INLINE double predux_min<Packet2Xd>(const Packet2Xd& a) {
+  return (std::min)(
+      __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1(
+          a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet2Xd>::size / 2),
+          unpacket_traits<Packet2Xd>::size)),
+      (std::numeric_limits<double>::max)());  // second std::min operand: largest finite double
+}
+
+template <>  // Maximum over all lanes: vfredmax seeded with quiet NaN, then std::max against -DBL_MAX.
+EIGEN_STRONG_INLINE double predux_max<Packet2Xd>(const Packet2Xd& a) {
+  return (std::max)(
+      __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1(
+          a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet2Xd>::size / 2),
+          unpacket_traits<Packet2Xd>::size)),
+      -(std::numeric_limits<double>::max)());  // second std::max operand: most negative finite double
+}
+
+template <int N>  // Transposes the N packets of `kernel` in place through a stack buffer of size * N doubles.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xd, N>& kernel) {
+  double buffer[unpacket_traits<Packet2Xd>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // strided store: lane j of packet i lands at buffer[j * N + i]
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<Packet2Xd>::size);
+  }
+
+  for (i = 0; i < N; i++) {  // contiguous reload: packet i takes the i-th run of `size` doubles
+    kernel.packet[i] =
+        __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits<Packet2Xd>::size], unpacket_traits<Packet2Xd>::size);
+  }
+}
+
+template <>  // Delegates to the generic pldexp_generic helper.
+EIGEN_STRONG_INLINE Packet2Xd pldexp<Packet2Xd>(const Packet2Xd& a, const Packet2Xd& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <typename Packet = Packet4Xd>  // Enabled only when Packet4Xd has a multiple of 8 lanes.
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet4Xd>::value && (unpacket_traits<Packet4Xd>::size % 8) == 0, Packet2Xd>
+    predux_half(const Packet4Xd& a) {
+  return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1),
+                                unpacket_traits<Packet2Xd>::size);  // lower f64m2 half + upper f64m2 half
+}
+
+template <typename Packet = Packet2Xd>  // Enabled only when Packet2Xd has a multiple of 8 lanes.
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet2Xd>::value && (unpacket_traits<Packet2Xd>::size % 8) == 0, Packet1Xd>
+    predux_half(const Packet2Xd& a) {
+  return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
+                                unpacket_traits<Packet1Xd>::size);  // lower f64m1 half + upper f64m1 half
+}
+
+/********************************* Packet2Xs ************************************/
+
+template <>  // Broadcasts the scalar `from` to every 16-bit lane.
+EIGEN_STRONG_INLINE Packet2Xs pset1<Packet2Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m2(from, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Returns {a, a + 1, a + 2, ...}: the lane index (vid) plus the scalar `a`.
+EIGEN_STRONG_INLINE Packet2Xs plset<Packet2Xs>(const numext::int16_t& a) {
+  Packet2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits<Packet2Xs>::size));
+  return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // All lanes zero; the argument is unused.
+EIGEN_STRONG_INLINE Packet2Xs pzero<Packet2Xs>(const Packet2Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m2(0, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a + b.
+EIGEN_STRONG_INLINE Packet2Xs padd<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vadd_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a - b (overloaded vsub intrinsic).
+EIGEN_STRONG_INLINE Packet2Xs psub<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise -a (overloaded vneg intrinsic).
+EIGEN_STRONG_INLINE Packet2Xs pnegate(const Packet2Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Identity: the packet is returned unchanged for this integer type.
+EIGEN_STRONG_INLINE Packet2Xs pconj(const Packet2Xs& a) {
+  return a;
+}
+
+template <>  // Lane-wise a * b (overloaded vmul intrinsic).
+EIGEN_STRONG_INLINE Packet2Xs pmul<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a / b (overloaded vdiv intrinsic on signed 16-bit lanes).
+EIGEN_STRONG_INLINE Packet2Xs pdiv<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a * b + c (vmadd).
+EIGEN_STRONG_INLINE Packet2Xs pmadd(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a * b - c, expressed as vmadd with the negated addend.
+EIGEN_STRONG_INLINE Packet2Xs pmsub(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise c - a * b (vnmsub).
+EIGEN_STRONG_INLINE Packet2Xs pnmadd(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise -(a * b) - c, expressed as vnmsub with the negated addend.
+EIGEN_STRONG_INLINE Packet2Xs pnmsub(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise signed minimum.
+EIGEN_STRONG_INLINE Packet2Xs pmin<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise signed maximum.
+EIGEN_STRONG_INLINE Packet2Xs pmax<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a <= b: 0xffff (all bits set) where true, zero elsewhere.
+EIGEN_STRONG_INLINE Packet2Xs pcmp_le<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a < b: 0xffff (all bits set) where true, zero elsewhere.
+EIGEN_STRONG_INLINE Packet2Xs pcmp_lt<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Lane-wise a == b: 0xffff (all bits set) where true, zero elsewhere.
+EIGEN_STRONG_INLINE Packet2Xs pcmp_eq<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // All bits set in every 16-bit lane; -1 is passed as int16_t so no out-of-range conversion occurs.
+EIGEN_STRONG_INLINE Packet2Xs ptrue<Packet2Xs>(const Packet2Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m2(static_cast<numext::int16_t>(-1), unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Bitwise a & b.
+EIGEN_STRONG_INLINE Packet2Xs pand<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vand_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Bitwise a | b.
+EIGEN_STRONG_INLINE Packet2Xs por<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vor_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Bitwise a ^ b.
+EIGEN_STRONG_INLINE Packet2Xs pxor<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vxor_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Bitwise a & ~b.
+EIGEN_STRONG_INLINE Packet2Xs pandnot<Packet2Xs>(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits<Packet2Xs>::size),
+                               unpacket_traits<Packet2Xs>::size);
+}
+
+template <int N>  // Arithmetic right shift of every lane by the compile-time amount N (vsra).
+EIGEN_STRONG_INLINE Packet2Xs parithmetic_shift_right(Packet2Xs a) {
+  return __riscv_vsra_vx_i16m2(a, N, unpacket_traits<Packet2Xs>::size);
+}
+
+template <int N>  // Logical right shift by N: lanes are reinterpreted as unsigned for vsrl, then cast back.
+EIGEN_STRONG_INLINE Packet2Xs plogical_shift_right(Packet2Xs a) {
+  return __riscv_vreinterpret_i16m2(
+      __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits<Packet2Xs>::size));
+}
+
+template <int N>  // Left shift of every lane by the compile-time amount N (vsll).
+EIGEN_STRONG_INLINE Packet2Xs plogical_shift_left(Packet2Xs a) {
+  return __riscv_vsll_vx_i16m2(a, N, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Unit-stride load of a full packet; differs from ploadu only in the debug hook macro used.
+EIGEN_STRONG_INLINE Packet2Xs pload<Packet2Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Unit-stride load of a full packet; same vle16 instruction as pload, unaligned debug hook.
+EIGEN_STRONG_INLINE Packet2Xs ploadu<Packet2Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Returns {f0, f0, f1, f1, ...}: each of the first size/2 scalars at `from`, twice.
+EIGEN_STRONG_INLINE Packet2Xs ploaddup<Packet2Xs>(const numext::int16_t* from) {
+  constexpr std::size_t half = unpacket_traits<Packet2Xs>::size / 2;  // only size/2 scalars are consumed
+  EIGEN_DEBUG_ALIGNED_LOAD Packet2Xsu data = __riscv_vreinterpret_v_i16m2_u16m2(__riscv_vle16_v_i16m2(from, half));
+  return __riscv_vreinterpret_v_i32m2_i16m2(__riscv_vreinterpret_v_u32m2_i32m2(__riscv_vlmul_trunc_v_u32m4_u32m2(
+      __riscv_vwmaccu_vx_u32m4(__riscv_vwaddu_vv_u32m4(data, data, half), 0xffffu, data, half))));  // x * 0x10001
+}
+
+template <>  // Returns {f0, f0, f0, f0, f1, ...}: each of the first size/4 scalars at `from`, four times.
+EIGEN_STRONG_INLINE Packet2Xs ploadquad<Packet2Xs>(const numext::int16_t* from) {
+  constexpr std::size_t vl = unpacket_traits<Packet2Xs>::size;
+  EIGEN_DEBUG_ALIGNED_LOAD Packet2Xs src = __riscv_vle16_v_i16m2(from, vl / 4);  // touch only the vl/4 inputs
+  return __riscv_vrgather_vv_i16m2(src, __riscv_vsrl_vx_u16m2(__riscv_vid_v_u16m2(vl), 2, vl), vl);  // src[i >> 2]
+}
+
+template <>  // Unit-stride store of a full packet; differs from pstoreu only in the debug hook macro used.
+EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const Packet2Xs& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Unit-stride store of a full packet; same vse16 instruction as pstore, unaligned debug hook.
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const Packet2Xs& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Strided load: `stride` is given in elements and scaled to a byte stride for vlse16.
+EIGEN_DEVICE_FUNC inline Packet2Xs pgather<numext::int16_t, Packet2Xs>(const numext::int16_t* from, Index stride) {
+  return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Strided store: `stride` is given in elements and scaled to a byte stride for vsse16.
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, Packet2Xs>(numext::int16_t* to, const Packet2Xs& from,
+                                                                   Index stride) {
+  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Extracts lane 0 of the packet as a scalar.
+EIGEN_STRONG_INLINE numext::int16_t pfirst<Packet2Xs>(const Packet2Xs& a) {
+  return __riscv_vmv_x_s_i16m2_i16(a);
+}
+
+template <>  // Reverses lane order by gathering with idx[i] = size - 1 - i.
+EIGEN_STRONG_INLINE Packet2Xs preverse(const Packet2Xs& a) {
+  Packet2Xsu idx = __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<Packet2Xs>::size),
+                                          unpacket_traits<Packet2Xs>::size - 1, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Branch-free absolute value computed as (a ^ mask) - mask.
+EIGEN_STRONG_INLINE Packet2Xs pabs(const Packet2Xs& a) {
+  Packet2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits<Packet2Xs>::size);  // sign bit replicated
+  return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits<Packet2Xs>::size), mask,
+                               unpacket_traits<Packet2Xs>::size);
+}
+
+template <>  // Sum of all lanes: vredsum over the i16m2 packet, seeded with 0, result read from lane 0.
+EIGEN_STRONG_INLINE numext::int16_t predux<Packet2Xs>(const Packet2Xs& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet2Xs>::size / 2),
+                                                      unpacket_traits<Packet2Xs>::size));
+}
+
+template <>  // Multiplies the two i16m1 halves lane-wise, then finishes with predux_mul<Packet1Xs>.
+EIGEN_STRONG_INLINE numext::int16_t predux_mul<Packet2Xs>(const Packet2Xs& a) {
+  return predux_mul<Packet1Xs>(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
+                                                     unpacket_traits<Packet1Xs>::size));
+}
+
+template <>  // Minimum over all lanes: vredmin seeded with the largest int16_t value.
+EIGEN_STRONG_INLINE numext::int16_t predux_min<Packet2Xs>(const Packet2Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<Packet2Xs>::size / 2),
+      unpacket_traits<Packet2Xs>::size));
+}
+
+template <>  // Maximum over all lanes: vredmax seeded with the smallest int16_t value.
+EIGEN_STRONG_INLINE numext::int16_t predux_max<Packet2Xs>(const Packet2Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<Packet2Xs>::size / 2),
+      unpacket_traits<Packet2Xs>::size));
+}
+
+template <int N>  // Transposes the N packets of `kernel` in place through a zeroed stack buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<Packet2Xs>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // strided store: lane j of packet i lands at buffer[j * N + i]
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<Packet2Xs>::size);
+  }
+  for (i = 0; i < N; i++) {  // contiguous reload: packet i takes the i-th run of `size` elements
+    kernel.packet[i] =
+        __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits<Packet2Xs>::size], unpacket_traits<Packet2Xs>::size);
+  }
+}
+
+template <typename Packet = Packet4Xs>  // Enabled only when Packet4Xs has a multiple of 8 lanes.
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet4Xs>::value && (unpacket_traits<Packet4Xs>::size % 8) == 0, Packet2Xs>
+    predux_half(const Packet4Xs& a) {
+  return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1),
+                               unpacket_traits<Packet2Xs>::size);  // lower i16m2 half + upper i16m2 half
+}
+
+template <typename Packet = Packet2Xs>  // Enabled only when Packet2Xs has a multiple of 8 lanes.
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet2Xs>::value && (unpacket_traits<Packet2Xs>::size % 8) == 0, Packet1Xs>
+    predux_half(const Packet2Xs& a) {
+  return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
+                               unpacket_traits<Packet1Xs>::size);  // lower i16m1 half + upper i16m1 half
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET2_MATH_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath4.h b/Eigen/src/Core/arch/RVV10/PacketMath4.h
new file mode 100644
index 00000000000..249dadfeece
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMath4.h
@@ -0,0 +1,1462 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva <kseniya.zaytseva@syntacore.com>
+// Copyright (C) 2025 Chip Kerchner <ckerchner@tenstorrent.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET4_MATH_RVV10_H
+#define EIGEN_PACKET4_MATH_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/********************************* Packet4Xi ************************************/
+
+template <>  // Broadcasts the scalar `from` to every 32-bit lane.
+EIGEN_STRONG_INLINE Packet4Xi pset1<Packet4Xi>(const numext::int32_t& from) {
+  return __riscv_vmv_v_x_i32m4(from, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Returns {a, a + 1, a + 2, ...}: the lane index (vid) plus the scalar `a`.
+EIGEN_STRONG_INLINE Packet4Xi plset<Packet4Xi>(const numext::int32_t& a) {
+  Packet4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<Packet4Xi>::size));
+  return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // All lanes zero; the argument is unused.
+EIGEN_STRONG_INLINE Packet4Xi pzero<Packet4Xi>(const Packet4Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m4(0, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a + b.
+EIGEN_STRONG_INLINE Packet4Xi padd<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vadd_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a - b (overloaded vsub intrinsic).
+EIGEN_STRONG_INLINE Packet4Xi psub<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise -a (overloaded vneg intrinsic).
+EIGEN_STRONG_INLINE Packet4Xi pnegate(const Packet4Xi& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Identity: the packet is returned unchanged for this integer type.
+EIGEN_STRONG_INLINE Packet4Xi pconj(const Packet4Xi& a) {
+  return a;
+}
+
+template <>  // Lane-wise a * b (overloaded vmul intrinsic).
+EIGEN_STRONG_INLINE Packet4Xi pmul<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a / b (overloaded vdiv intrinsic on signed 32-bit lanes).
+EIGEN_STRONG_INLINE Packet4Xi pdiv<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a * b + c (vmadd).
+EIGEN_STRONG_INLINE Packet4Xi pmadd(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a * b - c, expressed as vmadd with the negated addend.
+EIGEN_STRONG_INLINE Packet4Xi pmsub(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise c - a * b (vnmsub).
+EIGEN_STRONG_INLINE Packet4Xi pnmadd(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise -(a * b) - c, expressed as vnmsub with the negated addend.
+EIGEN_STRONG_INLINE Packet4Xi pnmsub(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise signed minimum.
+EIGEN_STRONG_INLINE Packet4Xi pmin<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise signed maximum.
+EIGEN_STRONG_INLINE Packet4Xi pmax<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a <= b: 0xffffffff (all bits set) where true, zero elsewhere.
+EIGEN_STRONG_INLINE Packet4Xi pcmp_le<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a < b: 0xffffffff (all bits set) where true, zero elsewhere.
+EIGEN_STRONG_INLINE Packet4Xi pcmp_lt<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Lane-wise a == b: 0xffffffff (all bits set) where true, zero elsewhere.
+EIGEN_STRONG_INLINE Packet4Xi pcmp_eq<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Broadcasts the bit pattern 0xffffffff to every 32-bit lane; the argument is unused.
+EIGEN_STRONG_INLINE Packet4Xi ptrue<Packet4Xi>(const Packet4Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Bitwise a & b.
+EIGEN_STRONG_INLINE Packet4Xi pand<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vand_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Bitwise a | b.
+EIGEN_STRONG_INLINE Packet4Xi por<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vor_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Bitwise a ^ b.
+EIGEN_STRONG_INLINE Packet4Xi pxor<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vxor_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Bitwise a & ~b.
+EIGEN_STRONG_INLINE Packet4Xi pandnot<Packet4Xi>(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits<Packet4Xi>::size),
+                               unpacket_traits<Packet4Xi>::size);
+}
+
+template <int N>  // Arithmetic right shift of every lane by the compile-time amount N (vsra).
+EIGEN_STRONG_INLINE Packet4Xi parithmetic_shift_right(Packet4Xi a) {
+  return __riscv_vsra_vx_i32m4(a, N, unpacket_traits<Packet4Xi>::size);
+}
+
+template <int N>  // Logical right shift by N: lanes are reinterpreted as unsigned for vsrl, then cast back.
+EIGEN_STRONG_INLINE Packet4Xi plogical_shift_right(Packet4Xi a) {
+  return __riscv_vreinterpret_i32m4(
+      __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits<Packet4Xi>::size));
+}
+
+template <int N>  // Left shift of every lane by the compile-time amount N (vsll).
+EIGEN_STRONG_INLINE Packet4Xi plogical_shift_left(Packet4Xi a) {
+  return __riscv_vsll_vx_i32m4(a, N, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Unit-stride load of a full packet; differs from ploadu only in the debug hook macro used.
+EIGEN_STRONG_INLINE Packet4Xi pload<Packet4Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Unit-stride load of a full packet; same vle32 instruction as pload, unaligned debug hook.
+EIGEN_STRONG_INLINE Packet4Xi ploadu<Packet4Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Returns {f0, f0, f1, f1, ...}: each of the first size/2 scalars at `from`, twice.
+EIGEN_STRONG_INLINE Packet4Xi ploaddup<Packet4Xi>(const numext::int32_t* from) {
+  constexpr std::size_t half = unpacket_traits<Packet4Xi>::size / 2;  // only size/2 scalars are consumed
+  EIGEN_DEBUG_ALIGNED_LOAD Packet4Xu data = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vle32_v_i32m4(from, half));
+  return __riscv_vreinterpret_v_i64m4_i32m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vlmul_trunc_v_u64m8_u64m4(
+      __riscv_vwmaccu_vx_u64m8(__riscv_vwaddu_vv_u64m8(data, data, half), 0xffffffffu, data, half))));  // x * (2^32+1)
+}
+
+template <>  // Returns {f0, f0, f0, f0, f1, ...}: each of the first size/4 scalars at `from`, four times.
+EIGEN_STRONG_INLINE Packet4Xi ploadquad<Packet4Xi>(const numext::int32_t* from) {
+  constexpr std::size_t vl = unpacket_traits<Packet4Xi>::size;
+  EIGEN_DEBUG_ALIGNED_LOAD Packet4Xi src = __riscv_vle32_v_i32m4(from, vl / 4);  // touch only the vl/4 inputs
+  return __riscv_vrgather_vv_i32m4(src, __riscv_vsrl_vx_u32m4(__riscv_vid_v_u32m4(vl), 2, vl), vl);  // src[i >> 2]
+}
+
+template <>  // Unit-stride store of a full packet; differs from pstoreu only in the debug hook macro used.
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const Packet4Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Unit-stride store of a full packet; same vse32 instruction as pstore, unaligned debug hook.
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const Packet4Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Strided load: `stride` is given in elements and scaled to a byte stride for vlse32.
+EIGEN_DEVICE_FUNC inline Packet4Xi pgather<numext::int32_t, Packet4Xi>(const numext::int32_t* from, Index stride) {
+  return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Strided store: `stride` is given in elements and scaled to a byte stride for vsse32.
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, Packet4Xi>(numext::int32_t* to, const Packet4Xi& from,
+                                                                   Index stride) {
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Extracts lane 0 of the packet as a scalar.
+EIGEN_STRONG_INLINE numext::int32_t pfirst<Packet4Xi>(const Packet4Xi& a) {
+  return __riscv_vmv_x_s_i32m4_i32(a);
+}
+
+template <>  // Reverses lane order by gathering with idx[i] = size - 1 - i.
+EIGEN_STRONG_INLINE Packet4Xi preverse(const Packet4Xi& a) {
+  Packet4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits<Packet4Xi>::size),
+                                         unpacket_traits<Packet4Xi>::size - 1, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Branch-free absolute value computed as (a ^ mask) - mask.
+EIGEN_STRONG_INLINE Packet4Xi pabs(const Packet4Xi& a) {
+  Packet4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits<Packet4Xi>::size);  // sign bit replicated
+  return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits<Packet4Xi>::size), mask,
+                               unpacket_traits<Packet4Xi>::size);
+}
+
+template <>  // Sum of all lanes: vredsum over the i32m4 packet, seeded with 0, result read from lane 0.
+EIGEN_STRONG_INLINE numext::int32_t predux<Packet4Xi>(const Packet4Xi& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet4Xi>::size / 4),
+                                                      unpacket_traits<Packet4Xi>::size));
+}
+
+template <>  // Product of all lanes: the four i32m1 quarters are multiplied pairwise, then reduced.
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<Packet4Xi>(const Packet4Xi& a) {
+  Packet1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1),
+                                          unpacket_traits<Packet1Xi>::size);  // quarter 0 * quarter 1
+  Packet1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3),
+                                          unpacket_traits<Packet1Xi>::size);  // quarter 2 * quarter 3
+  return predux_mul<Packet1Xi>(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits<Packet1Xi>::size));
+}
+
+template <>  // Minimum over all lanes: vredmin seeded with the largest int32_t value.
+EIGEN_STRONG_INLINE numext::int32_t predux_min<Packet4Xi>(const Packet4Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<Packet4Xi>::size / 4),
+      unpacket_traits<Packet4Xi>::size));
+}
+
+template <>  // Maximum over all lanes: vredmax seeded with the smallest int32_t value.
+EIGEN_STRONG_INLINE numext::int32_t predux_max<Packet4Xi>(const Packet4Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<Packet4Xi>::size / 4),
+      unpacket_traits<Packet4Xi>::size));
+}
+
+template <int N>  // Transposes the N packets of `kernel` in place through a zeroed stack buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<Packet4Xi>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // strided store: lane j of packet i lands at buffer[j * N + i]
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<Packet4Xi>::size);
+  }
+  for (i = 0; i < N; i++) {  // contiguous reload: packet i takes the i-th run of `size` elements
+    kernel.packet[i] =
+        __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits<Packet4Xi>::size], unpacket_traits<Packet4Xi>::size);
+  }
+}
+
+/********************************* Packet4Xf ************************************/
+
+template <>  // Every lane carries the bit pattern 0xffffffff, built as u32 and reinterpreted as float.
+EIGEN_STRONG_INLINE Packet4Xf ptrue<Packet4Xf>(const Packet4Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits<Packet4Xf>::size));
+}
+
+template <>  // All lanes 0.0f; the argument is unused.
+EIGEN_STRONG_INLINE Packet4Xf pzero<Packet4Xf>(const Packet4Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>  // Lane-wise absolute value (vfabs).
+EIGEN_STRONG_INLINE Packet4Xf pabs(const Packet4Xf& a) {
+  return __riscv_vfabs_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>  // Lane-wise |a - b|.
+EIGEN_STRONG_INLINE Packet4Xf pabsdiff(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfabs_v_f32m4(__riscv_vfsub_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size),
+                               unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pset1<Packet4Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m4(from, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pset1frombits<Packet4Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits<Packet4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf plset<Packet4Xf>(const float& a) {
+  // Returns {a, a+1, a+2, ...}: lane indices from vid, converted to float, then offset by the scalar base `a`.
+  constexpr auto vl = unpacket_traits<Packet4Xf>::size;  // every length comes from the packet being built
+  Packet4Xf idx = __riscv_vfcvt_f_x_v_f32m4(__riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(vl)), vl);
+  return __riscv_vfadd_vf_f32m4(idx, a, vl);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4Xf>(const float* a, Packet4Xf& a0, Packet4Xf& a1, Packet4Xf& a2,
+                                                Packet4Xf& a3) {
+  vfloat32m4_t aa = __riscv_vle32_v_f32m4(a, 4);
+  a0 = __riscv_vrgather_vx_f32m4(aa, 0, unpacket_traits<Packet4Xf>::size);
+  a1 = __riscv_vrgather_vx_f32m4(aa, 1, unpacket_traits<Packet4Xf>::size);
+  a2 = __riscv_vrgather_vx_f32m4(aa, 2, unpacket_traits<Packet4Xf>::size);
+  a3 = __riscv_vrgather_vx_f32m4(aa, 3, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf padd<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf psub<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pnegate(const Packet4Xf& a) {
+  return __riscv_vfneg_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf psignbit(const Packet4Xf& a) {
+  return __riscv_vreinterpret_v_i32m4_f32m4(
+      __riscv_vsra_vx_i32m4(__riscv_vreinterpret_v_f32m4_i32m4(a), 31, unpacket_traits<Packet4Xi>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pconj(const Packet4Xf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pmul<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pdiv<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pmadd(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pmsub(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pnmadd(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pnmsub(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>  // Lane-wise minimum; a lane is NaN in the result whenever it is NaN in `a` or in `b`.
+EIGEN_STRONG_INLINE Packet4Xf pmin<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  Packet4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<Packet4Xf>::size);  // a == a: lane of a is not NaN
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<Packet4Xf>::size);  // b == b: lane of b is not NaN
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<Packet4Xf>::size);  // both lanes hold numbers
+
+  return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xf>::size);  // masked-off lanes keep `nans`
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pmin<PropagateNaN, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return pmin<Packet4Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pmin<PropagateNumbers, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>  // Lane-wise maximum; a lane is NaN in the result whenever it is NaN in `a` or in `b`.
+EIGEN_STRONG_INLINE Packet4Xf pmax<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  Packet4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<Packet4Xf>::size);  // a == a: lane of a is not NaN
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<Packet4Xf>::size);  // b == b: lane of b is not NaN
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<Packet4Xf>::size);  // both lanes hold numbers
+
+  return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xf>::size);  // masked-off lanes keep `nans`
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pmax<PropagateNaN, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return pmax<Packet4Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pmax<PropagateNumbers, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pcmp_le<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero<Packet4Xf>(a), ptrue<Packet4Xf>(a), mask, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pcmp_lt<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero<Packet4Xf>(a), ptrue<Packet4Xf>(a), mask, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pcmp_eq<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero<Packet4Xf>(a), ptrue<Packet4Xf>(a), mask, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>  // All-ones lanes where a >= b does not hold (a < b, or the compare fails as it does for NaN); zero elsewhere.
+EIGEN_STRONG_INLINE Packet4Xf pcmp_lt_or_nan<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);  // the complementary test: a >= b
+  return __riscv_vfmerge_vfm_f32m4(ptrue<Packet4Xf>(a), 0.0f, mask, unpacket_traits<Packet4Xf>::size);  // mask ? 0.0f : all-ones
+}
+
+EIGEN_STRONG_INLINE Packet4Xf pselect(const PacketMask8& mask, const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vmerge_vvm_f32m4(b, a, mask, unpacket_traits<Packet4Xf>::size);
+}
+
+EIGEN_STRONG_INLINE Packet4Xf pselect(const Packet4Xf& mask, const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask2 =
+      __riscv_vmsne_vx_i32m4_b8(__riscv_vreinterpret_v_f32m4_i32m4(mask), 0, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(b, a, mask2, unpacket_traits<Packet4Xf>::size);
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pand<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(
+      __riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<Packet4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf por<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(
+      __riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<Packet4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pxor<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(
+      __riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<Packet4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pandnot<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(
+      __riscv_vreinterpret_v_f32m4_u32m4(a),
+      __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<Packet4Xf>::size),
+      unpacket_traits<Packet4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pload<Packet4Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf ploadu<Packet4Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>  // Produces {from[0], from[0], from[1], from[1], ...} by widening arithmetic instead of a gather.
+EIGEN_STRONG_INLINE Packet4Xf ploaddup<Packet4Xf>(const float* from) {
+  Packet4Xu data = __riscv_vreinterpret_v_f32m4_u32m4(pload<Packet4Xf>(from));  // NOTE(review): full-packet load, only the low half is used below -- confirm the extra read is acceptable
+  return __riscv_vreinterpret_v_i32m4_f32m4(  // view each 64-bit lane as two 32-bit lanes again
+      __riscv_vreinterpret_v_i64m4_i32m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vlmul_trunc_v_u64m8_u64m4(  // keep the low half of the widened group
+          __riscv_vwmaccu_vx_u64m8(__riscv_vwaddu_vv_u64m8(data, data, unpacket_traits<Packet4Xi>::size), 0xffffffffu,  // (d + d) + 0xffffffff * d
+                                   data, unpacket_traits<Packet4Xi>::size)))));  // == d * (2^32 + 1): d sits in both 32-bit halves
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf ploadquad<Packet4Xf>(const float* from) {
+  Packet4Xu idx =
+      __riscv_vsrl_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits<Packet4Xf>::size), 2, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vrgather_vv_f32m4(pload<Packet4Xf>(from), idx, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4Xf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4Xf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4Xf pgather<float, Packet4Xf>(const float* from, Index stride) {
+  return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4Xf>(float* to, const Packet4Xf& from, Index stride) {
+  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4Xf>(const Packet4Xf& a) {
+  return __riscv_vfmv_f_s_f32m4_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf psqrt(const Packet4Xf& a) {
+  return __riscv_vfsqrt_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>  // Rounds each lane to an integral value via a float -> int32 -> float round trip.
+EIGEN_STRONG_INLINE Packet4Xf print<Packet4Xf>(const Packet4Xf& a) {
+  const Packet4Xf limit = pset1<Packet4Xf>(static_cast<float>(1 << 23));  // 2^23
+  const Packet4Xf abs_a = pabs(a);
+
+  PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits<Packet4Xf>::size);  // a != a: NaN lanes
+  const Packet4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits<Packet4Xf>::size);  // a + a in NaN lanes, a elsewhere
+  const Packet4Xf new_x = __riscv_vfcvt_f_x_v_f32m4(__riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits<Packet4Xf>::size),  // to int32 ...
+                                                    unpacket_traits<Packet4Xf>::size);  // ... and back to float
+
+  mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits<Packet4Xf>::size);  // |a| < 2^23
+  Packet4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits<Packet4Xf>::size);  // magnitude of new_x, sign of x
+  return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits<Packet4Xf>::size);  // rounded value where |a| < 2^23, else x
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pfloor<Packet4Xf>(const Packet4Xf& a) {
+  Packet4Xf tmp = print<Packet4Xf>(a);
+  // If greater, subtract one.
+  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf preverse(const Packet4Xf& a) {
+  Packet4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits<Packet4Xf>::size),
+                                         unpacket_traits<Packet4Xf>::size - 1, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pfrexp<Packet4Xf>(const Packet4Xf& a, Packet4Xf& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4Xf>(const Packet4Xf& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1(
+      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<Packet4Xf>::size / 4), unpacket_traits<Packet4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4Xf>(const Packet4Xf& a) {
+  Packet1Xf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1),
+                                           unpacket_traits<Packet1Xf>::size);
+  Packet1Xf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3),
+                                           unpacket_traits<Packet1Xf>::size);
+  return predux_mul<Packet1Xf>(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4Xf>(const Packet4Xf& a) {
+  return (std::min)(
+      __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1(
+          a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet4Xf>::size / 4),
+          unpacket_traits<Packet4Xf>::size)),
+      (std::numeric_limits<float>::max)());
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4Xf>(const Packet4Xf& a) {
+  return (std::max)(
+      __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1(
+          a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet4Xf>::size / 4),
+          unpacket_traits<Packet4Xf>::size)),
+      -(std::numeric_limits<float>::max)());
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xf, N>& kernel) {
+  float buffer[unpacket_traits<Packet4Xf>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<Packet4Xf>::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits<Packet4Xf>::size], unpacket_traits<Packet4Xf>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pldexp<Packet4Xf>(const Packet4Xf& a, const Packet4Xf& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+/********************************* Packet4Xl ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pset1<Packet4Xl>(const numext::int64_t& from) {
+  return __riscv_vmv_v_x_i64m4(from, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl plset<Packet4Xl>(const numext::int64_t& a) {
+  Packet4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xl>::size));
+  return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pzero<Packet4Xl>(const Packet4Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m4(0, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl padd<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vadd_vv_i64m4(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl psub<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pnegate(const Packet4Xl& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pconj(const Packet4Xl& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pmul<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pdiv<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pmadd(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pmsub(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pnmadd(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) {
+  return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pnmsub(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) {
+  return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pmin<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pmax<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pcmp_le<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pcmp_lt<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pcmp_eq<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl ptrue<Packet4Xl>(const Packet4Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pand<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vand_vv_i64m4(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl por<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vor_vv_i64m4(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pxor<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vxor_vv_i64m4(a, b, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pandnot<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits<Packet4Xl>::size),
+                               unpacket_traits<Packet4Xl>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4Xl parithmetic_shift_right(Packet4Xl a) {
+  return __riscv_vsra_vx_i64m4(a, N, unpacket_traits<Packet4Xl>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4Xl plogical_shift_right(Packet4Xl a) {
+  return __riscv_vreinterpret_i64m4(
+      __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits<Packet4Xl>::size));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4Xl plogical_shift_left(Packet4Xl a) {
+  return __riscv_vsll_vx_i64m4(a, N, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pload<Packet4Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl ploadu<Packet4Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl ploaddup<Packet4Xl>(const numext::int64_t* from) {
+  Packet4Xul idx =
+      __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xl>::size), 1, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vrgather_vv_i64m4(pload<Packet4Xl>(from), idx, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl ploadquad<Packet4Xl>(const numext::int64_t* from) {
+  Packet4Xul idx =
+      __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xl>::size), 2, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vrgather_vv_i64m4(pload<Packet4Xl>(from), idx, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const Packet4Xl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const Packet4Xl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4Xl pgather<numext::int64_t, Packet4Xl>(const numext::int64_t* from, Index stride) {
+  return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, Packet4Xl>(numext::int64_t* to, const Packet4Xl& from,
+                                                                   Index stride) {
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t pfirst<Packet4Xl>(const Packet4Xl& a) {
+  return __riscv_vmv_x_s_i64m4_i64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl preverse(const Packet4Xl& a) {
+  Packet4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xl>::size),
+                                          unpacket_traits<Packet4Xl>::size - 1, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pabs(const Packet4Xl& a) {
+  Packet4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits<Packet4Xl>::size), mask,
+                               unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux<Packet4Xl>(const Packet4Xl& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet4Xl>::size / 4),
+                                                      unpacket_traits<Packet4Xl>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_mul<Packet4Xl>(const Packet4Xl& a) {
+  Packet1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1),
+                                          unpacket_traits<Packet1Xl>::size);
+  Packet1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3),
+                                          unpacket_traits<Packet1Xl>::size);
+  return predux_mul<Packet1Xl>(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits<Packet1Xl>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_min<Packet4Xl>(const Packet4Xl& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1(
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<Packet4Xl>::size / 4),
+      unpacket_traits<Packet4Xl>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_max<Packet4Xl>(const Packet4Xl& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1(
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<Packet4Xl>::size / 4),
+      unpacket_traits<Packet4Xl>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xl, N>& kernel) {
+  numext::int64_t buffer[unpacket_traits<Packet4Xl>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<Packet4Xl>::size);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits<Packet4Xl>::size], unpacket_traits<Packet4Xl>::size);
+  }
+}
+
+/********************************* Packet4Xd ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd ptrue<Packet4Xd>(const Packet4Xd& /*a*/) {
+  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits<Packet4Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pzero<Packet4Xd>(const Packet4Xd& /*a*/) {
+  return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pabs(const Packet4Xd& a) {
+  return __riscv_vfabs_v_f64m4(a, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pabsdiff(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfabs_v_f64m4(__riscv_vfsub_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size),
+                               unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pset1<Packet4Xd>(const double& from) {
+  return __riscv_vfmv_v_f_f64m4(from, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pset1frombits<Packet4Xd>(numext::uint64_t from) {
+  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits<Packet4Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd plset<Packet4Xd>(const double& a) {
+  // Returns {a, a+1, a+2, ...}. vl must be this packet's 64-bit lane count, not Packet4Xi's 32-bit one.
+  constexpr auto vl = unpacket_traits<Packet4Xd>::size;
+  Packet4Xd idx = __riscv_vfcvt_f_x_v_f64m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(vl)), vl);
+  return __riscv_vfadd_vf_f64m4(idx, a, vl);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4Xd>(const double* a, Packet4Xd& a0, Packet4Xd& a1, Packet4Xd& a2,
+                                                Packet4Xd& a3) {
+  vfloat64m4_t aa = __riscv_vle64_v_f64m4(a, 4);
+  a0 = __riscv_vrgather_vx_f64m4(aa, 0, unpacket_traits<Packet4Xd>::size);
+  a1 = __riscv_vrgather_vx_f64m4(aa, 1, unpacket_traits<Packet4Xd>::size);
+  a2 = __riscv_vrgather_vx_f64m4(aa, 2, unpacket_traits<Packet4Xd>::size);
+  a3 = __riscv_vrgather_vx_f64m4(aa, 3, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd padd<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd psub<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pnegate(const Packet4Xd& a) {
+  return __riscv_vfneg_v_f64m4(a, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd psignbit(const Packet4Xd& a) {
+  return __riscv_vreinterpret_v_i64m4_f64m4(
+      __riscv_vsra_vx_i64m4(__riscv_vreinterpret_v_f64m4_i64m4(a), 63, unpacket_traits<Packet4Xl>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pconj(const Packet4Xd& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmul<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pdiv<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmadd(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmsub(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pnmadd(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pnmsub(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmin<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  Packet4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<Packet4Xd>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<Packet4Xd>::size);
+
+  return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmin<PropagateNaN, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return pmin<Packet4Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmin<PropagateNumbers, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmax<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  Packet4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<Packet4Xd>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<Packet4Xd>::size);
+
+  return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmax<PropagateNaN, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return pmax<Packet4Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pmax<PropagateNumbers, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pcmp_le<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<Packet4Xd>(a), ptrue<Packet4Xd>(a), mask, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pcmp_lt<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<Packet4Xd>(a), ptrue<Packet4Xd>(a), mask, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pcmp_eq<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<Packet4Xd>(a), ptrue<Packet4Xd>(a), mask, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>  // All-ones lanes where a >= b does not hold (a < b, or the compare fails as it does for NaN); zero elsewhere.
+EIGEN_STRONG_INLINE Packet4Xd pcmp_lt_or_nan<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);  // the complementary test: a >= b
+  return __riscv_vfmerge_vfm_f64m4(ptrue<Packet4Xd>(a), 0.0, mask, unpacket_traits<Packet4Xd>::size);  // mask ? 0.0 : all-ones
+}
+
+EIGEN_STRONG_INLINE Packet4Xd pselect(const PacketMask16& mask, const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vmerge_vvm_f64m4(b, a, mask, unpacket_traits<Packet4Xd>::size);  // per lane: mask ? a : b
+}
+
+EIGEN_STRONG_INLINE Packet4Xd pselect(const Packet4Xd& mask, const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask2 =  // a lane counts as "set" when its 64-bit pattern, read as an integer, is non-zero
+      __riscv_vmsne_vx_i64m4_b16(__riscv_vreinterpret_v_f64m4_i64m4(mask), 0, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(b, a, mask2, unpacket_traits<Packet4Xd>::size);  // per lane: mask2 ? a : b
+}
+
+// Logical Operations are not supported for double, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pand<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(
+      __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<Packet4Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd por<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(
+      __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<Packet4Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pxor<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(
+      __riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<Packet4Xd>::size));
+}
+
+template <>  // Bitwise a & ~b, carried out on the lanes reinterpreted as unsigned 64-bit integers.
+EIGEN_STRONG_INLINE Packet4Xd pandnot<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(
+      __riscv_vreinterpret_v_f64m4_u64m4(a),
+      __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<Packet4Xd>::size),
+      unpacket_traits<Packet4Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pload<Packet4Xd>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd ploadu<Packet4Xd>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>  // {from[0], from[0], from[1], from[1], ...}: only the first size/2 scalars of `from` are read.
+EIGEN_STRONG_INLINE Packet4Xd ploaddup<Packet4Xd>(const double* from) {
+  const std::size_t n = unpacket_traits<Packet4Xd>::size;  // load uses vl = n/2; unloaded lanes are never gathered
+  Packet4Xul idx = __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(n), 1, n);  // lane i gathers element i / 2
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vrgather_vv_f64m4(__riscv_vle64_v_f64m4(from, n / 2), idx, n);
+}
+
+template <>  // {from[0] x4, from[1] x4, ...}: only the first size/4 scalars of `from` are read.
+EIGEN_STRONG_INLINE Packet4Xd ploadquad<Packet4Xd>(const double* from) {
+  const std::size_t n = unpacket_traits<Packet4Xd>::size;  // load uses vl = n/4; unloaded lanes are never gathered
+  Packet4Xul idx = __riscv_vsrl_vx_u64m4(__riscv_vid_v_u64m4(n), 2, n);  // lane i gathers element i / 4
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vrgather_vv_f64m4(__riscv_vle64_v_f64m4(from, n / 4), idx, n);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>  // Strided load: consecutive lanes are read `stride` doubles apart (the intrinsic takes a byte stride).
+EIGEN_DEVICE_FUNC inline Packet4Xd pgather<double, Packet4Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits<Packet4Xd>::size);
+}
+
+template <>  // Strided store: consecutive lanes are written `stride` doubles apart (byte stride passed below).
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4Xd>(double* to, const Packet4Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet4Xd>(const Packet4Xd& a) {
+  return __riscv_vfmv_f_s_f64m4_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd psqrt(const Packet4Xd& a) {
+  return __riscv_vfsqrt_v_f64m4(a, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>  // Rounds each lane to an integral value through a double -> int64 -> double round trip.
+EIGEN_STRONG_INLINE Packet4Xd print<Packet4Xd>(const Packet4Xd& a) {
+  const Packet4Xd limit = pset1<Packet4Xd>(static_cast<double>(1ull << 52));  // 2^52
+  const Packet4Xd abs_a = pabs(a);
+
+  PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits<Packet4Xd>::size);  // lanes with a != a (NaN)
+  const Packet4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits<Packet4Xd>::size);  // a + a there, else a
+  const Packet4Xd new_x = __riscv_vfcvt_f_x_v_f64m4(__riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits<Packet4Xd>::size),
+                                                    unpacket_traits<Packet4Xd>::size);
+
+  mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits<Packet4Xd>::size);  // only |a| < 2^52 is replaced
+  Packet4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits<Packet4Xd>::size);  // new_x with x's sign
+  return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>  // Floor built on print(): round first, then step down by one wherever the rounded value exceeds a.
+EIGEN_STRONG_INLINE Packet4Xd pfloor<Packet4Xd>(const Packet4Xd& a) {
+  Packet4Xd tmp = print<Packet4Xd>(a);
+  // If greater, subtract one.
+  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits<Packet4Xd>::size);  // unmasked lanes keep tmp
+}
+
+template <>  // Reverses lane order: lane i gathers from index (size - 1) - i.
+EIGEN_STRONG_INLINE Packet4Xd preverse(const Packet4Xd& a) {
+  Packet4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xd>::size),
+                                          unpacket_traits<Packet4Xd>::size - 1, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pfrexp<Packet4Xd>(const Packet4Xd& a, Packet4Xd& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>  // Horizontal sum of all lanes; the reduction is seeded with 0.0 in an LMUL=1 vector.
+EIGEN_STRONG_INLINE double predux<Packet4Xd>(const Packet4Xd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet4Xd>::size / 4), unpacket_traits<Packet4Xd>::size));
+}
+
+template <>  // Horizontal product: multiply the four LMUL=1 quarters lane-wise, then reduce the single quarter left.
+EIGEN_STRONG_INLINE double predux_mul<Packet4Xd>(const Packet4Xd& a) {
+  Packet1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1),
+                                           unpacket_traits<Packet1Xd>::size);  // quarter 0 * quarter 1
+  Packet1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3),
+                                           unpacket_traits<Packet1Xd>::size);  // quarter 2 * quarter 3
+  return predux_mul<Packet1Xd>(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits<Packet1Xd>::size));
+}
+
+// Horizontal minimum of all lanes. The reduction is seeded with a quiet NaN; per the RVV vfredmin rules a NaN
+// operand loses against any number, so the seed can only surface when every lane is NaN. The result is
+// returned as-is: clamping it to numeric_limits::max() would turn an all-+inf packet into DBL_MAX.
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet4Xd>(const Packet4Xd& a) {
+  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1(
+      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet4Xd>::size / 4),
+      unpacket_traits<Packet4Xd>::size));
+}
+
+// Horizontal maximum of all lanes. The reduction is seeded with a quiet NaN; per the RVV vfredmax rules a NaN
+// operand loses against any number, so the seed can only surface when every lane is NaN. The result is
+// returned as-is: clamping it to -numeric_limits::max() would turn an all--inf packet into -DBL_MAX.
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet4Xd>(const Packet4Xd& a) {
+  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1(
+      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet4Xd>::size / 4),
+      unpacket_traits<Packet4Xd>::size));
+}
+
+template <int N>  // Transposes the N packets of `kernel` in place via a stack buffer of size * N doubles.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xd, N>& kernel) {
+  double buffer[unpacket_traits<Packet4Xd>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // strided store: lane j of packet i lands at buffer[j * N + i]
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<Packet4Xd>::size);
+  }
+
+  for (i = 0; i < N; i++) {  // packet i is reloaded from the i-th contiguous run of `size` doubles
+    kernel.packet[i] =
+        __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits<Packet4Xd>::size], unpacket_traits<Packet4Xd>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pldexp<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+/********************************* Packet4Xs ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pset1<Packet4Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m4(from, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Linear ramp {a, a + 1, a + 2, ...}: the lane index (vid) plus the scalar a.
+EIGEN_STRONG_INLINE Packet4Xs plset<Packet4Xs>(const numext::int16_t& a) {
+  Packet4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits<Packet4Xs>::size));
+  return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pzero<Packet4Xs>(const Packet4Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m4(0, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs padd<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vadd_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs psub<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pnegate(const Packet4Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pconj(const Packet4Xs& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pmul<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pdiv<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Integer multiply-add; with the RVV vmadd operand order this presumably yields a * b + c - confirm.
+EIGEN_STRONG_INLINE Packet4Xs pmadd(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Same vmadd as pmadd, but fed the negated c (presumably a * b - c - confirm operand order).
+EIGEN_STRONG_INLINE Packet4Xs pmsub(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Uses vnmsub on (a, b, c); presumably c - a * b per the RVV vnmsub definition - confirm.
+EIGEN_STRONG_INLINE Packet4Xs pnmadd(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Same vnmsub as pnmadd, but fed the negated c (presumably -(a * b) - c - confirm).
+EIGEN_STRONG_INLINE Packet4Xs pnmsub(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pmin<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pmax<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Lane-wise signed a <= b: matching lanes become 0xffff (all bits set), the others stay zero.
+EIGEN_STRONG_INLINE Packet4Xs pcmp_le<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pcmp_lt<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pcmp_eq<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<short>(0xffff), mask, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Broadcasts the 16-bit pattern 0xffff to every lane; the argument is unused.
+EIGEN_STRONG_INLINE Packet4Xs ptrue<Packet4Xs>(const Packet4Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m4(static_cast<unsigned short>(0xffffu), unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pand<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vand_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs por<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vor_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pxor<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vxor_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pandnot<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits<Packet4Xs>::size),
+                               unpacket_traits<Packet4Xs>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4Xs parithmetic_shift_right(Packet4Xs a) {
+  return __riscv_vsra_vx_i16m4(a, N, unpacket_traits<Packet4Xs>::size);
+}
+
+template <int N>  // Shift right by N bits on the lanes reinterpreted as unsigned (vsrl), then cast back to signed.
+EIGEN_STRONG_INLINE Packet4Xs plogical_shift_right(Packet4Xs a) {
+  return __riscv_vreinterpret_i16m4(
+      __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits<Packet4Xs>::size));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4Xs plogical_shift_left(Packet4Xs a) {
+  return __riscv_vsll_vx_i16m4(a, N, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pload<Packet4Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs ploadu<Packet4Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // {from[0], from[0], from[1], from[1], ...}: only the first size/2 scalars of `from` are read.
+EIGEN_STRONG_INLINE Packet4Xs ploaddup<Packet4Xs>(const numext::int16_t* from) {
+  const std::size_t half = unpacket_traits<Packet4Xs>::size / 2;  // size/2 inputs fill the truncated 32-bit result
+  EIGEN_DEBUG_ALIGNED_LOAD Packet4Xsu data = __riscv_vreinterpret_v_i16m4_u16m4(__riscv_vle16_v_i16m4(from, half));
+  return __riscv_vreinterpret_v_i32m4_i16m4(__riscv_vreinterpret_v_u32m4_i32m4(__riscv_vlmul_trunc_v_u32m8_u32m4(
+      __riscv_vwmaccu_vx_u32m8(__riscv_vwaddu_vv_u32m8(data, data, half), 0xffffu, data, half))));  // 0x10001 * x
+}
+
+template <>  // {from[0] x4, from[1] x4, ...}: only the first size/4 scalars of `from` are read.
+EIGEN_STRONG_INLINE Packet4Xs ploadquad<Packet4Xs>(const numext::int16_t* from) {
+  const std::size_t n = unpacket_traits<Packet4Xs>::size;  // load uses vl = n/4; unloaded lanes are never gathered
+  Packet4Xsu idx = __riscv_vsrl_vx_u16m4(__riscv_vid_v_u16m4(n), 2, n);  // lane i gathers element i / 4
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vrgather_vv_i16m4(__riscv_vle16_v_i16m4(from, n / 4), idx, n);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const Packet4Xs& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const Packet4Xs& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4Xs pgather<numext::int16_t, Packet4Xs>(const numext::int16_t* from, Index stride) {
+  return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, Packet4Xs>(numext::int16_t* to, const Packet4Xs& from,
+                                                                   Index stride) {
+  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int16_t pfirst<Packet4Xs>(const Packet4Xs& a) {
+  return __riscv_vmv_x_s_i16m4_i16(a);
+}
+
+template <>  // Reverses lane order: lane i gathers from index (size - 1) - i.
+EIGEN_STRONG_INLINE Packet4Xs preverse(const Packet4Xs& a) {
+  Packet4Xsu idx = __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits<Packet4Xs>::size),
+                                          unpacket_traits<Packet4Xs>::size - 1, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Branch-free absolute value computed as (a ^ mask) - mask.
+EIGEN_STRONG_INLINE Packet4Xs pabs(const Packet4Xs& a) {
+  Packet4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits<Packet4Xs>::size);  // arithmetic shift by 15 bits
+  return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits<Packet4Xs>::size), mask,
+                               unpacket_traits<Packet4Xs>::size);
+}
+
+template <>  // Horizontal sum of all lanes; the reduction is seeded with 0 in an LMUL=1 vector.
+EIGEN_STRONG_INLINE numext::int16_t predux<Packet4Xs>(const Packet4Xs& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet4Xs>::size / 4),
+                                                      unpacket_traits<Packet4Xs>::size));
+}
+
+template <>  // Horizontal product: multiply the four LMUL=1 quarters lane-wise, then reduce the single quarter left.
+EIGEN_STRONG_INLINE numext::int16_t predux_mul<Packet4Xs>(const Packet4Xs& a) {
+  Packet1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1),
+                                          unpacket_traits<Packet1Xs>::size);  // quarter 0 * quarter 1
+  Packet1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3),
+                                          unpacket_traits<Packet1Xs>::size);  // quarter 2 * quarter 3
+  return predux_mul<Packet1Xs>(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits<Packet1Xs>::size));
+}
+
+template <>  // Horizontal signed minimum; seeded with the largest int16_t so the seed cannot win over real data.
+EIGEN_STRONG_INLINE numext::int16_t predux_min<Packet4Xs>(const Packet4Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<Packet4Xs>::size / 4),
+      unpacket_traits<Packet4Xs>::size));
+}
+
+template <>  // Horizontal signed maximum; seeded with the smallest int16_t so the seed cannot win over real data.
+EIGEN_STRONG_INLINE numext::int16_t predux_max<Packet4Xs>(const Packet4Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<Packet4Xs>::size / 4),
+      unpacket_traits<Packet4Xs>::size));
+}
+
+template <int N>  // Transposes the N packets of `kernel` in place via a zero-initialised stack buffer.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<Packet4Xs>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // strided store: lane j of packet i lands at buffer[j * N + i]
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<Packet4Xs>::size);
+  }
+  for (i = 0; i < N; i++) {  // packet i is reloaded from the i-th contiguous run of `size` values
+    kernel.packet[i] =
+        __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits<Packet4Xs>::size], unpacket_traits<Packet4Xs>::size);
+  }
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET4_MATH_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMathBF16.h b/Eigen/src/Core/arch/RVV10/PacketMathBF16.h
new file mode 100644
index 00000000000..8e15b21a47a
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMathBF16.h
@@ -0,0 +1,838 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Chip Kerchner <ckerchner@tenstorrent.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_BF16_RVV10_H
+#define EIGEN_PACKET_MATH_BF16_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+typedef eigen_packet_wrapper<vbfloat16m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 26> Packet1Xbf;
+typedef eigen_packet_wrapper<vbfloat16m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 27>
+    Packet2Xbf;
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xbf PacketXbf;
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet1Xbf type;
+  typedef Packet1Xbf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<bfloat16, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+    HasSign = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 0,
+
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 1,
+    HasTanh = 0,
+    HasErf = 0
+  };
+};
+
+#else
+typedef Packet2Xbf PacketXbf;
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet2Xbf type;
+  typedef Packet1Xbf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<bfloat16, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+    HasSign = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 0,
+
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 1,
+    HasTanh = 0,
+    HasErf = 0
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet1Xbf> : default_unpacket_traits {
+  typedef bfloat16 type;
+  typedef Packet1Xbf half;  // Half not yet implemented
+  typedef Packet1Xs integer_packet;
+  typedef numext::uint8_t mask_t;
+
+  enum {
+    size = rvv_packet_size_selector<bfloat16, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2Xbf> : default_unpacket_traits {
+  typedef bfloat16 type;
+  typedef Packet1Xbf half;
+  typedef Packet2Xs integer_packet;
+  typedef numext::uint8_t mask_t;
+
+  enum {
+    size = rvv_packet_size_selector<bfloat16, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true
+  };
+};
+
+/********************************* Packet1Xbf ************************************/
+
+EIGEN_STRONG_INLINE Packet2Xf Bf16ToF32(const Packet1Xbf& a) {
+  return __riscv_vfwcvtbf16_f_f_v_f32m2(a, unpacket_traits<Packet1Xbf>::size);  // widening convert, LMUL 1 -> 2
+}
+
+EIGEN_STRONG_INLINE Packet1Xbf F32ToBf16(const Packet2Xf& a) {
+  return __riscv_vfncvtbf16_f_f_w_bf16m1(a, unpacket_traits<Packet2Xf>::size);  // FP narrowing convert, LMUL 2 -> 1
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf ptrue<Packet1Xbf>(const Packet1Xbf& /*a*/) {
+  return __riscv_vreinterpret_bf16m1(
+      __riscv_vmv_v_x_u16m1(static_cast<numext::uint16_t>(0xffffu), unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pzero<Packet1Xbf>(const Packet1Xbf& /*a*/) {
+  return __riscv_vreinterpret_bf16m1(
+      __riscv_vmv_v_x_i16m1(numext::bit_cast<int16_t>(static_cast<__bf16>(0.0)), unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Absolute value by masking the 16-bit pattern with 0x7fff, i.e. clearing the top bit.
+EIGEN_STRONG_INLINE Packet1Xbf pabs(const Packet1Xbf& a) {
+  return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vand_vx_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a),
+                                                                   static_cast<numext::uint16_t>(0x7fffu),
+                                                                   unpacket_traits<Packet1Xs>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pset1<Packet1Xbf>(const bfloat16& from) {
+  return __riscv_vreinterpret_bf16m1(
+      __riscv_vmv_v_x_i16m1(numext::bit_cast<int16_t>(from), unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pset1frombits<Packet1Xbf>(numext::uint16_t from) {
+  return __riscv_vreinterpret_bf16m1(__riscv_vmv_v_x_u16m1(from, unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Builds the ramp in float via plset<Packet2Xf>, then narrows the result to bfloat16.
+EIGEN_STRONG_INLINE Packet1Xbf plset<Packet1Xbf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet2Xf>(static_cast<float>(a)));
+}
+
+template <>  // Loads a[0..3] once (vl = 4) and splats element k of that load across packet a<k>.
+EIGEN_STRONG_INLINE void pbroadcast4<Packet1Xbf>(const bfloat16* a, Packet1Xbf& a0, Packet1Xbf& a1, Packet1Xbf& a2,
+                                                 Packet1Xbf& a3) {
+  vint16m1_t aa = __riscv_vle16_v_i16m1(reinterpret_cast<const int16_t*>(a), 4);  // raw 16-bit patterns
+  a0 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 0, unpacket_traits<Packet1Xs>::size));
+  a1 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 1, unpacket_traits<Packet1Xs>::size));
+  a2 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 2, unpacket_traits<Packet1Xs>::size));
+  a3 = __riscv_vreinterpret_bf16m1(__riscv_vrgather_vx_i16m1(aa, 3, unpacket_traits<Packet1Xs>::size));
+}
+
+template <>  // Sum via one widening FMA: b is widened to float and a is multiplied by the scalar pattern 0x3f80.
+EIGEN_STRONG_INLINE Packet1Xbf padd<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  // b + (1 * a)
+  return F32ToBf16(__riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(b),
+                                                numext::bit_cast<__bf16>(static_cast<numext::int16_t>(0x3f80u)), a,
+                                                unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Difference via one widening FMA: a is widened to float and b is multiplied by the pattern 0xbf80.
+EIGEN_STRONG_INLINE Packet1Xbf psub<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  // a + (-1 * b)
+  return F32ToBf16(__riscv_vfwmaccbf16_vf_f32m2(Bf16ToF32(a),
+                                                numext::bit_cast<__bf16>(static_cast<numext::int16_t>(0xbf80u)), b,
+                                                unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pabsdiff(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return pabs<Packet1Xbf>(psub<Packet1Xbf>(a, b));
+}
+
+template <>  // Negation by XOR-ing the 16-bit pattern with 0x8000, i.e. flipping the top bit.
+EIGEN_STRONG_INLINE Packet1Xbf pnegate(const Packet1Xbf& a) {
+  return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vxor_vx_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a),
+                                                                   static_cast<numext::uint16_t>(0x8000u),
+                                                                   unpacket_traits<Packet1Xs>::size));
+}
+
+template <>  // Arithmetic shift right by 15 on the raw pattern: replicates the top bit across the whole lane.
+EIGEN_STRONG_INLINE Packet1Xbf psignbit(const Packet1Xbf& a) {
+  return __riscv_vreinterpret_v_i16m1_bf16m1(
+      __riscv_vsra_vx_i16m1(__riscv_vreinterpret_v_bf16m1_i16m1(a), 15, unpacket_traits<Packet1Xs>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pconj(const Packet1Xbf& a) {
+  return a;
+}
+
+template <>  // a * b as a widening FMA onto -0.0f, so a negative-zero product keeps its sign (+0 would erase it).
+EIGEN_STRONG_INLINE Packet1Xbf pmul<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  const Packet2Xf acc = __riscv_vfmv_v_f_f32m2(-0.0f, unpacket_traits<Packet2Xf>::size);  // -0 + x == x (nearest)
+  return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m2(acc, a, b, unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pdiv<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32ToBf16(pdiv<Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // Widening FMA that accumulates the a, b product onto c widened to float, then narrows to bfloat16.
+EIGEN_STRONG_INLINE Packet1Xbf pmadd(const Packet1Xbf& a, const Packet1Xbf& b, const Packet1Xbf& c) {
+  return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(c), a, b, unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Same widening FMA as pmadd, with the accumulator replaced by the negated c.
+EIGEN_STRONG_INLINE Packet1Xbf pmsub(const Packet1Xbf& a, const Packet1Xbf& b, const Packet1Xbf& c) {
+  return F32ToBf16(
+      __riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(pnegate<Packet1Xbf>(c)), a, b, unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Same widening FMA as pmadd, with the first factor replaced by the negated a.
+EIGEN_STRONG_INLINE Packet1Xbf pnmadd(const Packet1Xbf& a, const Packet1Xbf& b, const Packet1Xbf& c) {
+  return F32ToBf16(
+      __riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(c), pnegate<Packet1Xbf>(a), b, unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Computes the pmadd result (FMA onto widened c, narrowed to bfloat16) and negates it afterwards.
+EIGEN_STRONG_INLINE Packet1Xbf pnmsub(const Packet1Xbf& a, const Packet1Xbf& b, const Packet1Xbf& c) {
+  return pnegate<Packet1Xbf>(
+      F32ToBf16(__riscv_vfwmaccbf16_vv_f32m2(Bf16ToF32(c), a, b, unpacket_traits<Packet1Xbf>::size)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pmin<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32ToBf16(pmin<Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pmin<PropagateNaN, Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32ToBf16(pmin<PropagateNaN, Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pmin<PropagateNumbers, Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32ToBf16(pmin<PropagateNumbers, Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pmax<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32ToBf16(pmax<Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pmax<PropagateNaN, Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32ToBf16(pmax<PropagateNaN, Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pmax<PropagateNumbers, Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32ToBf16(pmax<PropagateNumbers, Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+// Narrows a float compare mask by keeping the low 16 bits of each lane; F32ToBf16 would turn all-ones into a NaN.
+EIGEN_STRONG_INLINE Packet1Xbf F32MaskToBf16Mask(const Packet2Xf& mask) {
+  return __riscv_vreinterpret_v_u16m1_bf16m1(
+      __riscv_vnsrl_wx_u16m1(__riscv_vreinterpret_v_f32m2_u32m2(mask), 0, unpacket_traits<Packet1Xbf>::size));
+}
+template <>  // Lane mask for a <= b: compared in float, mask narrowed bit-exactly.
+EIGEN_STRONG_INLINE Packet1Xbf pcmp_le<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32MaskToBf16Mask(pcmp_le<Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <>  // Lane mask for a < b: compared in float, mask narrowed bit-exactly.
+EIGEN_STRONG_INLINE Packet1Xbf pcmp_lt<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32MaskToBf16Mask(pcmp_lt<Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <>  // Lane mask for a == b: compared in float, mask narrowed bit-exactly.
+EIGEN_STRONG_INLINE Packet1Xbf pcmp_eq<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32MaskToBf16Mask(pcmp_eq<Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <>  // Lane mask from pcmp_lt_or_nan<Packet2Xf>: compared in float, mask narrowed bit-exactly.
+EIGEN_STRONG_INLINE Packet1Xbf pcmp_lt_or_nan<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet2Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+EIGEN_STRONG_INLINE Packet1Xbf pselect(const PacketMask16& mask, const Packet1Xbf& a, const Packet1Xbf& b) {
+  return __riscv_vreinterpret_v_i16m1_bf16m1(__riscv_vmerge_vvm_i16m1(__riscv_vreinterpret_v_bf16m1_i16m1(b),
+                                                                      __riscv_vreinterpret_v_bf16m1_i16m1(a), mask,
+                                                                      unpacket_traits<Packet1Xbf>::size));  // mask ? a : b
+}
+
+EIGEN_STRONG_INLINE Packet1Xbf pselect(const Packet1Xbf& mask, const Packet1Xbf& a, const Packet1Xbf& b) {
+  PacketMask16 mask2 =  // a lane counts as "set" when its 16-bit pattern, read as an integer, is non-zero
+      __riscv_vmsne_vx_i16m1_b16(__riscv_vreinterpret_v_bf16m1_i16m1(mask), 0, unpacket_traits<Packet1Xbf>::size);
+  return __riscv_vreinterpret_v_i16m1_bf16m1(__riscv_vmerge_vvm_i16m1(__riscv_vreinterpret_v_bf16m1_i16m1(b),
+                                                                      __riscv_vreinterpret_v_bf16m1_i16m1(a), mask2,
+                                                                      unpacket_traits<Packet1Xbf>::size));  // mask2 ? a : b
+}
+
+// Logical Operations are not supported for bfloat16, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pand<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vand_vv_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a),
+                                                                   __riscv_vreinterpret_v_bf16m1_u16m1(b),
+                                                                   unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf por<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vor_vv_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a),
+                                                                  __riscv_vreinterpret_v_bf16m1_u16m1(b),
+                                                                  unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xbf pxor<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vxor_vv_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(a),
+                                                                   __riscv_vreinterpret_v_bf16m1_u16m1(b),
+                                                                   unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Returns a & ~b, computed on the lanes reinterpreted as u16.
+EIGEN_STRONG_INLINE Packet1Xbf pandnot<Packet1Xbf>(const Packet1Xbf& a, const Packet1Xbf& b) {
+  return __riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vand_vv_u16m1(
+      __riscv_vreinterpret_v_bf16m1_u16m1(a),
+      __riscv_vnot_v_u16m1(__riscv_vreinterpret_v_bf16m1_u16m1(b), unpacket_traits<Packet1Xbf>::size),  // ~b
+      unpacket_traits<Packet1Xbf>::size));
+}
+
+template <>  // Loads unpacket_traits<Packet1Xbf>::size contiguous bf16 values starting at `from`.
+EIGEN_STRONG_INLINE Packet1Xbf pload<Packet1Xbf>(const bfloat16* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_bf16m1(reinterpret_cast<const __bf16*>(from),
+                                                         unpacket_traits<Packet1Xbf>::size);  // same vle16 call as ploadu
+}
+
+template <>  // Unaligned-tagged load of unpacket_traits<Packet1Xbf>::size contiguous bf16 values from `from`.
+EIGEN_STRONG_INLINE Packet1Xbf ploadu<Packet1Xbf>(const bfloat16* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_bf16m1(reinterpret_cast<const __bf16*>(from),
+                                                           unpacket_traits<Packet1Xbf>::size);  // same vle16 call as pload
+}
+
+template <>  // Builds a packet in which each loaded 16-bit value occupies two adjacent lanes.
+EIGEN_STRONG_INLINE Packet1Xbf ploaddup<Packet1Xbf>(const bfloat16* from) {
+  Packet1Xsu data = __riscv_vreinterpret_v_bf16m1_u16m1(pload<Packet1Xbf>(from));  // NOTE(review): reads a full packet although only the lower half survives the truncation below; confirm the wider read is acceptable
+  return __riscv_vreinterpret_v_i16m1_bf16m1(
+      __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vlmul_trunc_v_u32m2_u32m1(  // keep the low m1 part of the widened m2 value
+          __riscv_vwmaccu_vx_u32m2(__riscv_vwaddu_vv_u32m2(data, data, unpacket_traits<Packet1Xs>::size), 0xffffu, data,
+                                   unpacket_traits<Packet1Xs>::size)))));  // 2*x + 0xffff*x = 0x10001*x, i.e. x in both 16-bit halves of each 32-bit lane
+}
+
+template <>  // Builds a packet in which each loaded 16-bit value occupies four adjacent lanes.
+EIGEN_STRONG_INLINE Packet1Xbf ploadquad<Packet1Xbf>(const bfloat16* from) {
+  Packet1Xsu idx = __riscv_vsrl_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<Packet1Xbf>::size), 2,  // lane index >> 2: 0,0,0,0,1,1,1,1,...
+                                         unpacket_traits<Packet1Xbf>::size);
+  return __riscv_vreinterpret_v_i16m1_bf16m1(__riscv_vrgather_vv_i16m1(
+      pload<Packet1Xs>(reinterpret_cast<const short*>(from)), idx, unpacket_traits<Packet1Xbf>::size));  // NOTE(review): a full packet is loaded but only its first quarter is gathered; confirm the wider read is acceptable
+}
+
+template <>  // Stores every lane of `from` to contiguous memory at `to` (aligned-tagged).
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet1Xbf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_bf16m1(reinterpret_cast<__bf16*>(to), from,
+                                                   unpacket_traits<Packet1Xbf>::size);  // same vse16 call as pstoreu
+}
+
+template <>  // Stores every lane of `from` to contiguous memory at `to` (unaligned-tagged).
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet1Xbf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_bf16m1(reinterpret_cast<__bf16*>(to), from,
+                                                     unpacket_traits<Packet1Xbf>::size);  // same vse16 call as pstore
+}
+
+template <>  // Loads one packet from elements spaced `stride` bfloat16 values apart, starting at `from`.
+EIGEN_DEVICE_FUNC inline Packet1Xbf pgather<bfloat16, Packet1Xbf>(const bfloat16* from, Index stride) {
+  return __riscv_vlse16_v_bf16m1(reinterpret_cast<const __bf16*>(from), stride * sizeof(bfloat16),  // element stride scaled to bytes
+                                 unpacket_traits<Packet1Xbf>::size);
+}
+
+template <>  // Stores the lanes of `from` to elements spaced `stride` bfloat16 values apart, starting at `to`.
+EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet1Xbf>(bfloat16* to, const Packet1Xbf& from, Index stride) {
+  __riscv_vsse16(reinterpret_cast<__bf16*>(to), stride * sizeof(bfloat16), from, unpacket_traits<Packet1Xbf>::size);  // element stride scaled to bytes
+}
+
+template <>  // Returns lane 0 of `a`: read as a 16-bit integer, then bit-cast to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet1Xbf>(const Packet1Xbf& a) {
+  return numext::bit_cast<bfloat16>(__riscv_vmv_x_s_i16m1_i16(__riscv_vreinterpret_v_bf16m1_i16m1(a)));
+}
+
+template <>  // Square root evaluated in f32: widen with Bf16ToF32, apply psqrt<Packet2Xf>, narrow with F32ToBf16.
+EIGEN_STRONG_INLINE Packet1Xbf psqrt(const Packet1Xbf& a) {
+  return F32ToBf16(psqrt<Packet2Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Applies print<Packet2Xf> to the f32-widened lanes and narrows the result back to bf16.
+EIGEN_STRONG_INLINE Packet1Xbf print<Packet1Xbf>(const Packet1Xbf& a) {
+  return F32ToBf16(print<Packet2Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Applies pfloor<Packet2Xf> to the f32-widened lanes and narrows the result back to bf16.
+EIGEN_STRONG_INLINE Packet1Xbf pfloor<Packet1Xbf>(const Packet1Xbf& a) {
+  return F32ToBf16(pfloor<Packet2Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Delegates to preverse<Packet1Xs> on the lanes reinterpreted as i16; no value conversion happens.
+EIGEN_STRONG_INLINE Packet1Xbf preverse(const Packet1Xbf& a) {
+  return __riscv_vreinterpret_v_i16m1_bf16m1(preverse<Packet1Xs>(__riscv_vreinterpret_v_bf16m1_i16m1(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux<Packet2Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux<Packet1Xbf>(const Packet1Xbf& a) {
+  return static_cast<bfloat16>(predux<Packet2Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux_mul<Packet2Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet1Xbf>(const Packet1Xbf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet2Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux_min<Packet2Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet1Xbf>(const Packet1Xbf& a) {
+  return static_cast<bfloat16>(predux_min<Packet2Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux_max<Packet2Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet1Xbf>(const Packet1Xbf& a) {
+  return static_cast<bfloat16>(predux_max<Packet2Xf>(Bf16ToF32(a)));
+}
+
+template <int N>  // Rearranges the N packets of `kernel` in place by interleaving them in a stack buffer and reloading contiguous runs.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xbf, N>& kernel) {
+  bfloat16 buffer[unpacket_traits<Packet1Xbf>::size * N];  // scratch large enough for all N packets
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // strided store: lane j of packet i lands at buffer[j * N + i]
+    __riscv_vsse16(reinterpret_cast<__bf16*>(&buffer[i]), N * sizeof(bfloat16), kernel.packet[i],
+                   unpacket_traits<Packet1Xbf>::size);
+  }
+
+  for (i = 0; i < N; i++) {  // packet i is reloaded from the contiguous run starting at buffer[i * size]
+    kernel.packet[i] = __riscv_vle16_v_bf16m1(reinterpret_cast<__bf16*>(&buffer[i * unpacket_traits<Packet1Xbf>::size]),
+                                              unpacket_traits<Packet1Xbf>::size);
+  }
+}
+
+/********************************* Packet2Xbf ************************************/
+
+EIGEN_STRONG_INLINE Packet4Xf Bf16ToF32(const Packet2Xbf& a) {  // widening conversion of every bf16 lane to f32 (bf16m2 -> f32m4)
+  return __riscv_vfwcvtbf16_f_f_v_f32m4(a, unpacket_traits<Packet2Xbf>::size);
+}
+
+EIGEN_STRONG_INLINE Packet2Xbf F32ToBf16(const Packet4Xf& a) {  // narrowing conversion of every f32 lane to bf16 (f32m4 -> bf16m2)
+  return __riscv_vfncvtbf16_f_f_w_bf16m2(a, unpacket_traits<Packet4Xf>::size);  // lane count is taken from the f32 source packet
+}
+
+template <>  // Returns a packet with every bit set (each 16-bit lane is 0xffff); the argument is unused.
+EIGEN_STRONG_INLINE Packet2Xbf ptrue<Packet2Xbf>(const Packet2Xbf& /*a*/) {
+  return __riscv_vreinterpret_bf16m2(
+      __riscv_vmv_v_x_u16m2(static_cast<numext::uint16_t>(0xffffu), unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Returns a packet whose lanes all hold the bit pattern of bf16 0.0; the argument is unused.
+EIGEN_STRONG_INLINE Packet2Xbf pzero<Packet2Xbf>(const Packet2Xbf& /*a*/) {
+  return __riscv_vreinterpret_bf16m2(
+      __riscv_vmv_v_x_i16m2(numext::bit_cast<int16_t>(static_cast<__bf16>(0.0)), unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Absolute value obtained by clearing bit 15 (the sign bit) of every lane.
+EIGEN_STRONG_INLINE Packet2Xbf pabs(const Packet2Xbf& a) {
+  return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vand_vx_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a),
+                                                                   static_cast<numext::uint16_t>(0x7fffu),  // keeps bits 0..14
+                                                                   unpacket_traits<Packet2Xs>::size));
+}
+
+template <>  // Broadcasts `from` to every lane by splatting its 16-bit pattern.
+EIGEN_STRONG_INLINE Packet2Xbf pset1<Packet2Xbf>(const bfloat16& from) {
+  return __riscv_vreinterpret_bf16m2(
+      __riscv_vmv_v_x_i16m2(numext::bit_cast<int16_t>(from), unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Broadcasts the raw 16-bit pattern `from` to every lane, reinterpreted as bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pset1frombits<Packet2Xbf>(numext::uint16_t from) {
+  return __riscv_vreinterpret_bf16m2(__riscv_vmv_v_x_u16m2(from, unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Delegates to plset<Packet4Xf> on `a` converted to float, then narrows the result to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf plset<Packet2Xbf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet4Xf>(static_cast<float>(a)));
+}
+
+template <>  // Reads a[0..3] and fills a0..a3 with a packet-wide copy of a[0], a[1], a[2] and a[3] respectively.
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2Xbf>(const bfloat16* a, Packet2Xbf& a0, Packet2Xbf& a1, Packet2Xbf& a2,
+                                                 Packet2Xbf& a3) {
+  vint16m2_t aa = __riscv_vle16_v_i16m2(reinterpret_cast<const int16_t*>(a), 4);  // only 4 elements are loaded (vl = 4)
+  a0 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 0, unpacket_traits<Packet2Xs>::size));  // every lane takes element 0
+  a1 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 1, unpacket_traits<Packet2Xs>::size));
+  a2 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 2, unpacket_traits<Packet2Xs>::size));
+  a3 = __riscv_vreinterpret_bf16m2(__riscv_vrgather_vx_i16m2(aa, 3, unpacket_traits<Packet2Xs>::size));
+}
+
+template <>  // Lane-wise a + b, accumulated in f32 by a widening multiply-accumulate and narrowed back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf padd<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  // b + (1 * a)
+  return F32ToBf16(__riscv_vfwmaccbf16_vf_f32m4(Bf16ToF32(b),
+                                                numext::bit_cast<__bf16>(static_cast<numext::int16_t>(0x3f80u)), a,  // 0x3f80 is the bf16 encoding of 1.0
+                                                unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Lane-wise a - b, accumulated in f32 by a widening multiply-accumulate and narrowed back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf psub<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  // a + (-1 * b)
+  return F32ToBf16(__riscv_vfwmaccbf16_vf_f32m4(Bf16ToF32(a),
+                                                numext::bit_cast<__bf16>(static_cast<numext::int16_t>(0xbf80u)), b,  // 0xbf80 is the bf16 encoding of -1.0
+                                                unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Lane-wise |a - b|, composed from psub and pabs.
+EIGEN_STRONG_INLINE Packet2Xbf pabsdiff(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return pabs<Packet2Xbf>(psub<Packet2Xbf>(a, b));
+}
+
+template <>  // Negation obtained by flipping bit 15 (the sign bit) of every lane.
+EIGEN_STRONG_INLINE Packet2Xbf pnegate(const Packet2Xbf& a) {
+  return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vxor_vx_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a),
+                                                                   static_cast<numext::uint16_t>(0x8000u),  // sign-bit mask
+                                                                   unpacket_traits<Packet2Xs>::size));
+}
+
+template <>  // Per lane: all bits set when the sign bit of `a` is set, all bits clear otherwise.
+EIGEN_STRONG_INLINE Packet2Xbf psignbit(const Packet2Xbf& a) {
+  return __riscv_vreinterpret_v_i16m2_bf16m2(
+      __riscv_vsra_vx_i16m2(__riscv_vreinterpret_v_bf16m2_i16m2(a), 15, unpacket_traits<Packet2Xs>::size));  // arithmetic shift by 15 replicates bit 15 across the lane
+}
+
+template <>  // Identity: the input packet is returned unchanged.
+EIGEN_STRONG_INLINE Packet2Xbf pconj(const Packet2Xbf& a) {
+  return a;
+}
+
+template <>  // Lane-wise a * b: widening multiply-accumulate onto a zero f32 accumulator, narrowed back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pmul<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  Packet4Xf c;  // never initialized; only handed to pzero<Packet4Xf> -- NOTE(review): presumably ignored there like the pzero overloads in this file, confirm
+  return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m4(pzero<Packet4Xf>(c), a, b, unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Lane-wise a / b evaluated with pdiv<Packet4Xf> on the f32-widened operands, narrowed back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pdiv<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pdiv<Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // a * b + c: `c` is widened to f32 and used as the accumulator of a widening bf16 multiply-accumulate.
+EIGEN_STRONG_INLINE Packet2Xbf pmadd(const Packet2Xbf& a, const Packet2Xbf& b, const Packet2Xbf& c) {
+  return F32ToBf16(__riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(c), a, b, unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // a * b - c: same multiply-accumulate as pmadd, with the accumulator seeded from -c.
+EIGEN_STRONG_INLINE Packet2Xbf pmsub(const Packet2Xbf& a, const Packet2Xbf& b, const Packet2Xbf& c) {
+  return F32ToBf16(
+      __riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(pnegate<Packet2Xbf>(c)), a, b, unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // c - a * b: same multiply-accumulate as pmadd, with `a` negated before the multiply.
+EIGEN_STRONG_INLINE Packet2Xbf pnmadd(const Packet2Xbf& a, const Packet2Xbf& b, const Packet2Xbf& c) {
+  return F32ToBf16(
+      __riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(c), pnegate<Packet2Xbf>(a), b, unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // -(a * b + c): computes c + a * b as in pmadd, narrows to bf16, then negates the narrowed result.
+EIGEN_STRONG_INLINE Packet2Xbf pnmsub(const Packet2Xbf& a, const Packet2Xbf& b, const Packet2Xbf& c) {
+  return pnegate<Packet2Xbf>(
+      F32ToBf16(__riscv_vfwmaccbf16_vv_f32m4(Bf16ToF32(c), a, b, unpacket_traits<Packet2Xbf>::size)));
+}
+
+template <>  // Lane-wise minimum evaluated with pmin<Packet4Xf> on the f32-widened operands, narrowed back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pmin<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pmin<Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // PropagateNaN minimum: forwards to pmin<PropagateNaN, Packet4Xf> on the f32-widened operands.
+EIGEN_STRONG_INLINE Packet2Xbf pmin<PropagateNaN, Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pmin<PropagateNaN, Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // PropagateNumbers minimum: forwards to pmin<PropagateNumbers, Packet4Xf> on the f32-widened operands.
+EIGEN_STRONG_INLINE Packet2Xbf pmin<PropagateNumbers, Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pmin<PropagateNumbers, Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // Lane-wise maximum evaluated with pmax<Packet4Xf> on the f32-widened operands, narrowed back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pmax<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pmax<Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // PropagateNaN maximum: forwards to pmax<PropagateNaN, Packet4Xf> on the f32-widened operands.
+EIGEN_STRONG_INLINE Packet2Xbf pmax<PropagateNaN, Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pmax<PropagateNaN, Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // PropagateNumbers maximum: forwards to pmax<PropagateNumbers, Packet4Xf> on the f32-widened operands.
+EIGEN_STRONG_INLINE Packet2Xbf pmax<PropagateNumbers, Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pmax<PropagateNumbers, Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>  // Lane-wise a <= b, evaluated with pcmp_le<Packet4Xf> on the f32-widened operands and narrowed to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pcmp_le<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pcmp_le<Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));  // NOTE(review): the f32 compare presumably returns an all-ones/zero bit mask, while F32ToBf16 is a numeric conversion; confirm true lanes still come out as 0xffff (the ptrue<Packet2Xbf> pattern)
+}
+
+template <>  // Lane-wise a < b, evaluated with pcmp_lt<Packet4Xf> on the f32-widened operands and narrowed to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pcmp_lt<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pcmp_lt<Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));  // NOTE(review): the f32 compare presumably returns an all-ones/zero bit mask, while F32ToBf16 is a numeric conversion; confirm true lanes still come out as 0xffff (the ptrue<Packet2Xbf> pattern)
+}
+
+template <>  // Lane-wise a == b, evaluated with pcmp_eq<Packet4Xf> on the f32-widened operands and narrowed to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pcmp_eq<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pcmp_eq<Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));  // NOTE(review): the f32 compare presumably returns an all-ones/zero bit mask, while F32ToBf16 is a numeric conversion; confirm true lanes still come out as 0xffff (the ptrue<Packet2Xbf> pattern)
+}
+
+template <>  // Forwards to pcmp_lt_or_nan<Packet4Xf> on the f32-widened operands and narrows the result to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pcmp_lt_or_nan<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return F32ToBf16(pcmp_lt_or_nan<Packet4Xf>(Bf16ToF32(a), Bf16ToF32(b)));  // NOTE(review): the f32 compare presumably returns an all-ones/zero bit mask, while F32ToBf16 is a numeric conversion; confirm true lanes still come out as 0xffff (the ptrue<Packet2Xbf> pattern)
+}
+
+EIGEN_STRONG_INLINE Packet2Xbf pselect(const PacketMask8& mask, const Packet2Xbf& a, const Packet2Xbf& b) {  // per lane: `a` where `mask` is set, `b` elsewhere
+  return __riscv_vreinterpret_v_i16m2_bf16m2(__riscv_vmerge_vvm_i16m2(__riscv_vreinterpret_v_bf16m2_i16m2(b),
+                                                                      __riscv_vreinterpret_v_bf16m2_i16m2(a), mask,
+                                                                      unpacket_traits<Packet2Xbf>::size));  // the merge runs on the i16 view of the lanes
+}
+
+EIGEN_STRONG_INLINE Packet2Xbf pselect(const Packet2Xbf& mask, const Packet2Xbf& a, const Packet2Xbf& b) {  // per lane: `a` where the bits of `mask` are non-zero, `b` elsewhere
+  PacketMask8 mask2 =
+      __riscv_vmsne_vx_i16m2_b8(__riscv_vreinterpret_v_bf16m2_i16m2(mask), 0, unpacket_traits<Packet2Xbf>::size);  // integer compare of the raw lane bits against 0
+  return __riscv_vreinterpret_v_i16m2_bf16m2(__riscv_vmerge_vvm_i16m2(__riscv_vreinterpret_v_bf16m2_i16m2(b),
+                                                                      __riscv_vreinterpret_v_bf16m2_i16m2(a), mask2,
+                                                                      unpacket_traits<Packet2Xbf>::size));
+}
+
+// Logical Operations are not supported for bfloat16, so reinterpret casts
+template <>  // Bitwise AND of two bf16 packets, done on the lanes reinterpreted as u16.
+EIGEN_STRONG_INLINE Packet2Xbf pand<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a),
+                                                                   __riscv_vreinterpret_v_bf16m2_u16m2(b),
+                                                                   unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Bitwise OR of two bf16 packets, done on the lanes reinterpreted as u16.
+EIGEN_STRONG_INLINE Packet2Xbf por<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a),
+                                                                  __riscv_vreinterpret_v_bf16m2_u16m2(b),
+                                                                  unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Bitwise XOR of two bf16 packets, done on the lanes reinterpreted as u16.
+EIGEN_STRONG_INLINE Packet2Xbf pxor<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(a),
+                                                                   __riscv_vreinterpret_v_bf16m2_u16m2(b),
+                                                                   unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Returns a & ~b, computed on the lanes reinterpreted as u16.
+EIGEN_STRONG_INLINE Packet2Xbf pandnot<Packet2Xbf>(const Packet2Xbf& a, const Packet2Xbf& b) {
+  return __riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vand_vv_u16m2(
+      __riscv_vreinterpret_v_bf16m2_u16m2(a),
+      __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_bf16m2_u16m2(b), unpacket_traits<Packet2Xbf>::size),  // ~b
+      unpacket_traits<Packet2Xbf>::size));
+}
+
+template <>  // Loads unpacket_traits<Packet2Xbf>::size contiguous bf16 values starting at `from`.
+EIGEN_STRONG_INLINE Packet2Xbf pload<Packet2Xbf>(const bfloat16* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_bf16m2(reinterpret_cast<const __bf16*>(from),
+                                                         unpacket_traits<Packet2Xbf>::size);  // same vle16 call as ploadu
+}
+
+template <>  // Unaligned-tagged load of unpacket_traits<Packet2Xbf>::size contiguous bf16 values from `from`.
+EIGEN_STRONG_INLINE Packet2Xbf ploadu<Packet2Xbf>(const bfloat16* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_bf16m2(reinterpret_cast<const __bf16*>(from),
+                                                           unpacket_traits<Packet2Xbf>::size);  // same vle16 call as pload
+}
+
+template <>  // Builds a packet in which each loaded 16-bit value occupies two adjacent lanes.
+EIGEN_STRONG_INLINE Packet2Xbf ploaddup<Packet2Xbf>(const bfloat16* from) {
+  Packet2Xsu data = __riscv_vreinterpret_v_bf16m2_u16m2(pload<Packet2Xbf>(from));  // NOTE(review): reads a full packet although only the lower half survives the truncation below; confirm the wider read is acceptable
+  return __riscv_vreinterpret_v_i16m2_bf16m2(
+      __riscv_vreinterpret_v_i32m2_i16m2(__riscv_vreinterpret_v_u32m2_i32m2(__riscv_vlmul_trunc_v_u32m4_u32m2(  // keep the low m2 part of the widened m4 value
+          __riscv_vwmaccu_vx_u32m4(__riscv_vwaddu_vv_u32m4(data, data, unpacket_traits<Packet2Xs>::size), 0xffffu, data,
+                                   unpacket_traits<Packet2Xs>::size)))));  // 2*x + 0xffff*x = 0x10001*x, i.e. x in both 16-bit halves of each 32-bit lane
+}
+
+template <>  // Builds a packet in which each loaded 16-bit value occupies four adjacent lanes.
+EIGEN_STRONG_INLINE Packet2Xbf ploadquad<Packet2Xbf>(const bfloat16* from) {
+  Packet2Xsu idx = __riscv_vsrl_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<Packet2Xbf>::size), 2,  // lane index >> 2: 0,0,0,0,1,1,1,1,...
+                                         unpacket_traits<Packet2Xbf>::size);
+  return __riscv_vreinterpret_v_i16m2_bf16m2(__riscv_vrgather_vv_i16m2(
+      pload<Packet2Xs>(reinterpret_cast<const short*>(from)), idx, unpacket_traits<Packet2Xbf>::size));  // NOTE(review): a full packet is loaded but only its first quarter is gathered; confirm the wider read is acceptable
+}
+
+template <>  // Stores every lane of `from` to contiguous memory at `to` (aligned-tagged).
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet2Xbf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_bf16m2(reinterpret_cast<__bf16*>(to), from,
+                                                   unpacket_traits<Packet2Xbf>::size);  // same vse16 call as pstoreu
+}
+
+template <>  // Stores every lane of `from` to contiguous memory at `to` (unaligned-tagged).
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet2Xbf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_bf16m2(reinterpret_cast<__bf16*>(to), from,
+                                                     unpacket_traits<Packet2Xbf>::size);  // same vse16 call as pstore
+}
+
+template <>  // Loads one packet from elements spaced `stride` bfloat16 values apart, starting at `from`.
+EIGEN_DEVICE_FUNC inline Packet2Xbf pgather<bfloat16, Packet2Xbf>(const bfloat16* from, Index stride) {
+  return __riscv_vlse16_v_bf16m2(reinterpret_cast<const __bf16*>(from), stride * sizeof(bfloat16),  // element stride scaled to bytes
+                                 unpacket_traits<Packet2Xbf>::size);
+}
+
+template <>  // Stores the lanes of `from` to elements spaced `stride` bfloat16 values apart, starting at `to`.
+EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet2Xbf>(bfloat16* to, const Packet2Xbf& from, Index stride) {
+  __riscv_vsse16(reinterpret_cast<__bf16*>(to), stride * sizeof(bfloat16), from, unpacket_traits<Packet2Xbf>::size);  // element stride scaled to bytes
+}
+
+template <>  // Returns lane 0 of `a`: read as a 16-bit integer, then bit-cast to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet2Xbf>(const Packet2Xbf& a) {
+  return numext::bit_cast<bfloat16>(__riscv_vmv_x_s_i16m2_i16(__riscv_vreinterpret_v_bf16m2_i16m2(a)));
+}
+
+template <>  // Square root evaluated in f32: widen with Bf16ToF32, apply psqrt<Packet4Xf>, narrow with F32ToBf16.
+EIGEN_STRONG_INLINE Packet2Xbf psqrt(const Packet2Xbf& a) {
+  return F32ToBf16(psqrt<Packet4Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Applies print<Packet4Xf> to the f32-widened lanes and narrows the result back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf print<Packet2Xbf>(const Packet2Xbf& a) {
+  return F32ToBf16(print<Packet4Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Applies pfloor<Packet4Xf> to the f32-widened lanes and narrows the result back to bf16.
+EIGEN_STRONG_INLINE Packet2Xbf pfloor<Packet2Xbf>(const Packet2Xbf& a) {
+  return F32ToBf16(pfloor<Packet4Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Delegates to preverse<Packet2Xs> on the lanes reinterpreted as i16; no value conversion happens.
+EIGEN_STRONG_INLINE Packet2Xbf preverse(const Packet2Xbf& a) {
+  return __riscv_vreinterpret_v_i16m2_bf16m2(preverse<Packet2Xs>(__riscv_vreinterpret_v_bf16m2_i16m2(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux<Packet4Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux<Packet2Xbf>(const Packet2Xbf& a) {
+  return static_cast<bfloat16>(predux<Packet4Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux_mul<Packet4Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet2Xbf>(const Packet2Xbf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet4Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux_min<Packet4Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet2Xbf>(const Packet2Xbf& a) {
+  return static_cast<bfloat16>(predux_min<Packet4Xf>(Bf16ToF32(a)));
+}
+
+template <>  // Reduces the f32-widened lanes with predux_max<Packet4Xf> and converts the scalar result to bfloat16.
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet2Xbf>(const Packet2Xbf& a) {
+  return static_cast<bfloat16>(predux_max<Packet4Xf>(Bf16ToF32(a)));
+}
+
+template <int N>  // Rearranges the N packets of `kernel` in place by interleaving them in a stack buffer and reloading contiguous runs.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xbf, N>& kernel) {
+  bfloat16 buffer[unpacket_traits<Packet2Xbf>::size * N];  // scratch large enough for all N packets
+  int i = 0;
+
+  for (i = 0; i < N; i++) {  // strided store: lane j of packet i lands at buffer[j * N + i]
+    __riscv_vsse16(reinterpret_cast<__bf16*>(&buffer[i]), N * sizeof(bfloat16), kernel.packet[i],
+                   unpacket_traits<Packet2Xbf>::size);
+  }
+
+  for (i = 0; i < N; i++) {  // packet i is reloaded from the contiguous run starting at buffer[i * size]
+    kernel.packet[i] = __riscv_vle16_v_bf16m2(reinterpret_cast<__bf16*>(&buffer[i * unpacket_traits<Packet2Xbf>::size]),
+                                              unpacket_traits<Packet2Xbf>::size);
+  }
+}
+
+template <typename Packet = Packet2Xbf>  // Folds a Packet2Xbf into a Packet1Xbf by adding its two m1 halves lane-wise.
+EIGEN_STRONG_INLINE std::enable_if_t<  // overload exists only for Packet2Xbf with a lane count that is a multiple of 8
+    std::is_same<Packet, Packet2Xbf>::value && (unpacket_traits<Packet2Xbf>::size % 8) == 0, Packet1Xbf>
+predux_half(const Packet2Xbf& a) {
+  return padd<Packet1Xbf>(__riscv_vget_v_bf16m2_bf16m1(a, 0), __riscv_vget_v_bf16m2_bf16m1(a, 1));  // part 0 + part 1 of the m2 group
+}
+
+template <>  // Reinterprets the 16-bit integer lanes as bf16; the bits are not converted numerically.
+EIGEN_STRONG_INLINE Packet1Xbf pcast<Packet1Xs, Packet1Xbf>(const Packet1Xs& a) {
+  return __riscv_vreinterpret_v_i16m1_bf16m1(a);
+}
+
+template <>  // Reinterprets the 16-bit integer lanes as bf16; the bits are not converted numerically.
+EIGEN_STRONG_INLINE Packet2Xbf pcast<Packet2Xs, Packet2Xbf>(const Packet2Xs& a) {
+  return __riscv_vreinterpret_v_i16m2_bf16m2(a);
+}
+
+template <>  // Reinterprets the bf16 lanes as 16-bit integers; the bits are not converted numerically.
+EIGEN_STRONG_INLINE Packet1Xs pcast<Packet1Xbf, Packet1Xs>(const Packet1Xbf& a) {
+  return __riscv_vreinterpret_v_bf16m1_i16m1(a);
+}
+
+template <>  // Reinterprets the bf16 lanes as 16-bit integers; the bits are not converted numerically.
+EIGEN_STRONG_INLINE Packet2Xs pcast<Packet2Xbf, Packet2Xs>(const Packet2Xbf& a) {
+  return __riscv_vreinterpret_v_bf16m2_i16m2(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_BF16_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h
new file mode 100644
index 00000000000..b1113c9d12b
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h
@@ -0,0 +1,998 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Kseniya Zaytseva <kseniya.zaytseva@syntacore.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_RVV10_H
+#define EIGEN_PACKET_MATH_FP16_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+typedef eigen_packet_wrapper<vfloat16m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 24> Packet1Xh;  // fixed-length f16 packet over one vector register (m1); 24 is presumably a unique wrapper id -- confirm
+typedef eigen_packet_wrapper<vfloat16m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 25>
+    Packet2Xh;  // fixed-length f16 packet over an m2 register group (twice the vector bits)
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1  // default half-precision packet is the m1 one
+typedef Packet1Xh PacketXh;
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {  // traits of Eigen::half when Packet1Xh is the default packet
+  typedef Packet1Xh type;
+  typedef Packet1Xh half;  // no smaller packet in this configuration: half is the same type
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<Eigen::half, EIGEN_RISCV64_RVV_VL, 1>::size,  // lane count for LMUL = 1
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,  // follows the EIGEN_FAST_MATH setting
+    HasCos = EIGEN_FAST_MATH,  // follows the EIGEN_FAST_MATH setting
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,  // follows the EIGEN_FAST_MATH setting
+    HasErf = 0
+  };
+};
+
+#else  // any other default LMUL selects the m2 packet
+typedef Packet2Xh PacketXh;
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {  // traits of Eigen::half when Packet2Xh is the default packet
+  typedef Packet2Xh type;
+  typedef Packet1Xh half;  // the m1 packet serves as the half-size packet
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<Eigen::half, EIGEN_RISCV64_RVV_VL, 2>::size,  // lane count for LMUL = 2
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,  // follows the EIGEN_FAST_MATH setting
+    HasCos = EIGEN_FAST_MATH,  // follows the EIGEN_FAST_MATH setting
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,  // follows the EIGEN_FAST_MATH setting
+    HasErf = 0
+  };
+};
+#endif
+
+template <>  // Reverse traits for the m1 half-precision packet.
+struct unpacket_traits<Packet1Xh> {
+  typedef Eigen::half type;  // scalar type held in each lane
+  typedef Packet1Xh half;  // Half not yet implemented
+  typedef Packet1Xs integer_packet;  // integer packet with the same 16-bit lane layout
+  typedef numext::uint8_t mask_t;
+
+  enum {
+    size = rvv_packet_size_selector<Eigen::half, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>  // Reverse traits for the m2 half-precision packet.
+struct unpacket_traits<Packet2Xh> {
+  typedef Eigen::half type;  // scalar type held in each lane
+  typedef Packet1Xh half;  // the m1 packet is the half-size packet
+  typedef Packet2Xs integer_packet;  // integer packet with the same 16-bit lane layout
+  typedef numext::uint8_t mask_t;
+
+  enum {
+    size = rvv_packet_size_selector<Eigen::half, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+/********************************* Packet1Xh ************************************/
+
+template <>  // Returns a packet with every bit set (each 16-bit lane is 0xffff); the argument is unused.
+EIGEN_STRONG_INLINE Packet1Xh ptrue<Packet1Xh>(const Packet1Xh& /*a*/) {
+  return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(0xffffu, unpacket_traits<Packet1Xh>::size));
+}
+
+template <>  // Returns a packet whose lanes all hold 0.0; the argument is unused.
+EIGEN_STRONG_INLINE Packet1Xh pzero<Packet1Xh>(const Packet1Xh& /*a*/) {
+  return __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Lane-wise absolute value via the native f16 vfabs intrinsic.
+EIGEN_STRONG_INLINE Packet1Xh pabs(const Packet1Xh& a) {
+  return __riscv_vfabs_v_f16m1(a, unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Lane-wise |a - b|: f16 subtract followed by f16 absolute value.
+EIGEN_STRONG_INLINE Packet1Xh pabsdiff(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vfabs_v_f16m1(__riscv_vfsub_vv_f16m1(a, b, unpacket_traits<Packet1Xh>::size),
+                               unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Broadcasts `from` to every lane; the Eigen::half is bit-cast to _Float16 for the intrinsic.
+EIGEN_STRONG_INLINE Packet1Xh pset1<Packet1Xh>(const Eigen::half& from) {
+  return __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(from), unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Broadcasts the raw 16-bit pattern `from` to every lane, reinterpreted as f16.
+EIGEN_STRONG_INLINE Packet1Xh pset1frombits<Packet1Xh>(numext::uint16_t from) {
+  return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(from, unpacket_traits<Packet1Xh>::size));
+}
+
+template <>  // Returns a packet whose lane i holds a + i.
+EIGEN_STRONG_INLINE Packet1Xh plset<Packet1Xh>(const Eigen::half& a) {
+  Packet1Xh idx = __riscv_vfcvt_f_x_v_f16m1(  // lane indices 0, 1, 2, ... converted from integer to f16
+      __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size)),
+      unpacket_traits<Packet1Xh>::size);
+  return __riscv_vfadd_vf_f16m1(idx, numext::bit_cast<_Float16>(a), unpacket_traits<Packet1Xh>::size);  // add the scalar offset to every index
+}
+
+template <>  // Reads a[0..3] and fills a0..a3 with a packet-wide copy of a[0], a[1], a[2] and a[3] respectively.
+EIGEN_STRONG_INLINE void pbroadcast4<Packet1Xh>(const Eigen::half* a, Packet1Xh& a0, Packet1Xh& a1, Packet1Xh& a2,
+                                                Packet1Xh& a3) {
+  vfloat16m1_t aa = __riscv_vle16_v_f16m1(reinterpret_cast<const _Float16*>(a), 4);  // only 4 elements are loaded (vl = 4)
+  a0 = __riscv_vrgather_vx_f16m1(aa, 0, unpacket_traits<Packet1Xh>::size);  // every lane takes element 0
+  a1 = __riscv_vrgather_vx_f16m1(aa, 1, unpacket_traits<Packet1Xh>::size);
+  a2 = __riscv_vrgather_vx_f16m1(aa, 2, unpacket_traits<Packet1Xh>::size);
+  a3 = __riscv_vrgather_vx_f16m1(aa, 3, unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Lane-wise a + b using the native f16 add.
+EIGEN_STRONG_INLINE Packet1Xh padd<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vfadd_vv_f16m1(a, b, unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Lane-wise a - b using the native f16 subtract.
+EIGEN_STRONG_INLINE Packet1Xh psub<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vfsub_vv_f16m1(a, b, unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Lane-wise negation using the native f16 vfneg intrinsic.
+EIGEN_STRONG_INLINE Packet1Xh pnegate(const Packet1Xh& a) {
+  return __riscv_vfneg_v_f16m1(a, unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Per lane: all bits set when the sign bit of `a` is set, all bits clear otherwise.
+EIGEN_STRONG_INLINE Packet1Xh psignbit(const Packet1Xh& a) {
+  return __riscv_vreinterpret_v_i16m1_f16m1(
+      __riscv_vsra_vx_i16m1(__riscv_vreinterpret_v_f16m1_i16m1(a), 15, unpacket_traits<Packet1Xs>::size));  // arithmetic shift by 15 replicates bit 15 across the lane
+}
+
+template <>  // Identity: the input packet is returned unchanged.
+EIGEN_STRONG_INLINE Packet1Xh pconj(const Packet1Xh& a) {
+  return a;
+}
+
+template <>  // Lane-wise a * b using the native f16 multiply.
+EIGEN_STRONG_INLINE Packet1Xh pmul<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vfmul_vv_f16m1(a, b, unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Lane-wise a / b using the native f16 divide.
+EIGEN_STRONG_INLINE Packet1Xh pdiv<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vfdiv_vv_f16m1(a, b, unpacket_traits<Packet1Xh>::size);
+}
+
+template <>  // Fused multiply-add in f16.
+EIGEN_STRONG_INLINE Packet1Xh pmadd(const Packet1Xh& a, const Packet1Xh& b, const Packet1Xh& c) {
+  return __riscv_vfmadd_vv_f16m1(a, b, c, unpacket_traits<Packet1Xh>::size);  // a * b + c
+}
+
+template <>  // Fused multiply-subtract in f16.
+EIGEN_STRONG_INLINE Packet1Xh pmsub(const Packet1Xh& a, const Packet1Xh& b, const Packet1Xh& c) {
+  return __riscv_vfmsub_vv_f16m1(a, b, c, unpacket_traits<Packet1Xh>::size);  // a * b - c
+}
+
+template <>  // Negated-product fused multiply-add in f16.
+EIGEN_STRONG_INLINE Packet1Xh pnmadd(const Packet1Xh& a, const Packet1Xh& b, const Packet1Xh& c) {
+  return __riscv_vfnmsub_vv_f16m1(a, b, c, unpacket_traits<Packet1Xh>::size);  // -(a * b) + c; note the vfnmsub mnemonic
+}
+
+template <>  // Negated-product fused multiply-subtract in f16.
+EIGEN_STRONG_INLINE Packet1Xh pnmsub(const Packet1Xh& a, const Packet1Xh& b, const Packet1Xh& c) {
+  return __riscv_vfnmadd_vv_f16m1(a, b, c, unpacket_traits<Packet1Xh>::size);  // -(a * b) - c; note the vfnmadd mnemonic
+}
+
+template <>  // Lane-wise minimum that yields NaN in every lane where either input is NaN.
+EIGEN_STRONG_INLINE Packet1Xh pmin<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  Packet1Xh nans = __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits<Packet1Xh>::size);  // all-NaN packet used for masked-off lanes
+  PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits<Packet1Xh>::size);  // x == x fails only for NaN lanes
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits<Packet1Xh>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<Packet1Xh>::size);  // set where neither a nor b is NaN
+
+  return __riscv_vfmin_vv_f16m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xh>::size);  // masked-off lanes keep the value from `nans`
+}
+
+template <>  // PropagateNaN minimum: identical to the default pmin<Packet1Xh>, which already returns NaN for NaN inputs.
+EIGEN_STRONG_INLINE Packet1Xh pmin<PropagateNaN, Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return pmin<Packet1Xh>(a, b);
+}
+
+template <>  // PropagateNumbers minimum: plain f16 vfmin with no extra NaN masking.
+EIGEN_STRONG_INLINE Packet1Xh pmin<PropagateNumbers, Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vfmin_vv_f16m1(a, b, unpacket_traits<Packet1Xh>::size);  // relies on the instruction's own NaN handling
+}
+
+template <>  // Lane-wise maximum that yields NaN in every lane where either input is NaN.
+EIGEN_STRONG_INLINE Packet1Xh pmax<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  Packet1Xh nans = __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits<Packet1Xh>::size);  // all-NaN packet used for masked-off lanes
+  PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits<Packet1Xh>::size);  // x == x fails only for NaN lanes
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits<Packet1Xh>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<Packet1Xh>::size);  // set where neither a nor b is NaN
+
+  return __riscv_vfmax_vv_f16m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xh>::size);  // masked-off lanes keep the value from `nans`
+}
+
+template <>  // PropagateNaN maximum: identical to the default pmax<Packet1Xh>, which already returns NaN for NaN inputs.
+EIGEN_STRONG_INLINE Packet1Xh pmax<PropagateNaN, Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return pmax<Packet1Xh>(a, b);
+}
+
+template <>  // PropagateNumbers maximum: plain f16 vfmax with no extra NaN masking.
+EIGEN_STRONG_INLINE Packet1Xh pmax<PropagateNumbers, Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vfmax_vv_f16m1(a, b, unpacket_traits<Packet1Xh>::size);  // relies on the instruction's own NaN handling
+}
+
+template <>  // Lane-wise a <= b: true lanes take the ptrue pattern (all bits set), false lanes take pzero.
+EIGEN_STRONG_INLINE Packet1Xh pcmp_le<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  PacketMask16 mask = __riscv_vmfle_vv_f16m1_b16(a, b, unpacket_traits<Packet1Xh>::size);
+  return __riscv_vmerge_vvm_f16m1(pzero<Packet1Xh>(a), ptrue<Packet1Xh>(a), mask, unpacket_traits<Packet1Xh>::size);  // mask set -> ptrue lane, else pzero lane
+}
+
+template <>  // Lane-wise a < b: true lanes take the ptrue pattern (all bits set), false lanes take pzero.
+EIGEN_STRONG_INLINE Packet1Xh pcmp_lt<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, b, unpacket_traits<Packet1Xh>::size);
+  return __riscv_vmerge_vvm_f16m1(pzero<Packet1Xh>(a), ptrue<Packet1Xh>(a), mask, unpacket_traits<Packet1Xh>::size);  // mask set -> ptrue lane, else pzero lane
+}
+
+template <>  // Lane-wise a == b: true lanes take the ptrue pattern (all bits set), false lanes take pzero.
+EIGEN_STRONG_INLINE Packet1Xh pcmp_eq<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, b, unpacket_traits<Packet1Xh>::size);
+  return __riscv_vmerge_vvm_f16m1(pzero<Packet1Xh>(a), ptrue<Packet1Xh>(a), mask, unpacket_traits<Packet1Xh>::size);  // mask set -> ptrue lane, else pzero lane
+}
+
+template <>  // Lane-wise "a < b or unordered", built as the complement of a >= b.
+EIGEN_STRONG_INLINE Packet1Xh pcmp_lt_or_nan<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  PacketMask16 mask = __riscv_vmfge_vv_f16m1_b16(a, b, unpacket_traits<Packet1Xh>::size);  // set where a >= b holds
+  return __riscv_vfmerge_vfm_f16m1(ptrue<Packet1Xh>(a), static_cast<_Float16>(0.0), mask,  // mask set -> 0.0, else keep the ptrue lane
+                                   unpacket_traits<Packet1Xh>::size);
+}
+
+// Blend on a predicate mask: lanes whose mask bit is set take `a`, the rest take `b`.
+EIGEN_STRONG_INLINE Packet1Xh pselect(const PacketMask16& mask, const Packet1Xh& a, const Packet1Xh& b) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  return __riscv_vmerge_vvm_f16m1(b, a, mask, vl);
+}
+
+// Blend on a packet-valued mask: a lane of `mask` with any bit set selects `a`,
+// an all-zero lane selects `b`. The blend runs on the raw 16-bit integer view so
+// the selected bit patterns pass through untouched.
+EIGEN_STRONG_INLINE Packet1Xh pselect(const Packet1Xh& mask, const Packet1Xh& a, const Packet1Xh& b) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto mask_bits = __riscv_vreinterpret_v_f16m1_i16m1(mask);
+  const PacketMask16 pick_a = __riscv_vmsne_vx_i16m1_b16(mask_bits, 0, vl);
+  const auto a_bits = __riscv_vreinterpret_v_f16m1_i16m1(a);
+  const auto b_bits = __riscv_vreinterpret_v_f16m1_i16m1(b);
+  return __riscv_vreinterpret_v_i16m1_f16m1(__riscv_vmerge_vvm_i16m1(b_bits, a_bits, pick_a, vl));
+}
+
+// Logical operations are not available on f16 vectors, so the operands are viewed
+// as u16 lanes, combined, and viewed back as f16.
+// Bitwise AND of the two packets.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh pand<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto lhs = __riscv_vreinterpret_v_f16m1_u16m1(a);
+  const auto rhs = __riscv_vreinterpret_v_f16m1_u16m1(b);
+  return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1(lhs, rhs, vl));
+}
+
+// Bitwise OR of the two packets, performed on their u16 lane view.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh por<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto lhs = __riscv_vreinterpret_v_f16m1_u16m1(a);
+  const auto rhs = __riscv_vreinterpret_v_f16m1_u16m1(b);
+  return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vor_vv_u16m1(lhs, rhs, vl));
+}
+
+// Bitwise XOR of the two packets, performed on their u16 lane view.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh pxor<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto lhs = __riscv_vreinterpret_v_f16m1_u16m1(a);
+  const auto rhs = __riscv_vreinterpret_v_f16m1_u16m1(b);
+  return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vxor_vv_u16m1(lhs, rhs, vl));
+}
+
+// Bitwise a & ~b, performed on the u16 lane view of both packets.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh pandnot<Packet1Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto a_bits = __riscv_vreinterpret_v_f16m1_u16m1(a);
+  const auto not_b = __riscv_vnot_v_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(b), vl);
+  return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1(a_bits, not_b, vl));
+}
+
+// Aligned-load entry point: reads one full packet of halves starting at `from`
+// with a unit-stride vle16.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh pload<Packet1Xh>(const Eigen::half* from) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const _Float16* src = reinterpret_cast<const _Float16*>(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m1(src, vl);
+}
+
+// Unaligned-load entry point: reads one full packet of halves starting at `from`
+// with a unit-stride vle16.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh ploadu<Packet1Xh>(const Eigen::half* from) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const _Float16* src = reinterpret_cast<const _Float16*>(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m1(src, vl);
+}
+
+// Loads size/2 halves from `from` and returns each one duplicated into two
+// adjacent lanes: {from[0], from[0], from[1], from[1], ...}.
+//
+// Only the first size/2 source elements contribute to the result, so only that
+// many are loaded and widened. Loading a whole packet would read size/2 elements
+// past the range the ploaddup contract requires to be readable.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh ploaddup<Packet1Xh>(const Eigen::half* from) {
+  const auto half_vl = unpacket_traits<Packet1Xh>::size / 2;
+  EIGEN_DEBUG_ALIGNED_LOAD
+  const Packet1Xsu data = __riscv_vreinterpret_v_f16m1_u16m1(
+      __riscv_vle16_v_f16m1(reinterpret_cast<const _Float16*>(from), half_vl));
+  // Widen each u16 lane x into a u32 lane holding 2*x + 0xffff*x = 0x10001*x, i.e.
+  // x in both 16-bit halves of the 32-bit lane.
+  const auto doubled =
+      __riscv_vwmaccu_vx_u32m2(__riscv_vwaddu_vv_u32m2(data, data, half_vl), 0xffffu, data, half_vl);
+  // The low LMUL=1 register of the widened value holds exactly those size/2 lanes;
+  // viewed as 16-bit lanes it is the duplicated packet.
+  return __riscv_vreinterpret_v_i16m1_f16m1(__riscv_vreinterpret_v_i32m1_i16m1(
+      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vlmul_trunc_v_u32m2_u32m1(doubled))));
+}
+
+// Loads size/4 halves from `from` and returns each one replicated into four
+// adjacent lanes: {from[0] x4, from[1] x4, ...}.
+//
+// The gather indices are lane_id / 4, so only source lanes [0, size/4) are ever
+// read. The load is therefore limited to size/4 elements instead of a whole
+// packet, which would read past the range the ploadquad contract covers.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh ploadquad<Packet1Xh>(const Eigen::half* from) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const Packet1Xsu idx = __riscv_vsrl_vx_u16m1(__riscv_vid_v_u16m1(vl), 2, vl);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  const Packet1Xh src = __riscv_vle16_v_f16m1(reinterpret_cast<const _Float16*>(from), vl / 4);
+  return __riscv_vrgather_vv_f16m1(src, idx, vl);
+}
+
+// Aligned-store entry point: writes the full packet to `to` with a unit-stride vse16.
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet1Xh& from) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  _Float16* dst = reinterpret_cast<_Float16*>(to);
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m1(dst, from, vl);
+}
+
+// Unaligned-store entry point: writes the full packet to `to` with a unit-stride vse16.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet1Xh& from) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  _Float16* dst = reinterpret_cast<_Float16*>(to);
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m1(dst, from, vl);
+}
+
+// Strided load: lane i is read from from[i * stride]. `stride` is given in
+// elements and converted to bytes for the instruction.
+template <>
+EIGEN_DEVICE_FUNC inline Packet1Xh pgather<Eigen::half, Packet1Xh>(const Eigen::half* from, Index stride) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const _Float16* src = reinterpret_cast<const _Float16*>(from);
+  return __riscv_vlse16_v_f16m1(src, stride * sizeof(Eigen::half), vl);
+}
+
+// Strided store: lane i is written to to[i * stride]. `stride` is given in
+// elements and converted to bytes for the instruction.
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, Packet1Xh>(Eigen::half* to, const Packet1Xh& from, Index stride) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  _Float16* dst = reinterpret_cast<_Float16*>(to);
+  __riscv_vsse16(dst, stride * sizeof(Eigen::half), from, vl);
+}
+
+// Returns lane 0 of the packet.
+//
+// The extracted _Float16 is turned into Eigen::half with bit_cast rather than a
+// value conversion, so the 16-bit pattern is returned unchanged (NaN payloads and
+// comparison-mask patterns included). This matches how pfirst<Packet2Xh>
+// extracts its scalar.
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet1Xh>(const Packet1Xh& a) {
+  return numext::bit_cast<Eigen::half>(__riscv_vfmv_f_s_f16m1_f16(a));
+}
+
+// Lane-wise square root.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh psqrt(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  return __riscv_vfsqrt_v_f16m1(a, vl);
+}
+
+// Rounds every lane to an integral value through a float -> int16 -> float
+// conversion pair. Lanes with |a| >= 2^10 and NaN lanes bypass the conversion;
+// NaN lanes are passed through a + a instead.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh print<Packet1Xh>(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+
+  // NaN lanes become a + a; every other lane keeps a.
+  const PacketMask16 is_nan = __riscv_vmfne_vv_f16m1_b16(a, a, vl);
+  const Packet1Xh passthrough = __riscv_vfadd_vv_f16m1_tumu(is_nan, a, a, a, vl);
+
+  // Round trip through int16, then copy the sign back from the input so that the
+  // sign survives a result of zero.
+  const Packet1Xh rounded = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1(a, vl), vl);
+  const Packet1Xh rounded_signed = __riscv_vfsgnj_vv_f16m1(rounded, passthrough, vl);
+
+  // The converted value is used only where |a| < 2^10.
+  const Packet1Xh threshold = pset1<Packet1Xh>(static_cast<Eigen::half>(1 << 10));
+  const PacketMask16 is_small = __riscv_vmflt_vv_f16m1_b16(pabs(a), threshold, vl);
+  return __riscv_vmerge_vvm_f16m1(passthrough, rounded_signed, is_small, vl);
+}
+
+// Floor built on print: start from the rounded value and subtract one in every
+// lane where rounding went above the input.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh pfloor<Packet1Xh>(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const Packet1Xh rounded = print<Packet1Xh>(a);
+  // Lanes with a < rounded overshot; the other lanes keep `rounded` unchanged.
+  const PacketMask16 overshoot = __riscv_vmflt_vv_f16m1_b16(a, rounded, vl);
+  return __riscv_vfsub_vf_f16m1_tumu(overshoot, rounded, rounded, static_cast<_Float16>(1.0), vl);
+}
+
+// Reverses the lane order by gathering with indices (size - 1) - lane_id.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh preverse(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const Packet1Xsu lane_ids = __riscv_vid_v_u16m1(vl);
+  const Packet1Xsu idx = __riscv_vrsub_vx_u16m1(lane_ids, unpacket_traits<Packet1Xh>::size - 1, vl);
+  return __riscv_vrgather_vv_f16m1(a, idx, vl);
+}
+
+// Horizontal sum of all lanes, using the unordered sum reduction seeded with 0.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet1Xh>(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto zero_seed = __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), vl);
+  const auto total = __riscv_vfredusum_vs_f16m1_f16m1(a, zero_seed, vl);
+  return static_cast<Eigen::half>(__riscv_vfmv_f(total));
+}
+
+// Horizontal product of all lanes.
+//
+// After multiplying the packet by its own reverse, lane i holds a[i] * a[size-1-i],
+// so the product of the lower half of the lanes is the full product. Each folding
+// step multiplies the accumulator by a copy of itself slid down by a power of
+// two; how many steps run depends on EIGEN_RISCV64_RVV_VL.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet1Xh>(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  Packet1Xh acc = __riscv_vfmul_vv_f16m1(preverse(a), a, vl);
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    acc = __riscv_vfmul_vv_f16m1(acc, __riscv_vslidedown_vx_f16m1(acc, 16, vl), vl);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    acc = __riscv_vfmul_vv_f16m1(acc, __riscv_vslidedown_vx_f16m1(acc, 8, vl), vl);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    acc = __riscv_vfmul_vv_f16m1(acc, __riscv_vslidedown_vx_f16m1(acc, 4, vl), vl);
+  }
+
+  // The last two folds run for every vector length.
+  acc = __riscv_vfmul_vv_f16m1(acc, __riscv_vslidedown_vx_f16m1(acc, 2, vl), vl);
+  acc = __riscv_vfmul_vv_f16m1(acc, __riscv_vslidedown_vx_f16m1(acc, 1, vl), vl);
+
+  // Lane 0 now carries the reduced value.
+  return pfirst(acc);
+}
+
+// Horizontal minimum: a vfredmin reduction seeded with NaN, then passed through
+// std::min against the largest finite half.
+// NOTE(review): because of that std::min, a reduction result of +inf comes back
+// as the largest finite half -- confirm this clamp is intended.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet1Xh>(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  const Eigen::half max = (std::numeric_limits<Eigen::half>::max)();
+  const auto nan_seed = __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), vl);
+  const Eigen::half reduced =
+      static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m1_f16m1(a, nan_seed, vl)));
+  return (std::min)(reduced, max);
+}
+
+// Horizontal maximum: a vfredmax reduction seeded with NaN, then passed through
+// std::max against the most negative finite half.
+// NOTE(review): because of that std::max, a reduction result of -inf comes back
+// as the most negative finite half -- confirm this clamp is intended.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet1Xh>(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  const Eigen::half min = -(std::numeric_limits<Eigen::half>::max)();
+  const auto nan_seed = __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), vl);
+  const Eigen::half reduced =
+      static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m1_f16m1(a, nan_seed, vl)));
+  return (std::max)(reduced, min);
+}
+
+// Transposes the block through a stack buffer: packet i is scattered with a
+// stride of N elements starting at buffer[i], then the buffer is read back as N
+// consecutive full packets.
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xh, N>& kernel) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  Eigen::half buffer[unpacket_traits<Packet1Xh>::size * N];
+
+  for (int row = 0; row < N; ++row) {
+    _Float16* dst = reinterpret_cast<_Float16*>(buffer + row);
+    __riscv_vsse16(dst, N * sizeof(Eigen::half), kernel.packet[row], vl);
+  }
+
+  for (int row = 0; row < N; ++row) {
+    _Float16* src = reinterpret_cast<_Float16*>(buffer + row * unpacket_traits<Packet1Xh>::size);
+    kernel.packet[row] = __riscv_vle16_v_f16m1(src, vl);
+  }
+}
+
+// Widens an LMUL=1 half packet into an LMUL=2 float packet.
+EIGEN_STRONG_INLINE Packet2Xf half2float(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xf>::size;
+  return __riscv_vfwcvt_f_f_v_f32m2(a, vl);
+}
+
+// Narrows an LMUL=2 float packet into an LMUL=1 half packet.
+EIGEN_STRONG_INLINE Packet1Xh float2half(const Packet2Xf& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  return __riscv_vfncvt_f_f_w_f16m1(a, vl);
+}
+
+/********************************* Packet2Xh ************************************/
+
+// Packet with every bit set: each 16-bit lane is 0xffff, viewed as f16.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh ptrue<Packet2Xh>(const Packet2Xh& /*a*/) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto all_ones = __riscv_vmv_v_x_u16m2(0xffffu, vl);
+  return __riscv_vreinterpret_f16m2(all_ones);
+}
+
+// Packet with every lane set to 0.0.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pzero<Packet2Xh>(const Packet2Xh& /*a*/) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(0.0), vl);
+}
+
+// Lane-wise absolute value.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pabs(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfabs_v_f16m2(a, vl);
+}
+
+// Lane-wise |a - b|.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pabsdiff(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Packet2Xh diff = __riscv_vfsub_vv_f16m2(a, b, vl);
+  return __riscv_vfabs_v_f16m2(diff, vl);
+}
+
+// Broadcasts `from` to every lane; the half is handed to the instruction as the
+// _Float16 with the same bit pattern.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pset1<Packet2Xh>(const Eigen::half& from) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const _Float16 value = numext::bit_cast<_Float16>(from);
+  return __riscv_vfmv_v_f_f16m2(value, vl);
+}
+
+// Broadcasts the raw 16-bit pattern `from` to every lane and views it as f16.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pset1frombits<Packet2Xh>(numext::uint16_t from) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto bits = __riscv_vmv_v_x_u16m2(from, vl);
+  return __riscv_vreinterpret_f16m2(bits);
+}
+
+// Returns {a, a + 1, a + 2, ...}: the lane index converted to half, plus `a`.
+//
+// The lane-index vector is an LMUL=2 vector, so it takes the Packet2Xh lane
+// count like the other operations in this function. The element count of an
+// LMUL=4 packet was passed before, which only worked because an oversized vl
+// request is clamped to the maximum for the vector type.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh plset<Packet2Xh>(const Eigen::half& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Packet2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(vl)), vl);
+  return __riscv_vfadd_vf_f16m2(idx, numext::bit_cast<_Float16>(a), vl);
+}
+
+// Loads the four halves a[0..3] and broadcasts each one to a full packet:
+// a0 <- a[0], a1 <- a[1], a2 <- a[2], a3 <- a[3].
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2Xh>(const Eigen::half* a, Packet2Xh& a0, Packet2Xh& a1, Packet2Xh& a2,
+                                                Packet2Xh& a3) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  // Only four elements are loaded; the gathers below read lanes 0..3 only.
+  const vfloat16m2_t quad = __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16*>(a), 4);
+  a0 = __riscv_vrgather_vx_f16m2(quad, 0, vl);
+  a1 = __riscv_vrgather_vx_f16m2(quad, 1, vl);
+  a2 = __riscv_vrgather_vx_f16m2(quad, 2, vl);
+  a3 = __riscv_vrgather_vx_f16m2(quad, 3, vl);
+}
+
+// Lane-wise a + b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh padd<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfadd_vv_f16m2(a, b, vl);
+}
+
+// Lane-wise a - b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh psub<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfsub_vv_f16m2(a, b, vl);
+}
+
+// Lane-wise -a.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pnegate(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfneg_v_f16m2(a, vl);
+}
+
+// Arithmetic right shift of the raw 16-bit lanes by 15, so every bit of a lane
+// becomes a copy of that lane's sign bit.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh psignbit(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xs>::size;
+  const auto bits = __riscv_vreinterpret_v_f16m2_i16m2(a);
+  return __riscv_vreinterpret_v_i16m2_f16m2(__riscv_vsra_vx_i16m2(bits, 15, vl));
+}
+
+// Conjugate of a real-valued packet: returns the input unchanged.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pconj(const Packet2Xh& a) {
+  return a;
+}
+
+// Lane-wise a * b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmul<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfmul_vv_f16m2(a, b, vl);
+}
+
+// Lane-wise a / b.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pdiv<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfdiv_vv_f16m2(a, b, vl);
+}
+
+// Fused multiply-add: a * b + c (vfmadd).
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmadd(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfmadd_vv_f16m2(a, b, c, vl);
+}
+
+// Fused multiply-subtract: a * b - c (vfmsub).
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmsub(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfmsub_vv_f16m2(a, b, c, vl);
+}
+
+// Negated fused multiply-add: -(a * b) + c, which maps to the vfnmsub instruction.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pnmadd(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfnmsub_vv_f16m2(a, b, c, vl);
+}
+
+// Negated fused multiply-subtract: -(a * b) - c, which maps to the vfnmadd instruction.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pnmsub(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfnmadd_vv_f16m2(a, b, c, vl);
+}
+
+// Lane-wise minimum that yields NaN whenever either input lane is NaN.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmin<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  const Packet2Xh nan_fill = __riscv_vfmv_v_f_f16m2(numext::bit_cast<_Float16>(nan), vl);
+
+  // x == x fails exactly for NaN, so this mask marks lanes where both inputs are numbers.
+  const PacketMask8 a_is_number = __riscv_vmfeq_vv_f16m2_b8(a, a, vl);
+  const PacketMask8 b_is_number = __riscv_vmfeq_vv_f16m2_b8(b, b, vl);
+  const PacketMask8 both_numbers = __riscv_vmand_mm_b8(a_is_number, b_is_number, vl);
+
+  // Masked-in lanes get min(a, b); masked-out lanes keep the NaN fill.
+  return __riscv_vfmin_vv_f16m2_tumu(both_numbers, nan_fill, a, b, vl);
+}
+
+// NaN-propagating minimum: forwards unchanged to the default pmin<Packet2Xh>.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmin<PropagateNaN, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return pmin<Packet2Xh>(a, b);
+}
+
+// Number-propagating minimum: a single vfmin over the whole packet, with no
+// extra NaN handling layered on top of the instruction.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmin<PropagateNumbers, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfmin_vv_f16m2(a, b, vl);
+}
+
+// Lane-wise maximum that yields NaN whenever either input lane is NaN.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmax<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  const Packet2Xh nan_fill = __riscv_vfmv_v_f_f16m2(numext::bit_cast<_Float16>(nan), vl);
+
+  // x == x fails exactly for NaN, so this mask marks lanes where both inputs are numbers.
+  const PacketMask8 a_is_number = __riscv_vmfeq_vv_f16m2_b8(a, a, vl);
+  const PacketMask8 b_is_number = __riscv_vmfeq_vv_f16m2_b8(b, b, vl);
+  const PacketMask8 both_numbers = __riscv_vmand_mm_b8(a_is_number, b_is_number, vl);
+
+  // Masked-in lanes get max(a, b); masked-out lanes keep the NaN fill.
+  return __riscv_vfmax_vv_f16m2_tumu(both_numbers, nan_fill, a, b, vl);
+}
+
+// NaN-propagating maximum: forwards unchanged to the default pmax<Packet2Xh>.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmax<PropagateNaN, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return pmax<Packet2Xh>(a, b);
+}
+
+// Number-propagating maximum: a single vfmax over the whole packet, with no
+// extra NaN handling layered on top of the instruction.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pmax<PropagateNumbers, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfmax_vv_f16m2(a, b, vl);
+}
+
+// Lane-wise a <= b. Lanes where the comparison holds receive ptrue's pattern,
+// every other lane receives pzero's.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcmp_le<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const PacketMask8 holds = __riscv_vmfle_vv_f16m2_b8(a, b, vl);
+  const Packet2Xh if_false = pzero<Packet2Xh>(a);
+  const Packet2Xh if_true = ptrue<Packet2Xh>(a);
+  return __riscv_vmerge_vvm_f16m2(if_false, if_true, holds, vl);
+}
+
+// Lane-wise a < b. Lanes where the comparison holds receive ptrue's pattern,
+// every other lane receives pzero's.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcmp_lt<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const PacketMask8 holds = __riscv_vmflt_vv_f16m2_b8(a, b, vl);
+  const Packet2Xh if_false = pzero<Packet2Xh>(a);
+  const Packet2Xh if_true = ptrue<Packet2Xh>(a);
+  return __riscv_vmerge_vvm_f16m2(if_false, if_true, holds, vl);
+}
+
+// Lane-wise a == b. Lanes where the comparison holds receive ptrue's pattern,
+// every other lane receives pzero's.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcmp_eq<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const PacketMask8 holds = __riscv_vmfeq_vv_f16m2_b8(a, b, vl);
+  const Packet2Xh if_false = pzero<Packet2Xh>(a);
+  const Packet2Xh if_true = ptrue<Packet2Xh>(a);
+  return __riscv_vmerge_vvm_f16m2(if_false, if_true, holds, vl);
+}
+
+// Lane-wise "a < b or unordered", built as the complement of a >= b: lanes where
+// a >= b holds are set to 0.0, all remaining lanes keep ptrue's pattern.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcmp_lt_or_nan<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const PacketMask8 ge_mask = __riscv_vmfge_vv_f16m2_b8(a, b, vl);
+  const Packet2Xh all_true = ptrue<Packet2Xh>(a);
+  return __riscv_vfmerge_vfm_f16m2(all_true, static_cast<_Float16>(0.0), ge_mask, vl);
+}
+
+// Blend on a predicate mask: lanes whose mask bit is set take `a`, the rest take `b`.
+EIGEN_STRONG_INLINE Packet2Xh pselect(const PacketMask8& mask, const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vmerge_vvm_f16m2(b, a, mask, vl);
+}
+
+// Blend on a packet-valued mask: a lane of `mask` with any bit set selects `a`,
+// an all-zero lane selects `b`. The blend runs on the raw 16-bit integer view so
+// the selected bit patterns pass through untouched.
+EIGEN_STRONG_INLINE Packet2Xh pselect(const Packet2Xh& mask, const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto mask_bits = __riscv_vreinterpret_v_f16m2_i16m2(mask);
+  const PacketMask8 pick_a = __riscv_vmsne_vx_i16m2_b8(mask_bits, 0, vl);
+  const auto a_bits = __riscv_vreinterpret_v_f16m2_i16m2(a);
+  const auto b_bits = __riscv_vreinterpret_v_f16m2_i16m2(b);
+  return __riscv_vreinterpret_v_i16m2_f16m2(__riscv_vmerge_vvm_i16m2(b_bits, a_bits, pick_a, vl));
+}
+
+// Logical operations are not available on f16 vectors, so the operands are viewed
+// as u16 lanes, combined, and viewed back as f16.
+// Bitwise AND of the two packets.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pand<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto lhs = __riscv_vreinterpret_v_f16m2_u16m2(a);
+  const auto rhs = __riscv_vreinterpret_v_f16m2_u16m2(b);
+  return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(lhs, rhs, vl));
+}
+
+// Bitwise OR of the two packets, performed on their u16 lane view.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh por<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto lhs = __riscv_vreinterpret_v_f16m2_u16m2(a);
+  const auto rhs = __riscv_vreinterpret_v_f16m2_u16m2(b);
+  return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(lhs, rhs, vl));
+}
+
+// Bitwise XOR of the two packets, performed on their u16 lane view.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pxor<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto lhs = __riscv_vreinterpret_v_f16m2_u16m2(a);
+  const auto rhs = __riscv_vreinterpret_v_f16m2_u16m2(b);
+  return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(lhs, rhs, vl));
+}
+
+// Bitwise a & ~b, performed on the u16 lane view of both packets.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pandnot<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto a_bits = __riscv_vreinterpret_v_f16m2_u16m2(a);
+  const auto not_b = __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), vl);
+  return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(a_bits, not_b, vl));
+}
+
+// Aligned-load entry point: reads one full packet of halves starting at `from`
+// with a unit-stride vle16.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pload<Packet2Xh>(const Eigen::half* from) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const _Float16* src = reinterpret_cast<const _Float16*>(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(src, vl);
+}
+
+// Unaligned-load entry point: reads one full packet of halves starting at `from`
+// with a unit-stride vle16.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh ploadu<Packet2Xh>(const Eigen::half* from) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const _Float16* src = reinterpret_cast<const _Float16*>(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(src, vl);
+}
+
+// Loads size/2 halves from `from` and returns each one duplicated into two
+// adjacent lanes: {from[0], from[0], from[1], from[1], ...}.
+//
+// Only the first size/2 source elements contribute to the result, so only that
+// many are loaded and widened. Loading a whole packet would read size/2 elements
+// past the range the ploaddup contract requires to be readable.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh ploaddup<Packet2Xh>(const Eigen::half* from) {
+  const auto half_vl = unpacket_traits<Packet2Xh>::size / 2;
+  EIGEN_DEBUG_ALIGNED_LOAD
+  const Packet2Xsu data = __riscv_vreinterpret_v_f16m2_u16m2(
+      __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16*>(from), half_vl));
+  // Widen each u16 lane x into a u32 lane holding 2*x + 0xffff*x = 0x10001*x, i.e.
+  // x in both 16-bit halves of the 32-bit lane.
+  const auto doubled =
+      __riscv_vwmaccu_vx_u32m4(__riscv_vwaddu_vv_u32m4(data, data, half_vl), 0xffffu, data, half_vl);
+  // The low LMUL=2 part of the widened value holds exactly those size/2 lanes;
+  // viewed as 16-bit lanes it is the duplicated packet.
+  return __riscv_vreinterpret_v_i16m2_f16m2(__riscv_vreinterpret_v_i32m2_i16m2(
+      __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vlmul_trunc_v_u32m4_u32m2(doubled))));
+}
+
+// Loads size/4 halves from `from` and returns each one replicated into four
+// adjacent lanes: {from[0] x4, from[1] x4, ...}.
+//
+// The gather indices are lane_id / 4, so only source lanes [0, size/4) are ever
+// read. The load is therefore limited to size/4 elements instead of a whole
+// packet, which would read past the range the ploadquad contract covers.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh ploadquad<Packet2Xh>(const Eigen::half* from) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Packet2Xsu idx = __riscv_vsrl_vx_u16m2(__riscv_vid_v_u16m2(vl), 2, vl);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  const Packet2Xh src = __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16*>(from), vl / 4);
+  return __riscv_vrgather_vv_f16m2(src, idx, vl);
+}
+
+// Aligned-store entry point: writes the full packet to `to` with a unit-stride vse16.
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet2Xh& from) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  _Float16* dst = reinterpret_cast<_Float16*>(to);
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(dst, from, vl);
+}
+
+// Unaligned-store entry point: writes the full packet to `to` with a unit-stride vse16.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet2Xh& from) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  _Float16* dst = reinterpret_cast<_Float16*>(to);
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(dst, from, vl);
+}
+
+// Strided load: lane i is read from from[i * stride]. `stride` is given in
+// elements and converted to bytes for the instruction.
+template <>
+EIGEN_DEVICE_FUNC inline Packet2Xh pgather<Eigen::half, Packet2Xh>(const Eigen::half* from, Index stride) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const _Float16* src = reinterpret_cast<const _Float16*>(from);
+  return __riscv_vlse16_v_f16m2(src, stride * sizeof(Eigen::half), vl);
+}
+
+// Strided store: lane i is written to to[i * stride]. `stride` is given in
+// elements and converted to bytes for the instruction.
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, Packet2Xh>(Eigen::half* to, const Packet2Xh& from, Index stride) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  _Float16* dst = reinterpret_cast<_Float16*>(to);
+  __riscv_vsse16(dst, stride * sizeof(Eigen::half), from, vl);
+}
+
+// Returns lane 0 of the packet, keeping its 16-bit pattern unchanged.
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet2Xh>(const Packet2Xh& a) {
+  const _Float16 lane0 = __riscv_vfmv_f_s_f16m2_f16(a);
+  return numext::bit_cast<Eigen::half>(lane0);
+}
+
+// Lane-wise square root.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh psqrt(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfsqrt_v_f16m2(a, vl);
+}
+
+// Rounds every lane to an integral value through a float -> int16 -> float
+// conversion pair. Lanes with |a| >= 2^10 and NaN lanes bypass the conversion;
+// NaN lanes are passed through a + a instead.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh print<Packet2Xh>(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+
+  // NaN lanes become a + a; every other lane keeps a.
+  const PacketMask8 is_nan = __riscv_vmfne_vv_f16m2_b8(a, a, vl);
+  const Packet2Xh passthrough = __riscv_vfadd_vv_f16m2_tumu(is_nan, a, a, a, vl);
+
+  // Round trip through int16, then copy the sign back from the input so that the
+  // sign survives a result of zero.
+  const Packet2Xh rounded = __riscv_vfcvt_f_x_v_f16m2(__riscv_vfcvt_x_f_v_i16m2(a, vl), vl);
+  const Packet2Xh rounded_signed = __riscv_vfsgnj_vv_f16m2(rounded, passthrough, vl);
+
+  // The converted value is used only where |a| < 2^10.
+  const Packet2Xh threshold = pset1<Packet2Xh>(static_cast<Eigen::half>(1 << 10));
+  const PacketMask8 is_small = __riscv_vmflt_vv_f16m2_b8(pabs(a), threshold, vl);
+  return __riscv_vmerge_vvm_f16m2(passthrough, rounded_signed, is_small, vl);
+}
+
+// Floor built on print: start from the rounded value and subtract one in every
+// lane where rounding went above the input.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pfloor<Packet2Xh>(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Packet2Xh rounded = print<Packet2Xh>(a);
+  // Lanes with a < rounded overshot; the other lanes keep `rounded` unchanged.
+  const PacketMask8 overshoot = __riscv_vmflt_vv_f16m2_b8(a, rounded, vl);
+  return __riscv_vfsub_vf_f16m2_tumu(overshoot, rounded, rounded, static_cast<_Float16>(1.0), vl);
+}
+
+// Reverses the lane order by gathering with indices (size - 1) - lane_id.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh preverse(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Packet2Xsu lane_ids = __riscv_vid_v_u16m2(vl);
+  const Packet2Xsu idx = __riscv_vrsub_vx_u16m2(lane_ids, unpacket_traits<Packet2Xh>::size - 1, vl);
+  return __riscv_vrgather_vv_f16m2(a, idx, vl);
+}
+
+// Horizontal sum of all lanes, using the unordered sum reduction. The LMUL=1
+// seed vector is filled with 0 over size/2 lanes; the reduction runs over the
+// full LMUL=2 packet.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet2Xh>(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const auto zero_seed = __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<Packet2Xh>::size / 2);
+  const auto total = __riscv_vfredusum_vs_f16m2_f16m1(a, zero_seed, vl);
+  return static_cast<Eigen::half>(__riscv_vfmv_f(total));
+}
+
+// Horizontal product: multiplies the two LMUL=1 halves of the packet lane by
+// lane, then defers to predux_mul<Packet1Xh> for the remaining reduction.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet2Xh>(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto lower = __riscv_vget_v_f16m2_f16m1(a, 0);
+  const auto upper = __riscv_vget_v_f16m2_f16m1(a, 1);
+  return predux_mul<Packet1Xh>(__riscv_vfmul_vv_f16m1(lower, upper, vl));
+}
+
+// Horizontal minimum: a vfredmin reduction seeded with NaN, then passed through
+// std::min against the largest finite half.
+// NOTE(review): because of that std::min, a reduction result of +inf comes back
+// as the largest finite half -- confirm this clamp is intended.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet2Xh>(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  const Eigen::half max = (std::numeric_limits<Eigen::half>::max)();
+  const auto nan_seed =
+      __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits<Packet2Xh>::size / 2);
+  const Eigen::half reduced =
+      static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1(a, nan_seed, vl)));
+  return (std::min)(reduced, max);
+}
+
+// Horizontal maximum: a vfredmax reduction seeded with NaN, then passed through
+// std::max against the most negative finite half.
+// NOTE(review): because of that std::max, a reduction result of -inf comes back
+// as the most negative finite half -- confirm this clamp is intended.
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet2Xh>(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  const Eigen::half nan = (std::numeric_limits<Eigen::half>::quiet_NaN)();
+  const Eigen::half min = -(std::numeric_limits<Eigen::half>::max)();
+  const auto nan_seed =
+      __riscv_vfmv_v_f_f16m1(numext::bit_cast<_Float16>(nan), unpacket_traits<Packet2Xh>::size / 2);
+  const Eigen::half reduced =
+      static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1(a, nan_seed, vl)));
+  return (std::max)(reduced, min);
+}
+
+// Transposes the block through a stack buffer: packet i is scattered with a
+// stride of N elements starting at buffer[i], then the buffer is read back as N
+// consecutive full packets.
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xh, N>& kernel) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  Eigen::half buffer[unpacket_traits<Packet2Xh>::size * N];
+
+  for (int row = 0; row < N; ++row) {
+    _Float16* dst = reinterpret_cast<_Float16*>(buffer + row);
+    __riscv_vsse16(dst, N * sizeof(Eigen::half), kernel.packet[row], vl);
+  }
+
+  for (int row = 0; row < N; ++row) {
+    _Float16* src = reinterpret_cast<_Float16*>(buffer + row * unpacket_traits<Packet2Xh>::size);
+    kernel.packet[row] = __riscv_vle16_v_f16m2(src, vl);
+  }
+}
+
+// Widens an LMUL=2 half packet into an LMUL=4 float packet.
+EIGEN_STRONG_INLINE Packet4Xf half2float(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet4Xf>::size;
+  return __riscv_vfwcvt_f_f_v_f32m4(a, vl);
+}
+
+// Narrows an LMUL=4 float packet into an LMUL=2 half packet.
+EIGEN_STRONG_INLINE Packet2Xh float2half(const Packet4Xf& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfncvt_f_f_w_f16m2(a, vl);
+}
+
+// Adds the two LMUL=1 halves of an LMUL=2 packet lane by lane and returns the
+// result as a Packet1Xh. Enabled only when Packet is Packet2Xh and the Packet2Xh
+// size is a multiple of 8.
+template <typename Packet = Packet2Xh>
+EIGEN_STRONG_INLINE
+    std::enable_if_t<std::is_same<Packet, Packet2Xh>::value && (unpacket_traits<Packet2Xh>::size % 8) == 0, Packet1Xh>
+    predux_half(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto lower = __riscv_vget_v_f16m2_f16m1(a, 0);
+  const auto upper = __riscv_vget_v_f16m2_f16m1(a, 1);
+  return __riscv_vfadd_vv_f16m1(lower, upper, vl);
+}
+
+// Math functions for the half packets, one F16_PACKET_FUNCTION line per function.
+// Each half packet is paired with the float packet of twice the LMUL, the same
+// pairing half2float/float2half use.
+// NOTE(review): the macro is defined outside this file section; presumably it
+// implements METHOD for the half packet by widening to the float packet, calling
+// the float METHOD and narrowing back -- confirm against the macro definition.
+
+// Packet1Xh, evaluated through Packet2Xf.
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, pcos)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, pexp)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, pexpm1)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, plog)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, plog1p)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, plog2)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, preciprocal)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, prsqrt)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, psin)
+F16_PACKET_FUNCTION(Packet2Xf, Packet1Xh, ptanh)
+
+// Packet2Xh, evaluated through Packet4Xf.
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pcos)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pexp)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pexpm1)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog1p)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog2)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, preciprocal)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, prsqrt)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, psin)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, ptanh)
+
+/********************************* casting ************************************/
+
+// Cast traits for _Float16 -> int16_t: flags the cast as vectorized, with source
+// and target coefficient ratios both 1.
+// NOTE(review): the half packets in this file load and store Eigen::half, while
+// this trait is keyed on _Float16 -- confirm which scalar type the cast lookup uses.
+template <>
+struct type_casting_traits<_Float16, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+// Cast traits for int16_t -> _Float16: flags the cast as vectorized, with source
+// and target coefficient ratios both 1.
+// NOTE(review): the half packets in this file load and store Eigen::half, while
+// this trait is keyed on _Float16 -- confirm which scalar type the cast lookup uses.
+template <>
+struct type_casting_traits<numext::int16_t, _Float16> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+// Value conversion int16 -> half, lane by lane.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh pcast<Packet1Xs, Packet1Xh>(const Packet1Xs& a) {
+  const auto vl = unpacket_traits<Packet1Xs>::size;
+  return __riscv_vfcvt_f_x_v_f16m1(a, vl);
+}
+
+// Value conversion half -> int16, lane by lane, rounding toward zero (rtz).
+template <>
+EIGEN_STRONG_INLINE Packet1Xs pcast<Packet1Xh, Packet1Xs>(const Packet1Xh& a) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  return __riscv_vfcvt_rtz_x_f_v_i16m1(a, vl);
+}
+
+// Views the int16 lanes as half lanes; this is a reinterpret, not a value conversion.
+template <>
+EIGEN_STRONG_INLINE Packet1Xh preinterpret<Packet1Xh, Packet1Xs>(const Packet1Xs& a) {
+  return __riscv_vreinterpret_v_i16m1_f16m1(a);
+}
+
+// Views the half lanes as int16 lanes; this is a reinterpret, not a value conversion.
+template <>
+EIGEN_STRONG_INLINE Packet1Xs preinterpret<Packet1Xs, Packet1Xh>(const Packet1Xh& a) {
+  return __riscv_vreinterpret_v_f16m1_i16m1(a);
+}
+
+// Value conversion int16 -> half, lane by lane (LMUL=2).
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcast<Packet2Xs, Packet2Xh>(const Packet2Xs& a) {
+  const auto vl = unpacket_traits<Packet2Xs>::size;
+  return __riscv_vfcvt_f_x_v_f16m2(a, vl);
+}
+
+// Value conversion half -> int16, lane by lane, rounding toward zero (LMUL=2).
+template <>
+EIGEN_STRONG_INLINE Packet2Xs pcast<Packet2Xh, Packet2Xs>(const Packet2Xh& a) {
+  const auto vl = unpacket_traits<Packet2Xh>::size;
+  return __riscv_vfcvt_rtz_x_f_v_i16m2(a, vl);
+}
+
+// Views the int16 lanes as half lanes (LMUL=2); a reinterpret, not a value conversion.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh preinterpret<Packet2Xh, Packet2Xs>(const Packet2Xs& a) {
+  return __riscv_vreinterpret_v_i16m2_f16m2(a);
+}
+
+// Views the half lanes as int16 lanes (LMUL=2); a reinterpret, not a value conversion.
+template <>
+EIGEN_STRONG_INLINE Packet2Xs preinterpret<Packet2Xs, Packet2Xh>(const Packet2Xh& a) {
+  return __riscv_vreinterpret_v_f16m2_i16m2(a);
+}
+
+// Converts four LMUL=1 half packets to int16 (rounding toward zero) and
+// concatenates them, in argument order, into one LMUL=4 int16 packet.
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pcast<Packet1Xh, Packet4Xs>(const Packet1Xh& a, const Packet1Xh& b, const Packet1Xh& c,
+                                                          const Packet1Xh& d) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto part0 = __riscv_vfcvt_rtz_x_f_v_i16m1(a, vl);
+  const auto part1 = __riscv_vfcvt_rtz_x_f_v_i16m1(b, vl);
+  const auto part2 = __riscv_vfcvt_rtz_x_f_v_i16m1(c, vl);
+  const auto part3 = __riscv_vfcvt_rtz_x_f_v_i16m1(d, vl);
+  return __riscv_vcreate_v_i16m1_i16m4(part0, part1, part2, part3);
+}
+
+// Converts two LMUL=1 int16 packets to half and concatenates them, in argument
+// order, into one LMUL=2 half packet.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcast<Packet1Xs, Packet2Xh>(const Packet1Xs& a, const Packet1Xs& b) {
+  const auto vl = unpacket_traits<Packet1Xs>::size;
+  const auto lower = __riscv_vfcvt_f_x_v_f16m1(a, vl);
+  const auto upper = __riscv_vfcvt_f_x_v_f16m1(b, vl);
+  return __riscv_vcreate_v_f16m1_f16m2(lower, upper);
+}
+
+// Concatenates two LMUL=1 half packets, in argument order, into one LMUL=2 half
+// packet; no element conversion takes place.
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcast<Packet1Xh, Packet2Xh>(const Packet1Xh& a, const Packet1Xh& b) {
+  return __riscv_vcreate_v_f16m1_f16m2(a, b);
+}
+
+// Converts two LMUL=1 half packets to int16 (rounding toward zero) and
+// concatenates them, in argument order, into one LMUL=2 int16 packet.
+template <>
+EIGEN_STRONG_INLINE Packet2Xs pcast<Packet1Xh, Packet2Xs>(const Packet1Xh& a, const Packet1Xh& b) {
+  const auto vl = unpacket_traits<Packet1Xh>::size;
+  const auto lower = __riscv_vfcvt_rtz_x_f_v_i16m1(a, vl);
+  const auto upper = __riscv_vfcvt_rtz_x_f_v_i16m1(b, vl);
+  return __riscv_vcreate_v_i16m1_i16m2(lower, upper);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_FP16_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h
new file mode 100644
index 00000000000..2b0d3db47b6
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h
@@ -0,0 +1,284 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva <kseniya.zaytseva@syntacore.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_RVV10_H
+#define EIGEN_TYPE_CASTING_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/********************************* 32 bits ************************************/
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pcast<Packet1Xi, Packet1Xf>(const Packet1Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pcast<Packet1Xf, Packet1Xi>(const Packet1Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf preinterpret<Packet1Xf, Packet1Xi>(const Packet1Xi& a) {
+  return __riscv_vreinterpret_v_i32m1_f32m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi preinterpret<Packet1Xi, Packet1Xf>(const Packet1Xf& a) {
+  return __riscv_vreinterpret_v_f32m1_i32m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pcast<Packet4Xi, Packet4Xf>(const Packet4Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m4(a, unpacket_traits<Packet4Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xi pcast<Packet4Xf, Packet4Xi>(const Packet4Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m4(a, unpacket_traits<Packet4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf preinterpret<Packet4Xf, Packet4Xi>(const Packet4Xi& a) {
+  return __riscv_vreinterpret_v_i32m4_f32m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xi preinterpret<Packet4Xi, Packet4Xf>(const Packet4Xf& a) {
+  return __riscv_vreinterpret_v_f32m4_i32m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pcast<Packet2Xi, Packet2Xf>(const Packet2Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m2(a, unpacket_traits<Packet2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xi pcast<Packet2Xf, Packet2Xi>(const Packet2Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m2(a, unpacket_traits<Packet2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xf preinterpret<Packet2Xf, Packet2Xi>(const Packet2Xi& a) {
+  return __riscv_vreinterpret_v_i32m2_f32m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xi preinterpret<Packet2Xi, Packet2Xf>(const Packet2Xf& a) {
+  return __riscv_vreinterpret_v_f32m2_i32m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xi pcast<Packet1Xi, Packet4Xi>(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c,
+                                                               const Packet1Xi& d) {
+  return __riscv_vcreate_v_i32m1_i32m4(a, b, c, d);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pcast<Packet1Xi, Packet4Xf>(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c,
+                                                          const Packet1Xi& d) {
+  // Each m1 operand goes through the single-register int32 -> float cast specialized earlier in this file;
+  // the four converted registers are then combined into one m4 register group.
+  return __riscv_vcreate_v_f32m1_f32m4(pcast<Packet1Xi, Packet1Xf>(a), pcast<Packet1Xi, Packet1Xf>(b),
+                                       pcast<Packet1Xi, Packet1Xf>(c), pcast<Packet1Xi, Packet1Xf>(d));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xf pcast<Packet1Xf, Packet4Xf>(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c,
+                                                               const Packet1Xf& d) {
+  return __riscv_vcreate_v_f32m1_f32m4(a, b, c, d);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xi pcast<Packet1Xf, Packet4Xi>(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c,
+                                                          const Packet1Xf& d) {
+  // Each m1 operand goes through the single-register float -> int32 cast specialized earlier in this file
+  // (the vfcvt_rtz variant); the four converted registers are then combined into one m4 register group.
+  return __riscv_vcreate_v_i32m1_i32m4(pcast<Packet1Xf, Packet1Xi>(a), pcast<Packet1Xf, Packet1Xi>(b),
+                                       pcast<Packet1Xf, Packet1Xi>(c), pcast<Packet1Xf, Packet1Xi>(d));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xi pcast<Packet1Xi, Packet2Xi>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vcreate_v_i32m1_i32m2(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pcast<Packet1Xi, Packet2Xf>(const Packet1Xi& a, const Packet1Xi& b) {
+  // Cast both m1 operands with the single-register int32 -> float cast, then combine them into one m2 group.
+  return __riscv_vcreate_v_f32m1_f32m2(pcast<Packet1Xi, Packet1Xf>(a), pcast<Packet1Xi, Packet1Xf>(b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xf pcast<Packet1Xf, Packet2Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vcreate_v_f32m1_f32m2(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xi pcast<Packet1Xf, Packet2Xi>(const Packet1Xf& a, const Packet1Xf& b) {
+  // Cast both m1 operands with the single-register float -> int32 (vfcvt_rtz) cast, then combine into one m2 group.
+  return __riscv_vcreate_v_i32m1_i32m2(pcast<Packet1Xf, Packet1Xi>(a), pcast<Packet1Xf, Packet1Xi>(b));
+}
+
+/********************************* 64 bits ************************************/
+
+template <>
+struct type_casting_traits<double, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pcast<Packet1Xl, Packet1Xd>(const Packet1Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pcast<Packet1Xd, Packet1Xl>(const Packet1Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd preinterpret<Packet1Xd, Packet1Xl>(const Packet1Xl& a) {
+  return __riscv_vreinterpret_v_i64m1_f64m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl preinterpret<Packet1Xl, Packet1Xd>(const Packet1Xd& a) {
+  return __riscv_vreinterpret_v_f64m1_i64m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pcast<Packet4Xl, Packet4Xd>(const Packet4Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m4(a, unpacket_traits<Packet4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pcast<Packet4Xd, Packet4Xl>(const Packet4Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m4(a, unpacket_traits<Packet4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd preinterpret<Packet4Xd, Packet4Xl>(const Packet4Xl& a) {
+  return __riscv_vreinterpret_v_i64m4_f64m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl preinterpret<Packet4Xl, Packet4Xd>(const Packet4Xd& a) {
+  return __riscv_vreinterpret_v_f64m4_i64m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pcast<Packet2Xl, Packet2Xd>(const Packet2Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m2(a, unpacket_traits<Packet2Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pcast<Packet2Xd, Packet2Xl>(const Packet2Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m2(a, unpacket_traits<Packet2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xd preinterpret<Packet2Xd, Packet2Xl>(const Packet2Xl& a) {
+  return __riscv_vreinterpret_v_i64m2_f64m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xl preinterpret<Packet2Xl, Packet2Xd>(const Packet2Xd& a) {
+  return __riscv_vreinterpret_v_f64m2_i64m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pcast<Packet1Xl, Packet4Xl>(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c,
+                                                          const Packet1Xl& d) {
+  // No value conversion is needed: the four int64 m1 operands are combined into one m4 register group.
+  return __riscv_vcreate_v_i64m1_i64m4(a, b, c, d);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pcast<Packet1Xl, Packet4Xd>(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c,
+                                                          const Packet1Xl& d) {
+  // Each m1 operand goes through the single-register int64 -> double cast specialized earlier in this file;
+  // the four converted registers are then combined into one m4 register group.
+  return __riscv_vcreate_v_f64m1_f64m4(pcast<Packet1Xl, Packet1Xd>(a), pcast<Packet1Xl, Packet1Xd>(b),
+                                       pcast<Packet1Xl, Packet1Xd>(c), pcast<Packet1Xl, Packet1Xd>(d));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xd pcast<Packet1Xd, Packet4Xd>(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c,
+                                                               const Packet1Xd& d) {
+  return __riscv_vcreate_v_f64m1_f64m4(a, b, c, d);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xl pcast<Packet1Xd, Packet4Xl>(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c,
+                                                          const Packet1Xd& d) {
+  // Each m1 operand goes through the single-register double -> int64 cast specialized earlier in this file
+  // (the vfcvt_rtz variant); the four converted registers are then combined into one m4 register group.
+  return __riscv_vcreate_v_i64m1_i64m4(pcast<Packet1Xd, Packet1Xl>(a), pcast<Packet1Xd, Packet1Xl>(b),
+                                       pcast<Packet1Xd, Packet1Xl>(c), pcast<Packet1Xd, Packet1Xl>(d));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pcast<Packet1Xl, Packet2Xl>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vcreate_v_i64m1_i64m2(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pcast<Packet1Xl, Packet2Xd>(const Packet1Xl& a, const Packet1Xl& b) {
+  // Cast both m1 operands with the single-register int64 -> double cast, then combine them into one m2 group.
+  return __riscv_vcreate_v_f64m1_f64m2(pcast<Packet1Xl, Packet1Xd>(a), pcast<Packet1Xl, Packet1Xd>(b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xd pcast<Packet1Xd, Packet2Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vcreate_v_f64m1_f64m2(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xl pcast<Packet1Xd, Packet2Xl>(const Packet1Xd& a, const Packet1Xd& b) {
+  // Cast both m1 operands with the single-register double -> int64 (vfcvt_rtz) cast, then combine into one m2 group.
+  return __riscv_vcreate_v_i64m1_i64m2(pcast<Packet1Xd, Packet1Xl>(a), pcast<Packet1Xd, Packet1Xl>(b));
+}
+
+/********************************* 16 bits ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xs pcast<Packet1Xs, Packet2Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vcreate_v_i16m1_i16m2(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pcast<Packet1Xs, Packet4Xs>(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c,
+                                                               const Packet1Xs& d) {
+  return __riscv_vcreate_v_i16m1_i16m4(a, b, c, d);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_RVV10_H
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index f79da7b8cd7..86158958e5e 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -49,7 +49,6 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
     HasMin = 0,
     HasMax = 0,
     HasSetLinear = 0,
-    HasBlend = 1
   };
 };
 #endif
@@ -245,7 +244,8 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
     HasAbs2 = 0,
     HasMin = 0,
     HasMax = 0,
-    HasSetLinear = 0
+    HasSetLinear = 0,
+    HasExp = 1
   };
 };
 #endif
@@ -278,7 +278,7 @@ EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
-  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000, 0x0, 0x0, 0x0));
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(static_cast<int32_t>(0x80000000), 0x0, 0x0, 0x0));
   return Packet1cd(_mm_xor_pd(a.v, mask));
 }
 
@@ -324,7 +324,6 @@ EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packe
   return Packet1cd(_mm_andnot_pd(b.v, a.v));
 }
 
-// FIXME force unaligned load, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from));
@@ -344,7 +343,6 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* fr
   return pset1<Packet1cd>(*from);
 }
 
-// FIXME force unaligned store, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v);
@@ -413,37 +411,8 @@ EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
   return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
-                                     const Packet2cf& elsePacket) {
-  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
-  return Packet2cf(_mm_castpd_ps(result));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
-  return psqrt_complex<Packet1cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
-  return psqrt_complex<Packet2cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
-  return plog_complex<Packet1cd>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
-  return plog_complex<Packet2cf>(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
-  return pexp_complex<Packet2cf>(a);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet1cd)
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet2cf)
 
 #ifdef EIGEN_VECTORIZE_FMA
 // std::complex<float>
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index e8902cff6ad..695a173c19c 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -91,6 +91,8 @@ struct shuffle_mask {
   enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
 };
 
+#define SIGN_MASK_I32 static_cast<int32_t>(0x80000000)
+
 // TODO: change the implementation of all swizzle* ops from macro to template,
 #define vec4f_swizzle1(v, p, q, r, s) \
   Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))
@@ -183,15 +185,22 @@ struct packet_traits<float> : default_packet_traits {
     HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
     HasATanh = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
     HasLog = 1,
     HasLog1p = 1,
+    HasLog10 = 1,
     HasExpm1 = 1,
     HasNdtri = 1,
     HasExp = 1,
+    HasPow = 1,
     HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
@@ -199,7 +208,6 @@ struct packet_traits<float> : default_packet_traits {
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
-    HasBlend = 1,
     HasSign = 0  // The manually vectorized version is slightly slower for SSE.
   };
 };
@@ -216,17 +224,25 @@ struct packet_traits<double> : default_packet_traits {
     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasASinh = 1,
+    HasACosh = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasLog = 1,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasLog10 = 1,
     HasExp = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasPow = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasCbrt = 1,
     HasATan = 1,
     HasATanh = 1,
-    HasBlend = 1
   };
 };
 template <>
@@ -241,7 +257,6 @@ struct packet_traits<int> : default_packet_traits {
     HasCmp = 1,
     HasDiv = 1,
     HasShift = 1,
-    HasBlend = 1
   };
 };
 template <>
@@ -253,11 +268,9 @@ struct packet_traits<uint32_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
 
-    HasDiv = 0,
     HasNegate = 0,
     HasCmp = 1,
     HasShift = 1,
-    HasBlend = 1
   };
 };
 template <>
@@ -269,10 +282,8 @@ struct packet_traits<int64_t> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 2,
 
-    HasDiv = 0,
     HasCmp = 1,
     HasShift = 1,
-    HasBlend = 1
   };
 };
 #endif
@@ -285,10 +296,9 @@ struct packet_traits<bool> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 16,
 
-    HasCmp = 1,  // note -- only pcmp_eq is defined
+    HasCmp = 1,
     HasShift = 0,
     HasAbs = 0,
-    HasAbs2 = 0,
     HasMin = 0,
     HasMax = 0,
     HasConj = 0,
@@ -562,7 +572,7 @@ EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f
 #ifdef EIGEN_VECTORIZE_SSE3
   return _mm_addsub_ps(a, b);
 #else
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(SIGN_MASK_I32, 0x0, SIGN_MASK_I32, 0x0));
   return padd(a, pxor(mask, b));
 #endif
 }
@@ -574,19 +584,19 @@ EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d
 #ifdef EIGEN_VECTORIZE_SSE3
   return _mm_addsub_pd(a, b);
 #else
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, SIGN_MASK_I32, 0x0, 0x0));
   return padd(a, pxor(mask, b));
 #endif
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32, SIGN_MASK_I32));
   return _mm_xor_ps(a, mask);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, SIGN_MASK_I32, 0x0, SIGN_MASK_I32));
   return _mm_xor_pd(a, mask);
 }
 template <>
@@ -881,7 +891,14 @@ template <>
 EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
   return _mm_andnot_si128(b, a);
 }
-
+template <>
+EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_andnot_si128(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pcmp_lt(const Packet16b& a, const Packet16b& b) {
+  return _mm_andnot_si128(a, b);
+}
 template <>
 EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
   return _mm_cmple_ps(a, b);
@@ -925,7 +942,11 @@ EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));
+#else
   return por(pcmp_lt(a, b), pcmp_eq(a, b));
+#endif
 }
 template <>
 EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
@@ -1240,7 +1261,7 @@ EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
-  const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
+  const __m128i mask = _mm_setr_epi32(-1, 0x7FFFFFFF, -1, 0x7FFFFFFF);
   return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
 }
 template <>
@@ -1387,7 +1408,7 @@ template <typename Packet>
 EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
 template <>
 EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castsi128_ps(_mm_loadu_si64(reinterpret_cast<const void*>(from)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
@@ -1408,7 +1429,7 @@ EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
 
 template <>
 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
-  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
+  return vec4f_swizzle1(_mm_castsi128_ps(_mm_loadu_si64(reinterpret_cast<const void*>(from))), 0, 0, 1, 1);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
@@ -1435,7 +1456,7 @@ EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
 // {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
 template <>
 EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
-  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
+  __m128i tmp = _mm_loadu_si64(reinterpret_cast<const void*>(from));
   return _mm_unpacklo_epi8(tmp, tmp);
 }
 
@@ -1443,7 +1464,10 @@ EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
 // {b0, b0  b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
 template <>
 EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
-  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
+  EIGEN_USING_STD(memcpy);
+  int val;
+  memcpy(&val, from, sizeof(int));
+  __m128i tmp = _mm_cvtsi32_si128(val);
   tmp = _mm_unpacklo_epi8(tmp, tmp);
   return _mm_unpacklo_epi16(tmp, tmp);
 }
@@ -1666,9 +1690,9 @@ EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index s
 template <>
 EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
   to[stride * 0] = pfirst(from);
-  to[stride * 1] = pfirst(_mm_shuffle_ps(from, from, 1));
-  to[stride * 2] = pfirst(_mm_shuffle_ps(from, from, 2));
-  to[stride * 3] = pfirst(_mm_shuffle_ps(from, from, 3));
+  to[stride * 1] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 1)));
+  to[stride * 2] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 2)));
+  to[stride * 3] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 3)));
 }
 template <>
 EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
@@ -1985,77 +2009,39 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
   kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
 }
 
-EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
-  return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
-}
-
-EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
-  return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
-                                    const Packet2l& elsePacket) {
-  const __m128i true_mask = sse_blend_mask(ifPacket);
-  return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
-                                    const Packet4i& elsePacket) {
-  const __m128i true_mask = sse_blend_mask(ifPacket);
-  return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
-                                     const Packet4ui& elsePacket) {
-  return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
-                                    const Packet4f& elsePacket) {
-  const __m128i true_mask = sse_blend_mask(ifPacket);
-  return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
-                                    const Packet2d& elsePacket) {
-  const __m128i true_mask = sse_blend_mask(ifPacket);
-  return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
-}
-
 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
-#ifdef EIGEN_VECTORIZE_FMA
+#if defined(EIGEN_VECTORIZE_FMA)
 template <>
 EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
-  return ::fmaf(a, b, c);
+  return std::fmaf(a, b, c);
 }
 template <>
 EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
-  return ::fma(a, b, c);
+  return std::fma(a, b, c);
 }
 template <>
 EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
-  return ::fmaf(a, b, -c);
+  return std::fmaf(a, b, -c);
 }
 template <>
 EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
-  return ::fma(a, b, -c);
+  return std::fma(a, b, -c);
 }
 template <>
 EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
-  return ::fmaf(-a, b, c);
+  return std::fmaf(-a, b, c);
 }
 template <>
 EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
-  return ::fma(-a, b, c);
+  return std::fma(-a, b, c);
 }
 template <>
 EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
-  return ::fmaf(-a, b, -c);
+  return std::fmaf(-a, b, -c);
 }
 template <>
 EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
-  return ::fma(-a, b, -c);
+  return std::fma(-a, b, -c);
 }
 #endif
 
@@ -2154,206 +2140,6 @@ EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
 }
 #endif
 
-// Packet math for Eigen::half
-// Disable the following code since it's broken on too many platforms / compilers.
-// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
-#if 0
-
-typedef struct {
-  __m64 x;
-} Packet4h;
-
-
-template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
-
-template <>
-struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet4h type;
-  // There is no half-size packet for Packet4h.
-  typedef Packet4h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 0,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0,
-  };
-};
-
-
-template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
-
-template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
-  Packet4h result;
-  result.x = _mm_set1_pi16(from.x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
-  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha + hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha - hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha - hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha - hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha - hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha * hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha / hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha / hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha / hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha / hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
-  Packet4h result;
-  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
-  Packet4h result;
-  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
-  __int64_t r = _mm_cvtm64_si64(from.x);
-  *(reinterpret_cast<__int64_t*>(to)) = r;
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
-  __int64_t r = _mm_cvtm64_si64(from.x);
-  *(reinterpret_cast<__int64_t*>(to)) = r;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h
-ploadquad<Packet4h>(const Eigen::half* from) {
-  return pset1<Packet4h>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
-{
-  Packet4h result;
-  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
-{
-  __int64_t a = _mm_cvtm64_si64(from.x);
-  to[stride*0].x = static_cast<unsigned short>(a);
-  to[stride*1].x = static_cast<unsigned short>(a >> 16);
-  to[stride*2].x = static_cast<unsigned short>(a >> 32);
-  to[stride*3].x = static_cast<unsigned short>(a >> 48);
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet4h,4>& kernel) {
-  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
-  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
-  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
-  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
-
-  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
-  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
-  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
-  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
-}
-
-#endif
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 9a7732a60d4..8eebc025fb0 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -173,56 +173,6 @@ EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui&
   return Packet4i(a);
 }
 
-// Disable the following code since it's broken on too many platforms / compilers.
-// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
-#if 0
-
-template <>
-struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
-  float f1 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  float f2 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  float f3 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  float f4 = static_cast<float>(h);
-  return _mm_set_ps(f4, f3, f2, f1);
-}
-
-template <>
-struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
-  EIGEN_ALIGN16 float aux[4];
-  pstore(aux, a);
-  Eigen::half h0(aux[0]);
-  Eigen::half h1(aux[1]);
-  Eigen::half h2(aux[2]);
-  Eigen::half h3(aux[3]);
-
-  Packet4h result;
-  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
-  return result;
-}
-
-#endif
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SVE/MathFunctions.h b/Eigen/src/Core/arch/SVE/MathFunctions.h
index 8c8ed84cf7b..a547d39abb7 100644
--- a/Eigen/src/Core/arch/SVE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SVE/MathFunctions.h
@@ -16,31 +16,7 @@
 namespace Eigen {
 namespace internal {
 
-template <>
-EIGEN_STRONG_INLINE PacketXf pexp<PacketXf>(const PacketXf& x) {
-  return pexp_float(x);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketXf plog<PacketXf>(const PacketXf& x) {
-  return plog_float(x);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketXf psin<PacketXf>(const PacketXf& x) {
-  return psin_float(x);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketXf pcos<PacketXf>(const PacketXf& x) {
-  return pcos_float(x);
-}
-
-// Hyperbolic Tangent function.
-template <>
-EIGEN_STRONG_INLINE PacketXf ptanh<PacketXf>(const PacketXf& x) {
-  return ptanh_float(x);
-}
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf)
 
 }  // end namespace internal
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 952d7561b68..611ba793b31 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -49,12 +49,10 @@ struct packet_traits<numext::int32_t> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 0,
-    HasBlend = 0,
     HasReduxp = 0  // Not implemented in SVE
   };
 };
@@ -344,21 +342,29 @@ struct packet_traits<float> : default_packet_traits {
     HasNegate = 1,
     HasAbs = 1,
     HasArg = 0,
-    HasAbs2 = 1,
     HasMin = 1,
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 0,
-    HasBlend = 0,
     HasReduxp = 0,  // Not implemented in SVE
 
     HasDiv = 1,
 
+    HasCmp = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
+    HasTan = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
     HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
     HasExp = 1,
+    HasPow = 1,
     HasSqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasErfc = EIGEN_FAST_MATH
@@ -491,6 +497,22 @@ template <>
 EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
   return svrintm_f32_x(svptrue_b32(), a);
 }
+template <>
+EIGEN_STRONG_INLINE PacketXf pceil<PacketXf>(const PacketXf& a) {
+  return svrintp_f32_x(svptrue_b32(), a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXf print<PacketXf>(const PacketXf& a) {
+  return svrintn_f32_x(svptrue_b32(), a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXf ptrunc<PacketXf>(const PacketXf& a) {
+  return svrintz_f32_x(svptrue_b32(), a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXf pround<PacketXf>(const PacketXf& a) {
+  return svrinta_f32_x(svptrue_b32(), a);
+}
 
 template <>
 EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index 578e0f3a724..0bdf8253b72 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -30,7 +30,7 @@ namespace Eigen {
 
 namespace internal {
 
-template <int has_blend, int lengths>
+template <int lengths>
 struct sycl_packet_traits : default_packet_traits {
   enum {
     Vectorizable = 1,
@@ -60,7 +60,6 @@ struct sycl_packet_traits : default_packet_traits {
     HasIGamma = 0,
     HasIGammac = 0,
     HasBetaInc = 0,
-    HasBlend = has_blend,
     // This flag is used to indicate whether packet comparison is supported.
     // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
     HasCmp = 1,
@@ -78,19 +77,19 @@ struct sycl_packet_traits : default_packet_traits {
 };
 
 #ifdef SYCL_DEVICE_ONLY
-#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths)       \
-  template <>                                                                    \
-  struct packet_traits<unpacket_type> : sycl_packet_traits<has_blend, lengths> { \
-    typedef packet_type type;                                                    \
-    typedef packet_type half;                                                    \
+#define SYCL_PACKET_TRAITS(packet_type, unpacket_type, lengths)       \
+  template <>                                                         \
+  struct packet_traits<unpacket_type> : sycl_packet_traits<lengths> { \
+    typedef packet_type type;                                         \
+    typedef packet_type half;                                         \
   };
 
-SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
-SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
-SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
-SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
-SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
-SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, const Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, const float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, double, 2)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, const double, 2)
 #undef SYCL_PACKET_TRAITS
 
 // Make sure this is only available when targeting a GPU: we don't want to
@@ -135,14 +134,14 @@ template <typename PacketReturnType, int PacketSize>
 struct PacketWrapper {
   typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
-  EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
+  EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType&) {
     eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
     abort();
   }
   EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
     return ::Eigen::internal::template plset<PacketReturnType>(in);
   }
-  EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
+  EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar*) {
     eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
     abort();
   }
@@ -153,7 +152,7 @@ template <typename PacketReturnType>
 struct PacketWrapper<PacketReturnType, 4> {
   typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType& in) {
     switch (index) {
       case 0:
         return in.x();
@@ -174,7 +173,7 @@ struct PacketWrapper<PacketReturnType, 4> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
     return PacketReturnType(in, other, other, other);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType& lhs, Scalar* rhs) {
     lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]);
   }
 };
@@ -183,20 +182,20 @@ template <typename PacketReturnType>
 struct PacketWrapper<PacketReturnType, 1> {
   typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType& in) {
     return in;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
     return PacketReturnType(in);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { lhs = rhs[0]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType& lhs, Scalar* rhs) { lhs = rhs[0]; }
 };
 
 template <typename PacketReturnType>
 struct PacketWrapper<PacketReturnType, 2> {
   typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType& in) {
     switch (index) {
       case 0:
         return in.x();
@@ -213,7 +212,7 @@ struct PacketWrapper<PacketReturnType, 2> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
     return PacketReturnType(in, other);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType& lhs, Scalar* rhs) {
     lhs = PacketReturnType(rhs[0], rhs[1]);
   }
 };
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
index b20c32b3208..90d60661246 100644
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -31,259 +31,69 @@ namespace internal {
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(SYCL_DEVICE_ONLY)
-#define SYCL_PLOG(packet_type)                                                                \
-  template <>                                                                                 \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>(const packet_type& a) { \
-    return cl::sycl::log(a);                                                                  \
-  }
-
-SYCL_PLOG(cl::sycl::cl_half8)
-SYCL_PLOG(cl::sycl::cl_float4)
-SYCL_PLOG(cl::sycl::cl_double2)
-#undef SYCL_PLOG
-
-#define SYCL_PLOG1P(packet_type)                                                                \
-  template <>                                                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>(const packet_type& a) { \
-    return cl::sycl::log1p(a);                                                                  \
-  }
-
-SYCL_PLOG1P(cl::sycl::cl_half8)
-SYCL_PLOG1P(cl::sycl::cl_float4)
-SYCL_PLOG1P(cl::sycl::cl_double2)
-#undef SYCL_PLOG1P
-
-#define SYCL_PLOG10(packet_type)                                                                \
-  template <>                                                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>(const packet_type& a) { \
-    return cl::sycl::log10(a);                                                                  \
-  }
-
-SYCL_PLOG10(cl::sycl::cl_half8)
-SYCL_PLOG10(cl::sycl::cl_float4)
-SYCL_PLOG10(cl::sycl::cl_double2)
-#undef SYCL_PLOG10
-
-#define SYCL_PEXP(packet_type)                                                                \
-  template <>                                                                                 \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>(const packet_type& a) { \
-    return cl::sycl::exp(a);                                                                  \
-  }
-
-SYCL_PEXP(cl::sycl::cl_half8)
-SYCL_PEXP(cl::sycl::cl_half)
-SYCL_PEXP(cl::sycl::cl_float4)
-SYCL_PEXP(cl::sycl::cl_float)
-SYCL_PEXP(cl::sycl::cl_double2)
-#undef SYCL_PEXP
-
-#define SYCL_PEXPM1(packet_type)                                                                \
-  template <>                                                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>(const packet_type& a) { \
-    return cl::sycl::expm1(a);                                                                  \
-  }
-
-SYCL_PEXPM1(cl::sycl::cl_half8)
-SYCL_PEXPM1(cl::sycl::cl_float4)
-SYCL_PEXPM1(cl::sycl::cl_double2)
-#undef SYCL_PEXPM1
-
-#define SYCL_PSQRT(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>(const packet_type& a) { \
-    return cl::sycl::sqrt(a);                                                                  \
-  }
-
-SYCL_PSQRT(cl::sycl::cl_half8)
-SYCL_PSQRT(cl::sycl::cl_float4)
-SYCL_PSQRT(cl::sycl::cl_double2)
-#undef SYCL_PSQRT
-
-#define SYCL_PRSQRT(packet_type)                                                                \
-  template <>                                                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>(const packet_type& a) { \
-    return cl::sycl::rsqrt(a);                                                                  \
-  }
-
-SYCL_PRSQRT(cl::sycl::cl_half8)
-SYCL_PRSQRT(cl::sycl::cl_float4)
-SYCL_PRSQRT(cl::sycl::cl_double2)
-#undef SYCL_PRSQRT
-
-/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PSIN(packet_type)                                                                \
-  template <>                                                                                 \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>(const packet_type& a) { \
-    return cl::sycl::sin(a);                                                                  \
-  }
-
-SYCL_PSIN(cl::sycl::cl_half8)
-SYCL_PSIN(cl::sycl::cl_float4)
-SYCL_PSIN(cl::sycl::cl_double2)
-#undef SYCL_PSIN
-
-/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PCOS(packet_type)                                                                \
-  template <>                                                                                 \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>(const packet_type& a) { \
-    return cl::sycl::cos(a);                                                                  \
-  }
-
-SYCL_PCOS(cl::sycl::cl_half8)
-SYCL_PCOS(cl::sycl::cl_float4)
-SYCL_PCOS(cl::sycl::cl_double2)
-#undef SYCL_PCOS
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-#define SYCL_PTAN(packet_type)                                                                \
-  template <>                                                                                 \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>(const packet_type& a) { \
-    return cl::sycl::tan(a);                                                                  \
-  }
-
-SYCL_PTAN(cl::sycl::cl_half8)
-SYCL_PTAN(cl::sycl::cl_float4)
-SYCL_PTAN(cl::sycl::cl_double2)
-#undef SYCL_PTAN
-
-/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PASIN(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>(const packet_type& a) { \
-    return cl::sycl::asin(a);                                                                  \
-  }
-
-SYCL_PASIN(cl::sycl::cl_half8)
-SYCL_PASIN(cl::sycl::cl_float4)
-SYCL_PASIN(cl::sycl::cl_double2)
-#undef SYCL_PASIN
-
-/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PACOS(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>(const packet_type& a) { \
-    return cl::sycl::acos(a);                                                                  \
-  }
-
-SYCL_PACOS(cl::sycl::cl_half8)
-SYCL_PACOS(cl::sycl::cl_float4)
-SYCL_PACOS(cl::sycl::cl_double2)
-#undef SYCL_PACOS
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-#define SYCL_PATAN(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>(const packet_type& a) { \
-    return cl::sycl::atan(a);                                                                  \
-  }
-
-SYCL_PATAN(cl::sycl::cl_half8)
-SYCL_PATAN(cl::sycl::cl_float4)
-SYCL_PATAN(cl::sycl::cl_double2)
-#undef SYCL_PATAN
-
-/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PSINH(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>(const packet_type& a) { \
-    return cl::sycl::sinh(a);                                                                  \
-  }
-
-SYCL_PSINH(cl::sycl::cl_half8)
-SYCL_PSINH(cl::sycl::cl_float4)
-SYCL_PSINH(cl::sycl::cl_double2)
-#undef SYCL_PSINH
-
-/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PCOSH(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>(const packet_type& a) { \
-    return cl::sycl::cosh(a);                                                                  \
-  }
-
-SYCL_PCOSH(cl::sycl::cl_half8)
-SYCL_PCOSH(cl::sycl::cl_float4)
-SYCL_PCOSH(cl::sycl::cl_double2)
-#undef SYCL_PCOSH
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-#define SYCL_PTANH(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>(const packet_type& a) { \
-    return cl::sycl::tanh(a);                                                                  \
-  }
-
-SYCL_PTANH(cl::sycl::cl_half8)
-SYCL_PTANH(cl::sycl::cl_float4)
-SYCL_PTANH(cl::sycl::cl_double2)
-#undef SYCL_PTANH
-
-#define SYCL_PCEIL(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>(const packet_type& a) { \
-    return cl::sycl::ceil(a);                                                                  \
-  }
-
-SYCL_PCEIL(cl::sycl::cl_half)
-SYCL_PCEIL(cl::sycl::cl_float4)
-SYCL_PCEIL(cl::sycl::cl_double2)
-#undef SYCL_PCEIL
-
-#define SYCL_PROUND(packet_type)                                                                \
-  template <>                                                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>(const packet_type& a) { \
-    return cl::sycl::round(a);                                                                  \
-  }
-
-SYCL_PROUND(cl::sycl::cl_half8)
-SYCL_PROUND(cl::sycl::cl_float4)
-SYCL_PROUND(cl::sycl::cl_double2)
-#undef SYCL_PROUND
-
-#define SYCL_PRINT(packet_type)                                                                \
-  template <>                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>(const packet_type& a) { \
-    return cl::sycl::rint(a);                                                                  \
-  }
-
-SYCL_PRINT(cl::sycl::cl_half8)
-SYCL_PRINT(cl::sycl::cl_float4)
-SYCL_PRINT(cl::sycl::cl_double2)
-#undef SYCL_PRINT
-
-#define SYCL_FLOOR(packet_type)                                                                 \
-  template <>                                                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>(const packet_type& a) { \
-    return cl::sycl::floor(a);                                                                  \
-  }
-
-SYCL_FLOOR(cl::sycl::cl_half8)
-SYCL_FLOOR(cl::sycl::cl_float4)
-SYCL_FLOOR(cl::sycl::cl_double2)
-#undef SYCL_FLOOR
-
-#define SYCL_PMIN(packet_type, expr)                                                                                \
-  template <>                                                                                                       \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { \
-    return expr;                                                                                                    \
-  }
-
-SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
-SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
-SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
-#undef SYCL_PMIN
-
-#define SYCL_PMAX(packet_type, expr)                                                                                \
-  template <>                                                                                                       \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { \
-    return expr;                                                                                                    \
-  }
-
-SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
-SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
-SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
-#undef SYCL_PMAX
 
+// Specializes the unary packet function EIGEN_FUNC for PACKET by forwarding to cl::sycl::SYCL_FUNC.
+#define SYCL_PACKET_FUNCTION(EIGEN_FUNC, SYCL_FUNC, PACKET)                          \
+  template <>                                                                        \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PACKET EIGEN_FUNC<PACKET>(const PACKET& a) { \
+    return cl::sycl::SYCL_FUNC(a);                                                   \
+  }
+
+// Instantiates a unary function for the three SYCL vector packets: cl_half8, cl_float4 and cl_double2.
+#define SYCL_UNARY_FUNCTION(EIGEN_FUNC, SYCL_FUNC)                 \
+  SYCL_PACKET_FUNCTION(EIGEN_FUNC, SYCL_FUNC, cl::sycl::cl_half8)  \
+  SYCL_PACKET_FUNCTION(EIGEN_FUNC, SYCL_FUNC, cl::sycl::cl_float4) \
+  SYCL_PACKET_FUNCTION(EIGEN_FUNC, SYCL_FUNC, cl::sycl::cl_double2)
+
+SYCL_UNARY_FUNCTION(plog, log)
+SYCL_UNARY_FUNCTION(plog1p, log1p)
+SYCL_UNARY_FUNCTION(plog10, log10)
+SYCL_UNARY_FUNCTION(pexpm1, expm1)
+SYCL_UNARY_FUNCTION(psqrt, sqrt)
+SYCL_UNARY_FUNCTION(prsqrt, rsqrt)
+SYCL_UNARY_FUNCTION(psin, sin)
+SYCL_UNARY_FUNCTION(pcos, cos)
+SYCL_UNARY_FUNCTION(ptan, tan)
+SYCL_UNARY_FUNCTION(pasin, asin)
+SYCL_UNARY_FUNCTION(pacos, acos)
+SYCL_UNARY_FUNCTION(patan, atan)
+SYCL_UNARY_FUNCTION(psinh, sinh)
+SYCL_UNARY_FUNCTION(pcosh, cosh)
+SYCL_UNARY_FUNCTION(ptanh, tanh)
+SYCL_UNARY_FUNCTION(pround, round)
+SYCL_UNARY_FUNCTION(print, rint)
+SYCL_UNARY_FUNCTION(pfloor, floor)
+
+// pexp is additionally specialized for the scalar types cl_half and cl_float.
+SYCL_UNARY_FUNCTION(pexp, exp)
+SYCL_PACKET_FUNCTION(pexp, exp, cl::sycl::cl_half)
+SYCL_PACKET_FUNCTION(pexp, exp, cl::sycl::cl_float)
+
+// pceil is defined for scalar cl_half, not cl_half8, matching the previous code (possibly an upstream typo).
+SYCL_PACKET_FUNCTION(pceil, ceil, cl::sycl::cl_half)
+SYCL_PACKET_FUNCTION(pceil, ceil, cl::sycl::cl_float4)
+SYCL_PACKET_FUNCTION(pceil, ceil, cl::sycl::cl_double2)
+
+#undef SYCL_UNARY_FUNCTION
+#undef SYCL_PACKET_FUNCTION
+
+// Specializes the binary packet function EIGEN_FUNC for PACKET via cl::sycl::SYCL_FUNC (used for pmin/pmax).
+#define SYCL_BINARY_FUNCTION(EIGEN_FUNC, SYCL_FUNC, PACKET)                                           \
+  template <>                                                                                         \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PACKET EIGEN_FUNC<PACKET>(const PACKET& a, const PACKET& b) { \
+    return cl::sycl::SYCL_FUNC(a, b);                                                                 \
+  }
+
+SYCL_BINARY_FUNCTION(pmin, fmin, cl::sycl::cl_half8)
+SYCL_BINARY_FUNCTION(pmin, fmin, cl::sycl::cl_float4)
+SYCL_BINARY_FUNCTION(pmin, fmin, cl::sycl::cl_double2)
+SYCL_BINARY_FUNCTION(pmax, fmax, cl::sycl::cl_half8)
+SYCL_BINARY_FUNCTION(pmax, fmax, cl::sycl::cl_float4)
+SYCL_BINARY_FUNCTION(pmax, fmax, cl::sycl::cl_double2)
+
+#undef SYCL_BINARY_FUNCTION
+
+// pldexp requires integer conversion of the exponent.
 #define SYCL_PLDEXP(packet_type)                                                                                  \
   template <>                                                                                                     \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp(const packet_type& a, const packet_type& exponent) {   \
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index 6b6bfe43b8f..e5dad3c3b69 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -542,31 +542,6 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_d
   kernel.packet[1].x() = tmp;
 }
 
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
-    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket, const cl::sycl::cl_half8& thenPacket,
-    const cl::sycl::cl_half8& elsePacket) {
-  cl::sycl::cl_short8 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
-                                ifPacket.select[3] ? 0 : -1, ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
-                                ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
-  return cl::sycl::select(thenPacket, elsePacket, condition);
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
-    const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket, const cl::sycl::cl_float4& thenPacket,
-    const cl::sycl::cl_float4& elsePacket) {
-  cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
-                              ifPacket.select[3] ? 0 : -1);
-  return cl::sycl::select(thenPacket, elsePacket, condition);
-}
-
-template <>
-inline cl::sycl::cl_double2 pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
-                                   const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
-  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1);
-  return cl::sycl::select(thenPacket, elsePacket, condition);
-}
 #endif  // SYCL_DEVICE_ONLY
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index a750b26b881..048b598b009 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -20,7 +20,8 @@ namespace internal {
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
 inline Packet4ui p4ui_CONJ_XOR() {
-  return {0x00000000, 0x80000000, 0x00000000, 0x80000000};  // vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+  return Packet4ui{0x00000000, 0x80000000, 0x00000000,
+                   0x80000000};  // vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
 }
 #endif
 
@@ -72,7 +73,6 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
     HasAbs2 = 0,
     HasMin = 0,
     HasMax = 0,
-    HasBlend = 1,
     HasSetLinear = 0
   };
 };
@@ -178,7 +178,7 @@ EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
-  return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2));
+  return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2()));
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
@@ -256,10 +256,8 @@ EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1c
   return pdiv_complex(a, b);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  return plog_complex(a, b);
-}
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS_NO_EXP(Packet1cd)
+EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(Packet2cf)
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
   return Packet1cd(preverse(Packet2d(x.v)));
@@ -437,16 +435,6 @@ EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2c
   return pdiv_complex(a, b);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  return plog_complex(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  return pexp_complex(a, b);
-}
-
 EIGEN_STRONG_INLINE Packet2cf pcplxflip /*<Packet2cf>*/ (const Packet2cf& x) {
   Packet2cf res;
   res.cd[0] = pcplxflip(x.cd[0]);
@@ -460,14 +448,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   kernel.packet[1].cd[0] = tmp;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
-                                     const Packet2cf& elsePacket) {
-  Packet2cf result;
-  const Selector<4> ifPacket4 = {ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1]};
-  result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
-  return result;
-}
 #else
 template <>
 EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
@@ -544,14 +524,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   kernel.packet[0].v = tmp;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
-                                     const Packet2cf& elsePacket) {
-  Packet2cf result;
-  result.v = reinterpret_cast<Packet4f>(
-      pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
-  return result;
-}
 #endif
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 32e042554fe..348d643ac88 100644
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -23,6 +23,20 @@ namespace Eigen {
 
 namespace internal {
 
+EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet2d)
+
+EIGEN_FLOAT_PACKET_FUNCTION(atanh, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log2, Packet4f)
+
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4f)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4f)
+
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
 static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
 static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
@@ -170,7 +184,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(cons
   y = padd(y, p4f_1);
 
   // build 2^n
-  emm0 = (Packet4i){(int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3]};
+  emm0 = Packet4i{(int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3]};
   emm0 = emm0 + p4i_0x7f;
   emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
 
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index b45681320cb..e1666093454 100644
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -167,7 +167,6 @@ struct packet_traits<int> : default_packet_traits {
     HasSub = 1,
     HasMul = 1,
     HasDiv = 1,
-    HasBlend = 1
   };
 };
 
@@ -180,6 +179,7 @@ struct packet_traits<float> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
 
+    HasCmp = 1,
     HasAdd = 1,
     HasSub = 1,
     HasMul = 1,
@@ -196,7 +196,6 @@ struct packet_traits<float> : default_packet_traits {
     HasTanh = 1,
     HasErf = 1,
     HasNegate = 1,
-    HasBlend = 1
   };
 };
 
@@ -223,7 +222,6 @@ struct packet_traits<double> : default_packet_traits {
     HasSqrt = 1,
     HasRsqrt = 1,
     HasNegate = 1,
-    HasBlend = 1
   };
 };
 
@@ -250,6 +248,7 @@ struct unpacket_traits<Packet4f> {
     masked_store_available = false
   };
   typedef Packet4f half;
+  typedef Packet4i integer_packet;
 };
 template <>
 struct unpacket_traits<Packet2d> {
@@ -262,6 +261,7 @@ struct unpacket_traits<Packet2d> {
     masked_store_available = false
   };
   typedef Packet2d half;
+  typedef Packet2l integer_packet;
 };
 
 /* Forward declaration */
@@ -313,38 +313,36 @@ inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
 
 template <>
 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet* vfrom;
-  vfrom = (Packet*)from;
-  return vfrom->v4i;
+  return vec_xl(0, from);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet* vfrom;
-  vfrom = (Packet*)from;
-  return vfrom->v2d;
+  return vec_xl(0, from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet* vto;
-  vto = (Packet*)to;
-  vto->v4i = from;
+  vec_xst(from, 0, to);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet* vto;
-  vto = (Packet*)to;
-  vto->v2d = from;
+  vec_xst(from, 0, to);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
 }
 
 template <>
@@ -540,7 +538,8 @@ EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d
 
 template <>
 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
-  return vec_round(a);
+  /* vec_round does not round ties away from zero, so request that mode explicitly (VFI rounding mode 1). */
+  return __builtin_s390_vfidb(a, 0, 1);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
@@ -550,6 +549,18 @@ template <>
 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
   return vec_floor(a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  return __builtin_s390_vfidb(a, 4, 4);  // rint: VFI mode 4 = round to nearest, ties to even (mode 5 would truncate)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return __builtin_s390_vfidb(a, 4, 5);  // trunc: VFI mode 5 = round toward zero (mode 4 would round to nearest)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pnot(pcmp_le(b, a));  // lane mask of !(b <= a); presumably also set for NaN lanes, where pcmp_le is false
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
@@ -590,6 +601,36 @@ EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
   EIGEN_ZVECTOR_PREFETCH(addr);
 }
 
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {  // lane-wise fallback, two lanes
+  return Packet2l{parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1])};
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {  // lane-wise fallback, four lanes
+  return Packet4i{parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]), parithmetic_shift_right<N>(a[2]),
+                  parithmetic_shift_right<N>(a[3])};
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {  // lane-wise fallback, two lanes
+  return Packet2l{plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1])};
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {  // lane-wise fallback, four lanes
+  return Packet4i{plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]), plogical_shift_right<N>(a[2]),
+                  plogical_shift_right<N>(a[3])};
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {  // lane-wise fallback, two lanes
+  return Packet2l{plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1])};
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {  // lane-wise fallback, four lanes
+  return Packet4i{plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]), plogical_shift_left<N>(a[2]),
+                  plogical_shift_left<N>(a[3])};
+}
+
 template <>
 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
   EIGEN_ALIGN16 int x[4];
@@ -706,22 +747,6 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
   kernel.packet[1] = t1;
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
-                                    const Packet4i& elsePacket) {
-  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
-  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
-                                    const Packet2d& elsePacket) {
-  Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
-  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
 /* z13 has no vector float support so we emulate that with double
    z14 has proper vector float support.
 */
@@ -906,8 +931,8 @@ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f
 template <>
 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
   Packet4f res;
-  res.v4f[0] = vec_round(a.v4f[0]);
-  res.v4f[1] = vec_round(a.v4f[1]);
+  res.v4f[0] = generic_round(a.v4f[0]);
+  res.v4f[1] = generic_round(a.v4f[1]);
   return res;
 }
 
@@ -926,6 +951,20 @@ EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
   res.v4f[1] = vec_floor(a.v4f[1]);
   return res;
 }
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {  // emulated float path: apply print to each half
+  Packet4f res;
+  res.v4f[0] = print(a.v4f[0]);
+  res.v4f[1] = print(a.v4f[1]);
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {  // emulated float path: apply ptrunc to each half
+  Packet4f res;
+  res.v4f[0] = ptrunc(a.v4f[0]);
+  res.v4f[1] = ptrunc(a.v4f[1]);
+  return res;
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
@@ -1027,19 +1066,6 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
   kernel.packet[3].v4f[1] = t3.packet[1];
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
-                                    const Packet4f& elsePacket) {
-  Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
-  Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
-  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
-  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
-  Packet4f result;
-  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
-  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
-  return result;
-}
-
 template <>
 Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
@@ -1063,24 +1089,25 @@ Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f
   res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
   return res;
 }
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pcmp_lt_or_nan(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_lt_or_nan(a.v4f[1], b.v4f[1]);
+  return res;
+}
 
 #else
 template <>
 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet* vfrom;
-  vfrom = (Packet*)from;
-  return vfrom->v4f;
+  return vec_xl(0, from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet* vto;
-  vto = (Packet*)to;
-  vto->v4f = from;
+  vec_xst(from, 0, to);
 }
 
 template <>
@@ -1171,7 +1198,8 @@ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f
 }
 template <>
 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
-  return vec_round(a);
+  /* vec_round does not round ties away from zero, so request that mode explicitly (VFI rounding mode 1). */
+  return __builtin_s390_vfisb(a, 0, 1);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
@@ -1182,6 +1210,18 @@ EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
   return vec_floor(a);
 }
 template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  return __builtin_s390_vfisb(a, 4, 4);  // rint: VFI mode 4 = round to nearest, ties to even (mode 5 would truncate)
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return __builtin_s390_vfisb(a, 4, 5);  // trunc: VFI mode 5 = round toward zero (mode 4 would round to nearest)
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pnot(pcmp_le(b, a));
+}
+template <>
 EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
   return vec_abs(a);
 }
@@ -1252,15 +1292,29 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
+#endif
+
 template <>
-EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
-                                    const Packet4f& elsePacket) {
-  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
-  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
 }
 
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099] and convert it to 64-bit integer lanes.
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors, 2^b * 2^b * 2^b * 2^(e - 3b), and multiply them in one at a time:
+  const Packet2l bias = {1023, 1023};
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4) mod 2^62; only the low 12 bits survive the <<52 below
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));  // biased exponent field -> 2^b
+  Packet2d out = pmul(pmul(pmul(a, c), c), c);                        // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);                                   // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));  // 2^(e - 3b)
+  out = pmul(out, c);                                                 // a * 2^e
+  return out;
+}
 
 template <>
 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
@@ -1279,6 +1333,64 @@ EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
   return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
 }
 
+#if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
+#pragma GCC warning "float->int and int->float conversion is simulated. compile for z15 for improved performance"
+template <>
+struct cast_impl<Packet4i, Packet4f> {  // int -> float, converted one lane at a time
+  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
+    return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3])};
+  }
+};
+
+template <>
+struct cast_impl<Packet4f, Packet4i> {  // float -> int, converted one lane at a time
+  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
+    return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3])};
+  }
+};
+
+template <>
+struct cast_impl<Packet2l, Packet2d> {  // 64-bit int -> double, converted one lane at a time
+  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) { return Packet2d{double(a[0]), double(a[1])}; }
+};
+
+template <>
+struct cast_impl<Packet2d, Packet2l> {  // double -> 64-bit int, converted one lane at a time
+  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
+    return Packet2l{(long long)(a[0]), (long long)(a[1])};
+  }
+};
+#else  // vec_float is defined and __ARCH__ >= 13: use the vector conversion intrinsics
+template <>
+struct cast_impl<Packet4i, Packet4f> {  // int -> float
+  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) { return vec_float(a); }
+};
+
+template <>
+struct cast_impl<Packet4f, Packet4i> {  // float -> int
+  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) { return vec_signed(a); }
+};
+
+template <>
+struct cast_impl<Packet2l, Packet2d> {  // 64-bit int -> double
+  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) { return vec_double(a); }
+};
+
+template <>
+struct cast_impl<Packet2d, Packet2l> {  // double -> 64-bit int
+  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) { return vec_signed(a); }
+};
+#endif  // simulated vs. intrinsic int/float conversions
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+  return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/clang/Complex.h b/Eigen/src/Core/arch/clang/Complex.h
new file mode 100644
index 00000000000..cfcc229bb82
--- /dev/null
+++ b/Eigen/src/Core/arch/clang/Complex.h
@@ -0,0 +1,702 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Rasmus Munk Larsen
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CLANG_H
+#define EIGEN_COMPLEX_CLANG_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <typename RealScalar, int N>
+struct complex_packet_wrapper {  // N complex values stored as 2 * N interleaved real lanes
+  using RealPacketT = detail::VectorType<RealScalar, 2 * N>;  // two real lanes per complex element
+  complex_packet_wrapper() = default;
+  EIGEN_STRONG_INLINE explicit complex_packet_wrapper(const RealPacketT& a) : v(a) {}  // wraps lanes as-is
+  EIGEN_STRONG_INLINE constexpr std::complex<RealScalar> operator[](Index i) const {
+    return std::complex<RealScalar>(v[2 * i], v[2 * i + 1]);  // lane 2i is the real part, lane 2i + 1 the imaginary
+  }
+  RealPacketT v;  // raw interleaved {re, im, re, im, ...} lanes
+};
+
+// --- Primary complex packet aliases ---
+constexpr int kComplexFloatSize = kFloatPacketSize / 2;    // 2, 4, or 8
+constexpr int kComplexDoubleSize = kDoublePacketSize / 2;  // 1, 2, or 4
+using PacketXcf = complex_packet_wrapper<float, kComplexFloatSize>;
+using PacketXcd = complex_packet_wrapper<double, kComplexDoubleSize>;
+
+// Sub-packet types needed for reductions at larger sizes.
+// When PacketXcf IS already a given size, we skip the alias to avoid duplicates.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+using Packet2cf = complex_packet_wrapper<float, 2>;
+#endif
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+using Packet4cf = complex_packet_wrapper<float, 4>;
+using Packet2cd = complex_packet_wrapper<double, 2>;
+#endif
+
+struct generic_complex_packet_traits : default_packet_traits {
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasArg = 0,
+    HasSetLinear = 0,
+    HasConj = 1,
+    // Math functions
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+  };
+};
+
+template <>
+struct packet_traits<std::complex<float>> : generic_complex_packet_traits {
+  using type = PacketXcf;
+  using half = PacketXcf;
+  enum {
+    size = kComplexFloatSize,
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXcf> : generic_unpacket_traits {
+  using type = std::complex<float>;
+  using half = PacketXcf;
+  using as_real = PacketXf;
+  enum {
+    size = kComplexFloatSize,
+  };
+};
+
+template <>
+struct packet_traits<std::complex<double>> : generic_complex_packet_traits {
+  using type = PacketXcd;
+  using half = PacketXcd;
+  enum {
+    size = kComplexDoubleSize,
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXcd> : generic_unpacket_traits {
+  using type = std::complex<double>;
+  using half = PacketXcd;
+  using as_real = PacketXd;
+  enum {
+    size = kComplexDoubleSize,
+  };
+};
+
+// ------------ Load and store ops ----------
+#define EIGEN_CLANG_COMPLEX_LOAD_STORE(PACKET_TYPE)                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE PACKET_TYPE ploadu<PACKET_TYPE>(const unpacket_traits<PACKET_TYPE>::type* from) {   \
+    return PACKET_TYPE(ploadu<typename unpacket_traits<PACKET_TYPE>::as_real>(&numext::real_ref(*from))); \
+  }                                                                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE PACKET_TYPE pload<PACKET_TYPE>(const unpacket_traits<PACKET_TYPE>::type* from) {    \
+    return PACKET_TYPE(pload<typename unpacket_traits<PACKET_TYPE>::as_real>(&numext::real_ref(*from)));  \
+  }                                                                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE void pstoreu<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>(             \
+      typename unpacket_traits<PACKET_TYPE>::type * to, const PACKET_TYPE& from) {                        \
+    pstoreu(&numext::real_ref(*to), from.v);                                                              \
+  }                                                                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE void pstore<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>(              \
+      typename unpacket_traits<PACKET_TYPE>::type * to, const PACKET_TYPE& from) {                        \
+    pstore(&numext::real_ref(*to), from.v);                                                               \
+  }
+
+EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcf);
+EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcd);
+#undef EIGEN_CLANG_COMPLEX_LOAD_STORE
+
+// --- pset1: broadcast one complex value into every {re, im} lane pair ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
+  const float re = numext::real(from);
+  const float im = numext::imag(from);
+  return PacketXcf(PacketXf{re, im, re, im});
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
+  const double re = numext::real(from);
+  const double im = numext::imag(from);
+  return PacketXcd(PacketXd{re, im});
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
+  const float re = numext::real(from);
+  const float im = numext::imag(from);
+  return PacketXcf(PacketXf{re, im, re, im, re, im, re, im});
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
+  const double re = numext::real(from);
+  const double im = numext::imag(from);
+  return PacketXcd(PacketXd{re, im, re, im});
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
+  const float re = numext::real(from);
+  const float im = numext::imag(from);
+  return PacketXcf(PacketXf{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im});
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
+  const double re = numext::real(from);
+  const double im = numext::imag(from);
+  return PacketXcd(PacketXd{re, im, re, im, re, im, re, im});
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// ----------- Unary ops ------------------
+#define DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, OP)                        \
+  template <>                                                             \
+  EIGEN_STRONG_INLINE PACKET_TYPE OP<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return PACKET_TYPE(OP(a.v));                                          \
+  }
+
+#define EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PACKET_TYPE)                                             \
+  DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, pnegate)                                                    \
+  DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, pzero)                                                      \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type pfirst<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return a[0];                                                                                     \
+  }                                                                                                  \
+  EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(PACKET_TYPE)
+
+EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcf);
+EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcd);
+
+// --- pconj: keep the even (real) lanes of a.v and take the odd (imaginary) lanes from -a.v ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 3));
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// Sub-packet pconj specializations needed for reductions.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
+}
+#endif
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+template <>
+EIGEN_STRONG_INLINE Packet4cf pconj<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pconj<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
+}
+#endif
+
+#undef DELEGATE_UNARY_TO_REAL_OP
+#undef EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS
+
+// Flip real and imaginary parts, i.e.  {re(a), im(a)} -> {im(a), re(a)}.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0));
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// Sub-packet pcplxflip specializations needed for reductions.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
+}
+#endif
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
+}
+#endif
+
+// Copy real to imaginary part, i.e. {re(a), im(a)} -> {re(a), re(a)}.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0));
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// Sub-packet pdupreal specializations needed for reductions.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdupreal<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
+}
+#endif
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+template <>
+EIGEN_STRONG_INLINE Packet4cf pdupreal<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pdupreal<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
+}
+#endif
+
+// Copy imaginary to real part, i.e. {re(a), im(a)} -> {im(a), im(a)}.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1));
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// Sub-packet pdupimag specializations needed for reductions.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdupimag<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
+}
+#endif
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+template <>
+EIGEN_STRONG_INLINE Packet4cf pdupimag<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pdupimag<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
+}
+#endif
+
+// --- ploaddup: {a0, a0, a1, a1, ...}; packets holding at most two complex values just broadcast from[0] ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
+  return pset1<PacketXcf>(*from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
+  return pset1<PacketXcd>(*from);
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
+  return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                            std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
+  return pset1<PacketXcd>(*from);
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
+  return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                            std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
+                            std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]),
+                            std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])});
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
+  return PacketXcd(PacketXd{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                            std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- ploadquad: each loaded complex value fills four consecutive packet slots (or the whole packet) ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
+  return pset1<PacketXcf>(*from);  // only from[0] is read
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
+  return pset1<PacketXcd>(*from);  // only from[0] is read
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
+  return pset1<PacketXcf>(*from);  // only from[0] is read
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
+  return pset1<PacketXcd>(*from);  // only from[0] is read
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
+  return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                            std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                            std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
+                            std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
+  return pset1<PacketXcd>(*from);  // only from[0] is read
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- preverse: reverse the order of the complex values, keeping each (real, imag) pair intact ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
+  // 2 complex floats: exchange float lanes (0,1) with lanes (2,3)
+  return PacketXcf(__builtin_shufflevector(a.v, a.v, 2, 3, 0, 1));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
+  // 1 complex double: identity
+  return a;
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
+  // 4 complex floats: view each pair as one double lane and reuse the PacketXd preverse
+  return PacketXcf(reinterpret_cast<PacketXf>(preverse(reinterpret_cast<PacketXd>(a.v))));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
+  // 2 complex doubles: exchange double lanes (0,1) with lanes (2,3)
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 2, 3, 0, 1));
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {  // same double-lane trick as at 32 bytes
+  return PacketXcf(reinterpret_cast<PacketXf>(preverse(reinterpret_cast<PacketXd>(a.v))));
+}
+template <>
+EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {  // 4 complex doubles: reverse lane pairs
+  return PacketXcd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1));
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// ----------- Binary ops: DELEGATE_BINARY_TO_REAL_OP applies OP to the wrapped real packets (.v) ------------------
+#define DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, OP)                                             \
+  template <>                                                                                   \
+  EIGEN_STRONG_INLINE PACKET_TYPE OP<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return PACKET_TYPE(OP(a.v, b.v));                                                           \
+  }
+
+#define EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PACKET_TYPE)                                            \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, psub)                                                      \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pand)                                                      \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, por)                                                       \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pxor)                                                      \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pandnot)                                                   \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pdiv<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {    \
+    return pdiv_complex(a, b);                                                                       \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_eq<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    const PACKET_TYPE t = PACKET_TYPE(pcmp_eq(a.v, b.v));                                            \
+    return PACKET_TYPE(pand(pdupreal(t).v, pdupimag(t).v));                                          \
+  }
+
+EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcf)  // no ';': the macro expands to complete function definitions
+EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcd)  // pcmp_eq ANDs the real-lane and imag-lane comparison masks
+
+// Binary ops (padd, pmul) that are needed on sub-packets for predux and predux_mul.
+#define EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PACKET_TYPE)                                 \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, padd)                                                   \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmul<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return pmul_complex(a, b);                                                                    \
+  }
+
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcf)  // no ';': expands to complete function definitions
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cf)
+#endif
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf)
+#endif
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcd)
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd)
+#endif
+
+#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE)                                                               \
+  template <>                                                                                                        \
+  EIGEN_STRONG_INLINE void pscatter(unpacket_traits<PACKET_TYPE>::type* to, const PACKET_TYPE& from, Index stride) { \
+    constexpr int size = unpacket_traits<PACKET_TYPE>::size;                                                         \
+    for (int i = 0; i < size; ++i) {                                                                                 \
+      to[i * stride] = from[i];                                                                                      \
+    }                                                                                                                \
+  }                                                                                                                  \
+  template <>                                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pgather<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>(                 \
+      const unpacket_traits<PACKET_TYPE>::type* from, Index stride) {                                                \
+    constexpr int size = unpacket_traits<PACKET_TYPE>::size;                                                         \
+    PACKET_TYPE result;                                                                                              \
+    for (int i = 0; i < size; ++i) {                                                                                 \
+      const unpacket_traits<PACKET_TYPE>::type from_i = from[i * stride];                                            \
+      result.v[2 * i] = numext::real(from_i);                                                                        \
+      result.v[2 * i + 1] = numext::imag(from_i);                                                                    \
+    }                                                                                                                \
+    return result;                                                                                                   \
+  }
+
+EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcf)  // strided element-by-element pscatter/pgather; no ';' after definitions
+EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcd)  // pgather stores real parts in even lanes of .v, imag parts in odd lanes
+#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
+
+#undef DELEGATE_BINARY_TO_REAL_OP
+#undef EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS
+#undef EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS
+
+// ------------ ternary ops: complex-float pselect, run on the operands reinterpreted as PacketXd -------------
+template <>
+EIGEN_STRONG_INLINE PacketXcf pselect<PacketXcf>(const PacketXcf& mask, const PacketXcf& a, const PacketXcf& b) {
+  return PacketXcf(reinterpret_cast<PacketXf>(
+      pselect(reinterpret_cast<PacketXd>(mask.v), reinterpret_cast<PacketXd>(a.v), reinterpret_cast<PacketXd>(b.v))));
+}
+
+// --- zip_in_place for complex: interleaves whole (real, imag) pairs of p1 and p2 ---
+namespace detail {
+
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
+  PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5);  // {p1[0], p2[0]} as complex values
+  p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7);  // {p1[1], p2[1]}
+  p1.v = tmp;  // written last because both shuffles read the original p1
+}
+// PacketXcd at 16 bytes has 1 element, no zip_in_place needed.
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
+  PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);  // interleaved lower halves
+  p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15);  // interleaved upper halves
+  p1.v = tmp;
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcd>(PacketXcd& p1, PacketXcd& p2) {
+  PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5);  // {p1[0], p2[0]}
+  p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7);  // {p1[1], p2[1]}
+  p1.v = tmp;
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {  // indices >= 16 select from p2
+  PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);
+  p2.v = __builtin_shufflevector(p1.v, p2.v, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31);
+  p1.v = tmp;
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcd>(PacketXcd& p1, PacketXcd& p2) {
+  PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);  // interleaved lower halves
+  p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15);  // interleaved upper halves
+  p1.v = tmp;
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+}  // namespace detail
+
+// --- ptranspose for complex: every overload forwards to detail::ptranspose_impl ---
+// PacketXcf: valid block sizes depend on kComplexFloatSize.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+
+// PacketXcd: valid block sizes depend on kComplexDoubleSize.
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcd, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcd, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcf, PacketXf)  // NOTE(review): macro defined outside this file; presumably
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcd, PacketXd)  // adds conj_helper support for complex*real packets - confirm
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_CLANG_H
diff --git a/Eigen/src/Core/arch/clang/MathFunctions.h b/Eigen/src/Core/arch/clang/MathFunctions.h
new file mode 100644
index 00000000000..c2afeda8fc1
--- /dev/null
+++ b/Eigen/src/Core/arch/clang/MathFunctions.h
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Rasmus Munk Larsen
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_CLANG_H
+#define EIGEN_MATH_FUNCTIONS_CLANG_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {  // float packets
+  return pfrexp_generic(a, exponent);  // shared implementation, not defined in this header
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pfrexp<PacketXd>(const PacketXd& a, PacketXd& exponent) {  // double packets
+  return pfrexp_generic(a, exponent);  // shared implementation, not defined in this header
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {  // float packets
+  return pldexp_generic(a, exponent);  // shared implementation, not defined in this header
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pldexp<PacketXd>(const PacketXd& a, const PacketXd& exponent) {  // double packets
+  return pldexp_generic(a, exponent);  // shared implementation, not defined in this header
+}
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf)   // NOTE(review): macros defined elsewhere; presumably emit the
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd)  // generic math-function specializations per packet - confirm
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_CLANG_H
diff --git a/Eigen/src/Core/arch/clang/PacketMath.h b/Eigen/src/Core/arch/clang/PacketMath.h
new file mode 100644
index 00000000000..491c0e2982f
--- /dev/null
+++ b/Eigen/src/Core/arch/clang/PacketMath.h
@@ -0,0 +1,1171 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Rasmus Munk Larsen
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_CLANG_H
+#define EIGEN_PACKET_MATH_CLANG_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+namespace detail {
+// namespace detail contains implementation details specific to this
+// file, while namespace internal contains internal APIs used elsewhere
+// in Eigen.
+template <typename ScalarT, int n>  // n = number of lanes; alignment below equals the full vector size in bytes
+using VectorType = ScalarT __attribute__((ext_vector_type(n), aligned(n * sizeof(ScalarT))));
+}  // namespace detail
+
+// --- Naming Convention ---
+// This backend uses size-independent type aliases so the same code works
+// for EIGEN_GENERIC_VECTOR_SIZE_BYTES in {16, 32, 64}:
+//
+//   PacketXf  - float vector   (4, 8, or 16 elements)
+//   PacketXd  - double vector  (2, 4, or 8 elements)
+//   PacketXi  - int32_t vector (4, 8, or 16 elements)
+//   PacketXl  - int64_t vector (2, 4, or 8 elements)
+//   PacketXcf - complex<float> vector  (2, 4, or 8 elements)  [in Complex.h]
+//   PacketXcd - complex<double> vector (1, 2, or 4 elements)  [in Complex.h]
+//
+// The "X" in each name indicates that the element count is determined at
+// compile time by the macro EIGEN_GENERIC_VECTOR_SIZE_BYTES. Operations that
+// require compile-time constant indices (e.g. __builtin_shufflevector) use
+// #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == ... blocks.
+
+static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 || EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 ||
+                  EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64,
+              "EIGEN_GENERIC_VECTOR_SIZE_BYTES must be 16, 32, or 64");
+
+constexpr int kFloatPacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(float);    // lane count, also used for int32_t
+constexpr int kDoublePacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(double);  // lane count, also used for int64_t
+using PacketXf = detail::VectorType<float, kFloatPacketSize>;
+using PacketXd = detail::VectorType<double, kDoublePacketSize>;
+using PacketXi = detail::VectorType<int32_t, kFloatPacketSize>;
+using PacketXl = detail::VectorType<int64_t, kDoublePacketSize>;
+
+// --- packet_traits specializations ---
+struct generic_float_packet_traits : default_packet_traits {  // flags shared by packet_traits<float> and <double>
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasRound = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasCmp = 1,
+    HasSet1 = 1,
+    HasCast = 1,
+    HasBitwise = 1,
+    HasRedux = 1,
+    HasSign = 1,
+    HasArg = 0,
+    HasConj = 1,
+    // Math functions (packet_traits<double> sets HasACos and HasASin back to 0)
+    HasReciprocal = 1,
+    HasSin = 1,
+    HasCos = 1,
+    HasTan = 1,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasTanh = 1,
+    HasErf = 1,
+    HasErfc = 1
+  };
+};
+
+template <>
+struct packet_traits<float> : generic_float_packet_traits {
+  using type = PacketXf;
+  using half = PacketXf;  // half aliases the full packet type
+  enum {
+    size = kFloatPacketSize,
+  };
+};
+
+template <>
+struct packet_traits<double> : generic_float_packet_traits {
+  using type = PacketXd;
+  using half = PacketXd;  // half aliases the full packet type
+  // Generic double-precision acos/asin are not yet implemented in
+  // GenericPacketMathFunctions.h (only float versions exist).
+  enum { size = kDoublePacketSize, HasACos = 0, HasASin = 0 };
+};
+
+struct generic_integer_packet_traits : default_packet_traits {  // flags shared by the int32_t and int64_t traits
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasCmp = 1,
+    HasSet1 = 1,
+    HasCast = 1,
+    HasBitwise = 1,
+    HasRedux = 1,
+    // Remaining flags: HasRound and HasConj stay enabled, all others listed here are disabled
+    HasRound = 1,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasReciprocal = 0,
+    HasArg = 0,
+    HasConj = 1,
+    HasExp = 0,
+    HasLog = 0,
+    HasSin = 0,
+    HasCos = 0,
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : generic_integer_packet_traits {
+  using type = PacketXi;
+  using half = PacketXi;  // half aliases the full packet type
+  enum {
+    size = kFloatPacketSize,  // same lane count as the float packet
+  };
+};
+
+template <>
+struct packet_traits<int64_t> : generic_integer_packet_traits {
+  using type = PacketXl;
+  using half = PacketXl;  // half aliases the full packet type
+  enum {
+    size = kDoublePacketSize,  // same lane count as the double packet
+  };
+};
+
+// --- unpacket_traits specializations ---
+struct generic_unpacket_traits : default_unpacket_traits {  // base of the four unpacket_traits specializations below
+  enum {
+    alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,  // full vector size in bytes
+    vectorizable = true,
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXf> : generic_unpacket_traits {
+  using type = float;
+  using half = PacketXf;  // half aliases the full packet type
+  using integer_packet = PacketXi;  // int32_t packet with the same lane count (kFloatPacketSize)
+  enum {
+    size = kFloatPacketSize,
+  };
+};
+template <>
+struct unpacket_traits<PacketXd> : generic_unpacket_traits {
+  using type = double;
+  using half = PacketXd;  // half aliases the full packet type
+  using integer_packet = PacketXl;  // int64_t packet with the same lane count (kDoublePacketSize)
+  enum {
+    size = kDoublePacketSize,
+  };
+};
+template <>
+struct unpacket_traits<PacketXi> : generic_unpacket_traits {
+  using type = int32_t;
+  using half = PacketXi;  // half aliases the full packet type
+  enum {
+    size = kFloatPacketSize,  // same lane count as PacketXf
+  };
+};
+template <>
+struct unpacket_traits<PacketXl> : generic_unpacket_traits {
+  using type = int64_t;
+  using half = PacketXl;  // half aliases the full packet type
+  enum {
+    size = kDoublePacketSize,  // same lane count as PacketXd
+  };
+};
+
+namespace detail {
+// --- vector type helpers ---
+template <typename VectorT>
+struct ScalarTypeOfVector {  // element type of a vector, taken from the type of the expression VectorT()[0]
+  using type = std::remove_all_extents_t<std::remove_reference_t<decltype(VectorT()[0])>>;
+};
+
+template <typename VectorT>
+using scalar_type_of_vector_t = typename ScalarTypeOfVector<VectorT>::type;  // shorthand for the helper above
+
+template <typename VectorType>
+struct UnsignedVectorHelper {  // builds the vector type with the same lane count but unsigned elements
+  static VectorType v;  // operand for __builtin_vectorelements on the next line
+  static constexpr int n = __builtin_vectorelements(v);
+  using UnsignedScalar = std::make_unsigned_t<scalar_type_of_vector_t<VectorType>>;
+  using type = UnsignedScalar __attribute__((ext_vector_type(n), aligned(n * sizeof(UnsignedScalar))));
+};
+
+template <typename VectorT>
+using unsigned_vector_t = typename UnsignedVectorHelper<VectorT>::type;  // shorthand for the helper above
+
+template <typename VectorT>  // vector with the same scalar type and size / 2 lanes
+using HalfPacket = VectorType<typename unpacket_traits<VectorT>::type, unpacket_traits<VectorT>::size / 2>;
+
+template <typename VectorT>  // vector with the same scalar type and size / 4 lanes
+using QuarterPacket = VectorType<typename unpacket_traits<VectorT>::type, unpacket_traits<VectorT>::size / 4>;
+
+// load and store helpers.
+template <typename VectorT>
+EIGEN_STRONG_INLINE VectorT load_vector_unaligned(const scalar_type_of_vector_t<VectorT>* from) {
+  VectorT loaded;  // filled bytewise below, so `from` is never dereferenced as a vector
+  __builtin_memcpy(&loaded, from, sizeof loaded);
+  return loaded;
+}
+
+template <typename VectorT>
+EIGEN_STRONG_INLINE VectorT load_vector_aligned(const scalar_type_of_vector_t<VectorT>* from) {
+  eigen_assert((std::uintptr_t(from) % alignof(VectorT) == 0) && "load_vector_aligned");  // caller must pass aligned ptr
+  return *reinterpret_cast<const VectorT*>(assume_aligned<alignof(VectorT)>(from));  // direct vector-typed read
+}
+
+template <typename VectorT>
+EIGEN_STRONG_INLINE void store_vector_unaligned(scalar_type_of_vector_t<VectorT>* to, const VectorT& from) {
+  __builtin_memcpy(to, &from, sizeof from);  // bytewise copy of the whole vector into `to`
+}
+
+template <typename VectorT>
+EIGEN_STRONG_INLINE void store_vector_aligned(scalar_type_of_vector_t<VectorT>* to, const VectorT& from) {
+  eigen_assert((std::uintptr_t(to) % alignof(VectorT) == 0) && "store_vector_aligned");  // caller must pass aligned ptr
+  *reinterpret_cast<VectorT*>(assume_aligned<alignof(VectorT)>(to)) = from;  // direct vector-typed write
+}
+
+}  // namespace detail
+
+// --- Intrinsic-like specializations ---
+
+// --- Load/Store operations: ploadu/pload/pstoreu/pstore forwarded to the detail:: load/store helpers ---
+#define EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PACKET_TYPE)                                                         \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE ploadu<PACKET_TYPE>(const detail::scalar_type_of_vector_t<PACKET_TYPE>* from) { \
+    return detail::load_vector_unaligned<PACKET_TYPE>(from);                                                      \
+  }                                                                                                               \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pload<PACKET_TYPE>(const detail::scalar_type_of_vector_t<PACKET_TYPE>* from) {  \
+    return detail::load_vector_aligned<PACKET_TYPE>(from);                                                        \
+  }                                                                                                               \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE void pstoreu<detail::scalar_type_of_vector_t<PACKET_TYPE>, PACKET_TYPE>(                    \
+      detail::scalar_type_of_vector_t<PACKET_TYPE> * to, const PACKET_TYPE& from) {                               \
+    detail::store_vector_unaligned<PACKET_TYPE>(to, from);                                                        \
+  }                                                                                                               \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE void pstore<detail::scalar_type_of_vector_t<PACKET_TYPE>, PACKET_TYPE>(                     \
+      detail::scalar_type_of_vector_t<PACKET_TYPE> * to, const PACKET_TYPE& from) {                               \
+    detail::store_vector_aligned<PACKET_TYPE>(to, from);                                                          \
+  }
+
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXf)
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXd)
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXi)
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXl)
+#undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET
+
+// --- Broadcast operation: pset1frombits fills every lane with the value whose bit pattern is `from` ---
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(uint32_t from) {
+  return PacketXf(numext::bit_cast<float>(from));  // bits -> float scalar, then broadcast
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pset1frombits<PacketXd>(uint64_t from) {
+  return PacketXd(numext::bit_cast<double>(from));  // bits -> double scalar, then broadcast
+}
+
+#define EIGEN_CLANG_PACKET_SET1(PACKET_TYPE)                                                            \
+  template <>                                                                                           \
+  EIGEN_STRONG_INLINE PACKET_TYPE pset1<PACKET_TYPE>(const unpacket_traits<PACKET_TYPE>::type& from) {  \
+    return PACKET_TYPE(from);                                                                           \
+  }                                                                                                     \
+  template <>                                                                                           \
+  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type pfirst<PACKET_TYPE>(const PACKET_TYPE& from) { \
+    return from[0];                                                                                     \
+  }
+
+EIGEN_CLANG_PACKET_SET1(PacketXf)  // defines pset1 (packet constructed from one scalar) and pfirst (lane 0)
+EIGEN_CLANG_PACKET_SET1(PacketXd)
+EIGEN_CLANG_PACKET_SET1(PacketXi)
+EIGEN_CLANG_PACKET_SET1(PacketXl)
+#undef EIGEN_CLANG_PACKET_SET1
+
+// --- Arithmetic operations defined by this macro: pisnan (a != a) and pnegate (-a) ---
+#define EIGEN_CLANG_PACKET_ARITHMETIC(PACKET_TYPE)                             \
+  template <>                                                                  \
+  EIGEN_STRONG_INLINE PACKET_TYPE pisnan<PACKET_TYPE>(const PACKET_TYPE& a) {  \
+    return reinterpret_cast<PACKET_TYPE>(a != a);                              \
+  }                                                                            \
+  template <>                                                                  \
+  EIGEN_STRONG_INLINE PACKET_TYPE pnegate<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return -a;                                                                 \
+  }
+
+EIGEN_CLANG_PACKET_ARITHMETIC(PacketXf)
+EIGEN_CLANG_PACKET_ARITHMETIC(PacketXd)
+EIGEN_CLANG_PACKET_ARITHMETIC(PacketXi)  // integer lanes never differ from themselves, so pisnan is all-zero here
+EIGEN_CLANG_PACKET_ARITHMETIC(PacketXl)  // same for 64-bit integer lanes
+#undef EIGEN_CLANG_PACKET_ARITHMETIC
+
+// --- Bitwise operations (via casting) ---
+
+namespace detail {
+
+// Reinterpret-cast helpers, equivalent to preinterpret<> but defined here
+// because PacketMath.h is included before TypeCasting.h. Each one is a plain
+EIGEN_STRONG_INLINE PacketXi preinterpret_float_to_int(const PacketXf& a) { return reinterpret_cast<PacketXi>(a); }
+EIGEN_STRONG_INLINE PacketXf preinterpret_int_to_float(const PacketXi& a) { return reinterpret_cast<PacketXf>(a); }
+EIGEN_STRONG_INLINE PacketXl preinterpret_double_to_long(const PacketXd& a) { return reinterpret_cast<PacketXl>(a); }
+EIGEN_STRONG_INLINE PacketXd preinterpret_long_to_double(const PacketXl& a) { return reinterpret_cast<PacketXd>(a); }
+// reinterpret_cast between the float/int32_t and double/int64_t vector types declared above.
+}  // namespace detail
+
+// Bitwise ops for integer packets: pzero/ptrue constants, and/or/xor/andnot, and compile-time shifts by N
+#define EIGEN_CLANG_PACKET_BITWISE_INT(PACKET_TYPE)                                                  \
+  template <>                                                                                        \
+  constexpr EIGEN_STRONG_INLINE PACKET_TYPE pzero<PACKET_TYPE>(const PACKET_TYPE& /*unused*/) {      \
+    return PACKET_TYPE(0);                                                                           \
+  }                                                                                                  \
+  template <>                                                                                        \
+  constexpr EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& /*unused*/) {      \
+    return numext::bit_cast<PACKET_TYPE>(PACKET_TYPE(0) == PACKET_TYPE(0));                          \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pand<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {    \
+    return a & b;                                                                                    \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE por<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {     \
+    return a | b;                                                                                    \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pxor<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {    \
+    return a ^ b;                                                                                    \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pandnot<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return a & ~b;                                                                                   \
+  }                                                                                                  \
+  template <int N>                                                                                   \
+  EIGEN_STRONG_INLINE PACKET_TYPE parithmetic_shift_right(const PACKET_TYPE& a) {                    \
+    return a >> N;                                                                                   \
+  }                                                                                                  \
+  template <int N>                                                                                   \
+  EIGEN_STRONG_INLINE PACKET_TYPE plogical_shift_right(const PACKET_TYPE& a) {                       \
+    using UnsignedT = detail::unsigned_vector_t<PACKET_TYPE>;                                        \
+    return reinterpret_cast<PACKET_TYPE>(reinterpret_cast<UnsignedT>(a) >> N);                       \
+  }                                                                                                  \
+  template <int N>                                                                                   \
+  EIGEN_STRONG_INLINE PACKET_TYPE plogical_shift_left(const PACKET_TYPE& a) {                        \
+    return a << N;                                                                                   \
+  }
+
+EIGEN_CLANG_PACKET_BITWISE_INT(PacketXi)  // plogical_shift_right shifts the unsigned reinterpretation of `a`
+EIGEN_CLANG_PACKET_BITWISE_INT(PacketXl)  // pandnot(a, b) computes a & ~b
+#undef EIGEN_CLANG_PACKET_BITWISE_INT
+
+// Bitwise ops for floating point packets: pand/por/pxor/pandnot operate on CAST_TO_INT(a), then CAST_FROM_INT.
+#define EIGEN_CLANG_PACKET_BITWISE_FLOAT(PACKET_TYPE, CAST_TO_INT, CAST_FROM_INT)                    \
+  template <>                                                                                        \
+  constexpr EIGEN_STRONG_INLINE PACKET_TYPE pzero<PACKET_TYPE>(const PACKET_TYPE& /*unused*/) {      \
+    using Scalar = detail::scalar_type_of_vector_t<PACKET_TYPE>;                                     \
+    return PACKET_TYPE(Scalar(0));                                                                   \
+  }                                                                                                  \
+  template <>                                                                                        \
+  constexpr EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& /* unused */) {    \
+    using Scalar = detail::scalar_type_of_vector_t<PACKET_TYPE>;                                     \
+    return numext::bit_cast<PACKET_TYPE>(PACKET_TYPE(Scalar(0)) == PACKET_TYPE(Scalar(0)));          \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pand<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {    \
+    return CAST_FROM_INT(CAST_TO_INT(a) & CAST_TO_INT(b));                                           \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE por<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {     \
+    return CAST_FROM_INT(CAST_TO_INT(a) | CAST_TO_INT(b));                                           \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pxor<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {    \
+    return CAST_FROM_INT(CAST_TO_INT(a) ^ CAST_TO_INT(b));                                           \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pandnot<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return CAST_FROM_INT(CAST_TO_INT(a) & ~CAST_TO_INT(b));                                          \
+  }
+
+EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXf, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float)
+EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXd, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double)
+#undef EIGEN_CLANG_PACKET_BITWISE_FLOAT
+
+// --- Comparison operations ---
+// Clang vector extensions compare lane-wise in the original type (float/double) and yield an
+// integer vector holding all-ones (-1) for true and all-zeros for false. The bit_cast reinterprets
+// that integer bitmask as a floating-point packet, the format expected by pselect and other Eigen
+// packet operations. pcmp_lt_or_nan tests !(a >= b), so a lane with a NaN operand also yields true.
+#define EIGEN_CLANG_PACKET_CMP(PACKET_TYPE, INT_PACKET_TYPE)                                                \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_eq<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {        \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(a == b));                                          \
+  }                                                                                                         \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_lt<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {        \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(a < b));                                           \
+  }                                                                                                         \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_le<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {        \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(a <= b));                                          \
+  }                                                                                                         \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_lt_or_nan<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(!(a >= b)));                                       \
+  }
+
+EIGEN_CLANG_PACKET_CMP(PacketXf, PacketXi)
+EIGEN_CLANG_PACKET_CMP(PacketXd, PacketXl)
+#undef EIGEN_CLANG_PACKET_CMP
+
+// --- Min/Max, abs and select operations (compiled only if __builtin_elementwise_min/max/abs exist) ---
+#if EIGEN_HAS_BUILTIN(__builtin_elementwise_min) && EIGEN_HAS_BUILTIN(__builtin_elementwise_max) && \
+    EIGEN_HAS_BUILTIN(__builtin_elementwise_abs)
+#define EIGEN_CLANG_PACKET_ELEMENTWISE(PACKET_TYPE)                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmin<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {                   \
+    /* Match NaN propagation of std::min. */                                                                        \
+    return a == a ? __builtin_elementwise_min(a, b) : a;                                                            \
+  }                                                                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmax<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {                   \
+    /* Match NaN propagation of std::max. */                                                                        \
+    return a == a ? __builtin_elementwise_max(a, b) : a;                                                            \
+  }                                                                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmin<PropagateNumbers, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return __builtin_elementwise_min(a, b);                                                                         \
+  }                                                                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmax<PropagateNumbers, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return __builtin_elementwise_max(a, b);                                                                         \
+  }                                                                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmin<PropagateNaN, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {     \
+    return a != a ? a : (b != b ? b : __builtin_elementwise_min(a, b));                                             \
+  }                                                                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmax<PropagateNaN, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {     \
+    return a != a ? a : (b != b ? b : __builtin_elementwise_max(a, b));                                             \
+  }                                                                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pabs<PACKET_TYPE>(const PACKET_TYPE& a) {                                         \
+    return __builtin_elementwise_abs(a);                                                                            \
+  }                                                                                                                 \
+  template <>                                                                                                       \
+  EIGEN_STRONG_INLINE PACKET_TYPE pselect<PACKET_TYPE>(const PACKET_TYPE& mask, const PACKET_TYPE& a,               \
+                                                       const PACKET_TYPE& b) {                                      \
+    return mask != 0 ? a : b;                                                                                       \
+  }
+
+EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXf)
+EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXd)
+EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXi)
+EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXl)
+#undef EIGEN_CLANG_PACKET_ELEMENTWISE
+#endif  // __builtin_elementwise_min/max/abs available
+
+// --- Math functions (float/double only); note that print is implemented with __builtin_elementwise_roundeven ---
+
+#if EIGEN_HAS_BUILTIN(__builtin_elementwise_floor) && EIGEN_HAS_BUILTIN(__builtin_elementwise_ceil) &&      \
+    EIGEN_HAS_BUILTIN(__builtin_elementwise_round) && EIGEN_HAS_BUILTIN(__builtin_elementwise_roundeven) && \
+    EIGEN_HAS_BUILTIN(__builtin_elementwise_trunc) && EIGEN_HAS_BUILTIN(__builtin_elementwise_sqrt)
+#define EIGEN_CLANG_PACKET_MATH_FLOAT(PACKET_TYPE)                            \
+  template <>                                                                 \
+  EIGEN_STRONG_INLINE PACKET_TYPE pfloor<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return __builtin_elementwise_floor(a);                                    \
+  }                                                                           \
+  template <>                                                                 \
+  EIGEN_STRONG_INLINE PACKET_TYPE pceil<PACKET_TYPE>(const PACKET_TYPE& a) {  \
+    return __builtin_elementwise_ceil(a);                                     \
+  }                                                                           \
+  template <>                                                                 \
+  EIGEN_STRONG_INLINE PACKET_TYPE pround<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return __builtin_elementwise_round(a);                                    \
+  }                                                                           \
+  template <>                                                                 \
+  EIGEN_STRONG_INLINE PACKET_TYPE print<PACKET_TYPE>(const PACKET_TYPE& a) {  \
+    return __builtin_elementwise_roundeven(a);                                \
+  }                                                                           \
+  template <>                                                                 \
+  EIGEN_STRONG_INLINE PACKET_TYPE ptrunc<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return __builtin_elementwise_trunc(a);                                    \
+  }                                                                           \
+  template <>                                                                 \
+  EIGEN_STRONG_INLINE PACKET_TYPE psqrt<PACKET_TYPE>(const PACKET_TYPE& a) {  \
+    return __builtin_elementwise_sqrt(a);                                     \
+  }
+
+EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXf)
+EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXd)
+#undef EIGEN_CLANG_PACKET_MATH_FLOAT
+#endif  // elementwise floor/ceil/round/roundeven/trunc/sqrt builtins available
+
+// --- Fused Multiply-Add (MADD): pmadd = a*b+c, pmsub = a*b-c, pnmadd = c-a*b, pnmsub = -(a*b+c) ---
+#if defined(__FMA__) && EIGEN_HAS_BUILTIN(__builtin_elementwise_fma)
+#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE)                                                      \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b,  \
+                                                     const PACKET_TYPE& c) {                      \
+    return __builtin_elementwise_fma(a, b, c);                                                    \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b,  \
+                                                     const PACKET_TYPE& c) {                      \
+    return __builtin_elementwise_fma(a, b, -c);                                                   \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pnmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
+                                                      const PACKET_TYPE& c) {                     \
+    return __builtin_elementwise_fma(-a, b, c);                                                   \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pnmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
+                                                      const PACKET_TYPE& c) {                     \
+    return -(__builtin_elementwise_fma(a, b, c));                                                 \
+  }
+#else
+// Fallback when __FMA__ is undefined or the FMA builtin is unavailable: separate multiply and add/subtract.
+#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE)                                                      \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b,  \
+                                                     const PACKET_TYPE& c) {                      \
+    return (a * b) + c;                                                                           \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b,  \
+                                                     const PACKET_TYPE& c) {                      \
+    return (a * b) - c;                                                                           \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pnmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
+                                                      const PACKET_TYPE& c) {                     \
+    return c - (a * b);                                                                           \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pnmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
+                                                      const PACKET_TYPE& c) {                     \
+    return -((a * b) + c);                                                                        \
+  }
+#endif  // __FMA__ && __builtin_elementwise_fma
+
+EIGEN_CLANG_PACKET_MADD(PacketXf)
+EIGEN_CLANG_PACKET_MADD(PacketXd)
+#undef EIGEN_CLANG_PACKET_MADD
+
+#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE)                                                               \
+  template <>                                                                                                        \
+  EIGEN_STRONG_INLINE void pscatter(unpacket_traits<PACKET_TYPE>::type* to, const PACKET_TYPE& from, Index stride) { \
+    /* Store lane k of `from` at to[k * stride]. */                                                                  \
+    constexpr int kLanes = unpacket_traits<PACKET_TYPE>::size;                                                       \
+    for (int lane = 0; lane < kLanes; ++lane)                                                                        \
+      to[lane * stride] = from[lane];                                                                                \
+  }                                                                                                                  \
+  template <>                                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pgather<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>(                 \
+      const unpacket_traits<PACKET_TYPE>::type* from, Index stride) {                                                \
+    /* Fill lane k of the returned packet from from[k * stride]. */                                                  \
+    constexpr int kLanes = unpacket_traits<PACKET_TYPE>::size;                                                       \
+    PACKET_TYPE gathered;                                                                                            \
+    for (int lane = 0; lane < kLanes; ++lane)                                                                        \
+      gathered[lane] = from[lane * stride];                                                                          \
+    return gathered;                                                                                                 \
+  }
+
+EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXf)
+EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXd)
+EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXi)
+EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXl)
+
+#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
+
+// ---- Various operations that depend on __builtin_shufflevector.
+#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
+namespace detail {
+template <typename Vec>  // 2 lanes -> {v[1], v[0]}
+EIGEN_STRONG_INLINE Vec preverse_impl_2(const Vec& v) {
+  return __builtin_shufflevector(v, v, 1, 0);
+}
+template <typename Vec>  // 4 lanes -> {v[3], ..., v[0]}
+EIGEN_STRONG_INLINE Vec preverse_impl_4(const Vec& v) {
+  return __builtin_shufflevector(v, v, 3, 2, 1, 0);
+}
+template <typename Vec>  // 8 lanes -> {v[7], ..., v[0]}
+EIGEN_STRONG_INLINE Vec preverse_impl_8(const Vec& v) {
+  return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+template <typename Vec>  // 16 lanes -> {v[15], ..., v[0]}
+EIGEN_STRONG_INLINE Vec preverse_impl_16(const Vec& v) {
+  return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+}  // namespace detail
+
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16  // 4 lanes for PacketXf/PacketXi, 2 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
+  return detail::preverse_impl_4(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
+  return detail::preverse_impl_2(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
+  return detail::preverse_impl_4(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
+  return detail::preverse_impl_2(a);
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32  // 8 lanes for PacketXf/PacketXi, 4 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
+  return detail::preverse_impl_8(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
+  return detail::preverse_impl_4(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
+  return detail::preverse_impl_8(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
+  return detail::preverse_impl_4(a);
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64: 16 lanes for PacketXf/PacketXi, 8 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
+  return detail::preverse_impl_16(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
+  return detail::preverse_impl_8(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
+  return detail::preverse_impl_16(a);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
+  return detail::preverse_impl_8(a);
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+namespace detail {
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploaddup2(const typename unpacket_traits<Packet>::type* from) {
+  static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
+  using Half = HalfPacket<Packet>;
+  const Half src = load_vector_unaligned<Half>(from);
+  return __builtin_shufflevector(src, src, 0, 0);  // {s0, s0}
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploaddup4(const typename unpacket_traits<Packet>::type* from) {
+  static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
+  using Half = HalfPacket<Packet>;
+  const Half src = load_vector_unaligned<Half>(from);
+  return __builtin_shufflevector(src, src, 0, 0, 1, 1);  // {s0, s0, s1, s1}
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploaddup8(const typename unpacket_traits<Packet>::type* from) {
+  static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
+  using Half = HalfPacket<Packet>;
+  const Half src = load_vector_unaligned<Half>(from);
+  return __builtin_shufflevector(src, src, 0, 0, 1, 1, 2, 2, 3, 3);  // each of s0..s3 twice
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits<Packet>::type* from) {
+  static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
+  using Half = HalfPacket<Packet>;
+  const Half src = load_vector_unaligned<Half>(from);
+  return __builtin_shufflevector(src, src, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);  // each of s0..s7 twice
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadquad4(const typename unpacket_traits<Packet>::type* from) {
+  static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
+  using Quarter = QuarterPacket<Packet>;
+  const Quarter src = load_vector_unaligned<Quarter>(from);
+  return __builtin_shufflevector(src, src, 0, 0, 0, 0);  // {s0, s0, s0, s0}
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits<Packet>::type* from) {
+  static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
+  using Quarter = QuarterPacket<Packet>;
+  const Quarter src = load_vector_unaligned<Quarter>(from);
+  return __builtin_shufflevector(src, src, 0, 0, 0, 0, 1, 1, 1, 1);  // s0 x4, then s1 x4
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits<Packet>::type* from) {
+  static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
+  using Quarter = QuarterPacket<Packet>;
+  const Quarter src = load_vector_unaligned<Quarter>(from);
+  return __builtin_shufflevector(src, src, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);  // each of s0..s3 x4
+}
+
+}  // namespace detail
+
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16  // ploaddup: each loaded element fills 2 lanes; ploadquad: 4 lanes
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
+  return detail::ploaddup4<PacketXf>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
+  return detail::ploaddup2<PacketXd>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
+  return detail::ploaddup4<PacketXi>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
+  return detail::ploaddup2<PacketXl>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
+  return detail::ploadquad4<PacketXf>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
+  return detail::ploadquad4<PacketXi>(from);
+}
+// No ploadquad for 2-element packets (PacketXd, PacketXl) at 16 bytes: a quad load needs at least 4 lanes.
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32  // 8 lanes for PacketXf/PacketXi, 4 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
+  return detail::ploaddup8<PacketXf>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
+  return detail::ploaddup4<PacketXd>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
+  return detail::ploaddup8<PacketXi>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
+  return detail::ploaddup4<PacketXl>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
+  return detail::ploadquad8<PacketXf>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
+  return detail::ploadquad4<PacketXd>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
+  return detail::ploadquad8<PacketXi>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const int64_t* from) {
+  return detail::ploadquad4<PacketXl>(from);
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64: 16 lanes for PacketXf/PacketXi, 8 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
+  return detail::ploaddup16<PacketXf>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
+  return detail::ploaddup8<PacketXd>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
+  return detail::ploaddup16<PacketXi>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
+  return detail::ploaddup8<PacketXl>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
+  return detail::ploadquad16<PacketXf>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
+  return detail::ploadquad8<PacketXd>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
+  return detail::ploadquad16<PacketXi>(from);
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const int64_t* from) {
+  return detail::ploadquad8<PacketXl>(from);
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- plset: lane k of the result holds a + k (k = 0 .. lanes-1) ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16  // 4 lanes for PacketXf/PacketXi, 2 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
+  return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
+  return PacketXd{a + 0.0, a + 1.0};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
+  return PacketXi{a + 0, a + 1, a + 2, a + 3};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
+  return PacketXl{a + 0, a + 1};
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32  // 8 lanes for PacketXf/PacketXi, 4 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
+  return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
+  return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
+  return PacketXi{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
+  return PacketXl{a + 0, a + 1, a + 2, a + 3};
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64: 16 lanes for PacketXf/PacketXi, 8 lanes for PacketXd/PacketXl
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
+  return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f,  a + 3.0f,  a + 4.0f,  a + 5.0f,  a + 6.0f,  a + 7.0f,
+                  a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
+  return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
+  return PacketXi{a + 0, a + 1, a + 2,  a + 3,  a + 4,  a + 5,  a + 6,  a + 7,
+                  a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
+  return PacketXl{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- peven_mask: even-indexed lanes have every bit set, odd-indexed lanes are zero ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
+  const float on = numext::bit_cast<float>(int32_t(-1));  // all 32 bits set
+  const float off = 0.0f;
+  return PacketXf{on, off, on, off};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
+  const double on = numext::bit_cast<double>(int64_t(-1l));  // all 64 bits set
+  const double off = 0.0;
+  return PacketXd{on, off};
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
+  const float on = numext::bit_cast<float>(int32_t(-1));  // all 32 bits set
+  const float off = 0.0f;
+  return PacketXf{on, off, on, off, on, off, on, off};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
+  const double on = numext::bit_cast<double>(int64_t(-1l));  // all 64 bits set
+  const double off = 0.0;
+  return PacketXd{on, off, on, off};
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
+  const float on = numext::bit_cast<float>(int32_t(-1));  // all 32 bits set
+  const float off = 0.0f;
+  return PacketXf{on, off, on, off, on, off, on, off,
+                  on, off, on, off, on, off, on, off};
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
+  const double on = numext::bit_cast<double>(int64_t(-1l));  // all 64 bits set
+  const double off = 0.0;
+  return PacketXd{on, off, on, off, on, off, on, off};
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// Helpers for ptranspose.
+namespace detail {
+
+template <typename Packet>  // interleave two 2-lane packets in place
+EIGEN_ALWAYS_INLINE void zip_in_place2(Packet& p1, Packet& p2) {
+  const Packet lower = __builtin_shufflevector(p1, p2, 0, 2);  // {p1[0], p2[0]}
+  p2 = __builtin_shufflevector(p1, p2, 1, 3);                  // {p1[1], p2[1]}
+  p1 = lower;
+}
+
+template <typename Packet>  // interleave two 4-lane packets in place
+EIGEN_ALWAYS_INLINE void zip_in_place4(Packet& p1, Packet& p2) {
+  const Packet lower = __builtin_shufflevector(p1, p2, 0, 4, 1, 5);  // {p1[0], p2[0], p1[1], p2[1]}
+  p2 = __builtin_shufflevector(p1, p2, 2, 6, 3, 7);                  // {p1[2], p2[2], p1[3], p2[3]}
+  p1 = lower;
+}
+
+template <typename Packet>  // interleave two 8-lane packets in place
+EIGEN_ALWAYS_INLINE void zip_in_place8(Packet& p1, Packet& p2) {
+  const Packet lower = __builtin_shufflevector(p1, p2, 0, 8, 1, 9, 2, 10, 3, 11);  // lanes 0..3 of p1/p2, alternating
+  p2 = __builtin_shufflevector(p1, p2, 4, 12, 5, 13, 6, 14, 7, 15);                // lanes 4..7 of p1/p2, alternating
+  p1 = lower;
+}
+
+template <typename Packet>  // interleave two 16-lane packets in place
+EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) {
+  const Packet lower = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  p1 = lower;  // p1 = lanes 0..7 of both inputs alternating, p2 = lanes 8..15 alternating
+}
+
+template <typename Packet>  // declared only; every supported packet type is specialized below
+void zip_in_place(Packet& p1, Packet& p2);
+
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16  // 4 lanes for PacketXf/PacketXi, 2 lanes for PacketXd/PacketXl
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
+  zip_in_place4(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
+  zip_in_place2(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
+  zip_in_place4(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
+  zip_in_place2(p1, p2);
+}
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32  // 8 lanes for PacketXf/PacketXi, 4 lanes for PacketXd/PacketXl
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
+  zip_in_place8(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
+  zip_in_place4(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
+  zip_in_place8(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
+  zip_in_place4(p1, p2);
+}
+#else   // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64: 16 lanes for PacketXf/PacketXi, 8 lanes for PacketXd/PacketXl
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
+  zip_in_place16(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
+  zip_in_place8(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
+  zip_in_place16(p1, p2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
+  zip_in_place8(p1, p2);
+}
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+template <typename Packet>  // Block of 2 packets: a single interleave of the pair.
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+
+template <typename Packet>  // Block of 4 packets: two rounds of pairwise interleaves.
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[2]);  // round 1: partners are 2 packets apart
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[0], kernel.packet[1]);  // round 2: partners are adjacent
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+}
+
+template <typename Packet>  // Block of 8 packets: three rounds of pairwise interleaves.
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[4]);  // round 1: partners are 4 packets apart
+  zip_in_place(kernel.packet[1], kernel.packet[5]);
+  zip_in_place(kernel.packet[2], kernel.packet[6]);
+  zip_in_place(kernel.packet[3], kernel.packet[7]);
+
+  zip_in_place(kernel.packet[0], kernel.packet[2]);  // round 2: partners are 2 packets apart
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[6]);
+  zip_in_place(kernel.packet[5], kernel.packet[7]);
+
+  zip_in_place(kernel.packet[0], kernel.packet[1]);  // round 3: partners are adjacent
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[5]);
+  zip_in_place(kernel.packet[6], kernel.packet[7]);
+}
+
+template <typename Packet>  // Block of 16 packets: four rounds of pairwise interleaves, written as loops.
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 4; ++i) {  // i: round index
+    const int m = (1 << i);      // number of groups in this round: 1, 2, 4, 8
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < m; ++j) {    // j: group index within the round
+      const int n = (1 << (3 - i));  // distance between zipped partners: 8, 4, 2, 1
+      EIGEN_UNROLL_LOOP
+      for (int k = 0; k < n; ++k) {
+        const int idx = 2 * j * n + k;  // group j spans packets [2*j*n, 2*j*n + 2*n)
+        zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
+      }
+    }
+  }
+}
+
+}  // namespace detail
+
+// ptranspose overloads: only emit valid block sizes per vector size. Each forwards to detail::ptranspose_impl.
+// At 16 bytes: float has 4 elems, double has 2 elems.
+// At 32 bytes: float has 8 elems, double has 4 elems.
+// At 64 bytes: float has 16 elems, double has 8 elems.
+
+// All sizes support PacketBlock<PacketXf, 2> and PacketBlock<PacketXf, 4>.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+// All sizes support PacketBlock<PacketXd, 2>.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+// All sizes support PacketBlock<PacketXi, 2> and PacketBlock<PacketXi, 4>.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+// All sizes support PacketBlock<PacketXl, 2>.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+// 32+ bytes: float has 8+ elems, double has 4+ elems. Adds the 8-packet (32-bit) and 4-packet (64-bit) blocks.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
+
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+// 64 bytes: float has 16 elems, double has 8 elems. Adds the 16-packet (32-bit) and 8-packet (64-bit) blocks.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
+#endif
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_CLANG_H
diff --git a/Eigen/src/Core/arch/clang/Reductions.h b/Eigen/src/Core/arch/clang/Reductions.h
new file mode 100644
index 00000000000..37fc1617fe1
--- /dev/null
+++ b/Eigen/src/Core/arch/clang/Reductions.h
@@ -0,0 +1,355 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Rasmus Munk Larsen
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUCTIONS_CLANG_H
+#define EIGEN_REDUCTIONS_CLANG_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+// --- Reductions: predux_min/predux_max use __builtin_reduce_{min,max}; predux_any is true if any lane != 0 ---
+#if EIGEN_HAS_BUILTIN(__builtin_reduce_min) && EIGEN_HAS_BUILTIN(__builtin_reduce_max) && \
+    EIGEN_HAS_BUILTIN(__builtin_reduce_or)
+#define EIGEN_CLANG_PACKET_REDUX_MINMAX(PACKET_TYPE)                                        \
+  template <>                                                                               \
+  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_min(const PACKET_TYPE& a) { \
+    return __builtin_reduce_min(a);                                                         \
+  }                                                                                         \
+  template <>                                                                               \
+  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_max(const PACKET_TYPE& a) { \
+    return __builtin_reduce_max(a);                                                         \
+  }                                                                                         \
+  template <>                                                                               \
+  EIGEN_STRONG_INLINE bool predux_any(const PACKET_TYPE& a) {                               \
+    return __builtin_reduce_or(a != 0) != 0;                                                \
+  }
+
+EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXf)  // instantiated for all four real packet types
+EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXd)
+EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXi)
+EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXl)
+#undef EIGEN_CLANG_PACKET_REDUX_MINMAX  // helper macro does not outlive this block
+#endif  // __builtin_reduce_min / __builtin_reduce_max / __builtin_reduce_or
+
+#if EIGEN_HAS_BUILTIN(__builtin_reduce_add) && EIGEN_HAS_BUILTIN(__builtin_reduce_mul)
+#define EIGEN_CLANG_PACKET_REDUX_INT(PACKET_TYPE)                                                        \
+  template <>                                                                                            \
+  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux<PACKET_TYPE>(const PACKET_TYPE& a) {     \
+    return __builtin_reduce_add(a);                                                                      \
+  }                                                                                                      \
+  template <>                                                                                            \
+  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_mul<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return __builtin_reduce_mul(a);                                                                      \
+  }
+
+// __builtin_reduce_{mul,add} are only defined for integer types, so only PacketXi and PacketXl use them.
+EIGEN_CLANG_PACKET_REDUX_INT(PacketXi)
+EIGEN_CLANG_PACKET_REDUX_INT(PacketXl)
+#undef EIGEN_CLANG_PACKET_REDUX_INT  // helper macro does not outlive this block
+#endif  // __builtin_reduce_add / __builtin_reduce_mul
+
+#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
+namespace detail {
+
+// Reduction helpers for different vector sizes.
+// Each returns a pair of (even-sum, odd-sum) or (even-product, odd-product).
+
+template <typename VectorT>  // 2 lanes: nothing to fold; lane 0 is the even sum and lane 1 the odd sum.
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd2(
+    const VectorT& a) {
+  return {a[0], a[1]};
+}
+
+template <typename VectorT>  // 4 lanes: add the upper half onto the lower half once (4 -> 2).
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd4(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1) + __builtin_shufflevector(a, a, 2, 3);  // {a0+a2, a1+a3}
+  return {t1[0], t1[1]};  // {sum of even lanes, sum of odd lanes}
+}
+
+template <typename VectorT>  // 8 lanes: add the upper half onto the lower half twice (8 -> 4 -> 2).
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3);
+  return {t2[0], t2[1]};  // {sum of even lanes, sum of odd lanes}
+}
+
+template <typename VectorT>  // 16 lanes: add the upper half onto the lower half three times (16 -> 8 -> 4 -> 2).
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd16(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7) +
+                  __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1, 2, 3) + __builtin_shufflevector(t1, t1, 4, 5, 6, 7);
+  const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) + __builtin_shufflevector(t2, t2, 2, 3);
+  return {t3[0], t3[1]};  // {sum of even lanes, sum of odd lanes}
+}
+
+template <typename VectorT>  // 2 lanes: nothing to fold; lane 0 is the even product and lane 1 the odd product.
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul2(
+    const VectorT& a) {
+  return {a[0], a[1]};
+}
+
+template <typename VectorT>  // 4 lanes: multiply the upper half into the lower half once (4 -> 2).
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul4(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1) * __builtin_shufflevector(a, a, 2, 3);  // {a0*a2, a1*a3}
+  return {t1[0], t1[1]};  // {product of even lanes, product of odd lanes}
+}
+
+template <typename VectorT>  // 8 lanes: multiply the upper half into the lower half twice (8 -> 4 -> 2).
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3);
+  return {t2[0], t2[1]};  // {product of even lanes, product of odd lanes}
+}
+
+template <typename VectorT>  // 16 lanes: multiply the upper half into the lower half three times (16 -> 8 -> 4 -> 2).
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul16(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7) *
+                  __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1, 2, 3) * __builtin_shufflevector(t1, t1, 4, 5, 6, 7);
+  const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) * __builtin_shufflevector(t2, t2, 2, 3);
+  return {t3[0], t3[1]};  // {product of even lanes, product of odd lanes}
+}
+}  // namespace detail
+
+// --- predux and predux_mul for float: combine the (even, odd) partial results of the detail helpers ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceAdd4(a);  // partial sums over the even and the odd lanes
+  return even + odd;
+}
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceMul4(a);  // partial products over the even and the odd lanes
+  return even * odd;
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceAdd8(a);  // partial sums over the even and the odd lanes
+  return even + odd;
+}
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceMul8(a);  // partial products over the even and the odd lanes
+  return even * odd;
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceAdd16(a);  // partial sums over the even and the odd lanes
+  return even + odd;
+}
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceMul16(a);  // partial products over the even and the odd lanes
+  return even * odd;
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- predux and predux_mul for double: same scheme as float, with half as many lanes per vector size ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceAdd2(a);  // 2 lanes: even = a[0], odd = a[1]
+  return even + odd;
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceMul2(a);  // 2 lanes: even = a[0], odd = a[1]
+  return even * odd;
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceAdd4(a);  // partial sums over the even and the odd lanes
+  return even + odd;
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceMul4(a);  // partial products over the even and the odd lanes
+  return even * odd;
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceAdd8(a);  // partial sums over the even and the odd lanes
+  return even + odd;
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceMul8(a);  // partial products over the even and the odd lanes
+  return even * odd;
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- predux for complex<float>: the even-lane sum of a.v becomes the real part, the odd-lane sum the imaginary ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
+  float re, im;
+  std::tie(re, im) = detail::ReduceAdd4(a.v);  // a.v is reduced as a 4-lane float vector
+  return std::complex<float>(re, im);
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
+  float re, im;
+  std::tie(re, im) = detail::ReduceAdd8(a.v);  // a.v is reduced as an 8-lane float vector
+  return std::complex<float>(re, im);
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
+  float re, im;
+  std::tie(re, im) = detail::ReduceAdd16(a.v);  // a.v is reduced as a 16-lane float vector
+  return std::complex<float>(re, im);
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- predux for complex<double>: the even-lane sum of a.v becomes the real part, the odd-lane sum the imaginary ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
+  // 1 complex double: just return it
+  return a[0];
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
+  double re, im;
+  std::tie(re, im) = detail::ReduceAdd4(a.v);  // a.v is reduced as a 4-lane double vector
+  return std::complex<double>(re, im);
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
+  double re, im;
+  std::tie(re, im) = detail::ReduceAdd8(a.v);  // a.v is reduced as an 8-lane double vector
+  return std::complex<double>(re, im);
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- predux_mul for complex<float>: halve the packet with pmul until two values remain, then multiply them ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
+  // 2 complex floats: just multiply them
+  return a[0] * a[1];
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
+  // 4 complex floats: split into 2+2, multiply, then scalar multiply
+  const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3));  // a.v lanes 0..3
+  const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7));  // a.v lanes 4..7
+  const Packet2cf prod2 = pmul<Packet2cf>(lower2, upper2);
+  return prod2[0] * prod2[1];
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
+  // 8 complex floats: 8->4->2->scalar
+  const Packet4cf lower4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7));
+  const Packet4cf upper4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15));
+  const Packet4cf prod4 = pmul<Packet4cf>(lower4, upper4);  // 8 -> 4 complex values
+  const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 0, 1, 2, 3));
+  const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 4, 5, 6, 7));
+  const Packet2cf prod2 = pmul<Packet2cf>(lower2, upper2);  // 4 -> 2 complex values
+  return prod2[0] * prod2[1];                               // 2 -> 1
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+// --- predux_mul for complex<double>: same halving scheme as complex<float>, with half as many values ---
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
+  // 1 complex double: just return it
+  return a[0];
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
+  // 2 complex doubles: just multiply them
+  return a[0] * a[1];
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
+  // 4 complex doubles: split into 2+2, multiply, then scalar multiply
+  const Packet2cd lower2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3));  // a.v lanes 0..3
+  const Packet2cd upper2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7));  // a.v lanes 4..7
+  const Packet2cd prod2 = pmul<Packet2cd>(lower2, upper2);
+  return prod2[0] * prod2[1];
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+
+#endif
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_REDUCTIONS_CLANG_H
diff --git a/Eigen/src/Core/arch/clang/TypeCasting.h b/Eigen/src/Core/arch/clang/TypeCasting.h
new file mode 100644
index 00000000000..75281b2a20d
--- /dev/null
+++ b/Eigen/src/Core/arch/clang/TypeCasting.h
@@ -0,0 +1,186 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Rasmus Munk Larsen
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_CLANG_H
+#define EIGEN_TYPE_CASTING_CLANG_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+//==============================================================================
+// preinterpret
+//==============================================================================
+template <>  // PacketXi -> PacketXf: reinterpret_cast between packet types, no __builtin_convertvector involved.
+EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
+  return reinterpret_cast<PacketXf>(a);
+}
+template <>  // PacketXf -> PacketXi: inverse of the specialization above.
+EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
+  return reinterpret_cast<PacketXi>(a);
+}
+
+template <>  // PacketXl -> PacketXd: the 64-bit counterpart of the int/float pair.
+EIGEN_STRONG_INLINE PacketXd preinterpret<PacketXd, PacketXl>(const PacketXl& a) {
+  return reinterpret_cast<PacketXd>(a);
+}
+template <>  // PacketXd -> PacketXl: inverse of the specialization above.
+EIGEN_STRONG_INLINE PacketXl preinterpret<PacketXl, PacketXd>(const PacketXd& a) {
+  return reinterpret_cast<PacketXl>(a);
+}
+
+//==============================================================================
+// pcast
+//==============================================================================
+#if EIGEN_HAS_BUILTIN(__builtin_convertvector)
+// Float-to-int conversions: __builtin_convertvector has UB for NaN/inf/
+// out-of-range inputs. Replace NaN with 0 before converting so that
+// pldexp_fast (which may pass NaN exponents) doesn't trigger UB.
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
+  const PacketXf safe = a == a ? a : PacketXf(0);  // a == a is false only in NaN lanes, which become 0
+  return __builtin_convertvector(safe, PacketXi);  // NOTE(review): inf/out-of-range lanes are not sanitized here
+}
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
+  return __builtin_convertvector(a, PacketXf);  // integer input: converted directly, no guard applied
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl pcast<PacketXd, PacketXl>(const PacketXd& a) {
+  const PacketXd safe = a == a ? a : PacketXd(0);  // a == a is false only in NaN lanes, which become 0
+  return __builtin_convertvector(safe, PacketXl);  // NOTE(review): inf/out-of-range lanes are not sanitized here
+}
+template <>
+EIGEN_STRONG_INLINE PacketXd pcast<PacketXl, PacketXd>(const PacketXl& a) {
+  return __builtin_convertvector(a, PacketXd);  // integer input: converted directly, no guard applied
+}
+
+// float -> double: converts lower half of floats to doubles
+// double -> float: converts two PacketXd to one PacketXf
+// int32 -> int64: converts lower half of int32s to int64s
+// int64 -> int32: converts two PacketXl to one PacketXi
+
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
+
+// float -> double: converts lower 2 floats to 2 doubles
+template <>
+EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
+  using HalfFloat = detail::VectorType<float, 2>;
+  HalfFloat lo = __builtin_shufflevector(a, a, 0, 1);  // keep lanes 0..1; lanes 2..3 are not converted
+  return __builtin_convertvector(lo, PacketXd);
+}
+
+// double -> float: converts two PacketXd (2 doubles each) to one PacketXf (4 floats)
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
+  using HalfFloat = detail::VectorType<float, 2>;
+  HalfFloat lo = __builtin_convertvector(a, HalfFloat);
+  HalfFloat hi = __builtin_convertvector(b, HalfFloat);
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3);  // concatenate: a's lanes first, then b's
+}
+
+// int32 -> int64: converts lower 2 int32s to 2 int64s
+template <>
+EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
+  using HalfInt = detail::VectorType<int32_t, 2>;
+  HalfInt lo = __builtin_shufflevector(a, a, 0, 1);  // keep lanes 0..1; lanes 2..3 are not converted
+  return __builtin_convertvector(lo, PacketXl);
+}
+
+// int64 -> int32: converts two PacketXl (2 int64s each) to one PacketXi (4 int32s)
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
+  using HalfInt = detail::VectorType<int32_t, 2>;
+  HalfInt lo = __builtin_convertvector(a, HalfInt);
+  HalfInt hi = __builtin_convertvector(b, HalfInt);
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3);  // concatenate: a's lanes first, then b's
+}
+
+#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
+
+// float -> double: converts lower 4 floats to 4 doubles
+template <>
+EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
+  using HalfFloat = detail::VectorType<float, 4>;
+  HalfFloat lo = __builtin_shufflevector(a, a, 0, 1, 2, 3);  // keep lanes 0..3; lanes 4..7 are not converted
+  return __builtin_convertvector(lo, PacketXd);
+}
+
+// double -> float: converts two PacketXd (4 doubles each) to one PacketXf (8 floats)
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
+  using HalfFloat = detail::VectorType<float, 4>;
+  HalfFloat lo = __builtin_convertvector(a, HalfFloat);
+  HalfFloat hi = __builtin_convertvector(b, HalfFloat);
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7);  // concatenate: a's lanes first, then b's
+}
+
+// int32 -> int64: converts lower 4 int32s to 4 int64s
+template <>
+EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
+  using HalfInt = detail::VectorType<int32_t, 4>;
+  HalfInt lo = __builtin_shufflevector(a, a, 0, 1, 2, 3);  // keep lanes 0..3; lanes 4..7 are not converted
+  return __builtin_convertvector(lo, PacketXl);
+}
+
+// int64 -> int32: converts two PacketXl (4 int64s each) to one PacketXi (8 int32s)
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
+  using HalfInt = detail::VectorType<int32_t, 4>;
+  HalfInt lo = __builtin_convertvector(a, HalfInt);
+  HalfInt hi = __builtin_convertvector(b, HalfInt);
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7);  // concatenate: a's lanes first, then b's
+}
+
+#else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
+
+// float -> double: converts lower 8 floats to 8 doubles
+template <>
+EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
+  using HalfFloat = detail::VectorType<float, 8>;
+  HalfFloat lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);  // lanes 8..15 are not converted
+  return __builtin_convertvector(lo, PacketXd);
+}
+
+// double -> float: converts two PacketXd (8 doubles each) to one PacketXf (16 floats)
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
+  using HalfFloat = detail::VectorType<float, 8>;
+  HalfFloat lo = __builtin_convertvector(a, HalfFloat);  // a fills result lanes 0..7
+  HalfFloat hi = __builtin_convertvector(b, HalfFloat);  // b fills result lanes 8..15
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+// int32 -> int64: converts lower 8 int32s to 8 int64s
+template <>
+EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
+  using HalfInt = detail::VectorType<int32_t, 8>;
+  HalfInt lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);  // lanes 8..15 are not converted
+  return __builtin_convertvector(lo, PacketXl);
+}
+
+// int64 -> int32: converts two PacketXl (8 int64s each) to one PacketXi (16 int32s)
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
+  using HalfInt = detail::VectorType<int32_t, 8>;
+  HalfInt lo = __builtin_convertvector(a, HalfInt);  // a fills result lanes 0..7
+  HalfInt hi = __builtin_convertvector(b, HalfInt);  // b fills result lanes 8..15
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+#endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES
+#endif
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_CLANG_H
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 0239262ae3c..8f7ad13e928 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -23,7 +23,7 @@ namespace internal {
  */
 template <typename DstScalar, typename SrcScalar>
 struct assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
+  EIGEN_DEVICE_FUNC constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
 
   template <int Alignment, typename Packet>
   EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
@@ -56,7 +56,7 @@ struct functor_traits<assign_op<DstScalar, SrcScalar>> {
 template <typename DstScalar, typename SrcScalar, typename Func>
 struct compound_assign_op {
   using traits = functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const {
     assign_op<DstScalar, DstScalar>().assignCoeff(a, Func().operator()(a, b));
   }
 
@@ -138,9 +138,9 @@ struct functor_traits<div_assign_op<DstScalar, SrcScalar>> : div_assign_op<DstSc
  */
 template <typename Scalar>
 struct swap_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const {
 #ifdef EIGEN_GPUCC
-    // FIXME is there some kind of cuda::swap?
+    // FIXME: check whether cuda::swap exists.
     Scalar t = b;
     const_cast<Scalar&>(b) = a;
     a = t;
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index a93b998b95e..96db1510285 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -36,7 +36,7 @@ struct scalar_sum_op : binary_op_base<LhsScalar, RhsScalar> {
 #ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
   scalar_sum_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type
   operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a + b;
   }
@@ -55,12 +55,13 @@ struct functor_traits<scalar_sum_op<LhsScalar, RhsScalar>> {
     Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2,  // rough estimate!
     PacketAccess =
         is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
-    // TODO vectorize mixed sum
+    // TODO: vectorize mixed sum
   };
 };
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op<bool, bool>::operator()(const bool& a, const bool& b) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE bool scalar_sum_op<bool, bool>::operator()(const bool& a,
+                                                                                           const bool& b) const {
   return a || b;
 }
 
@@ -75,7 +76,7 @@ struct scalar_product_op : binary_op_base<LhsScalar, RhsScalar> {
 #ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
   scalar_product_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type
   operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a * b;
   }
@@ -94,13 +95,13 @@ struct functor_traits<scalar_product_op<LhsScalar, RhsScalar>> {
     Cost = (int(NumTraits<LhsScalar>::MulCost) + int(NumTraits<RhsScalar>::MulCost)) / 2,  // rough estimate!
     PacketAccess =
         is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
-    // TODO vectorize mixed product
+    // TODO: vectorize mixed product
   };
 };
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op<bool, bool>::operator()(const bool& a,
-                                                                                     const bool& b) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE bool scalar_product_op<bool, bool>::operator()(const bool& a,
+                                                                                               const bool& b) const {
   return a && b;
 }
 
@@ -116,7 +117,7 @@ struct scalar_conj_product_op : binary_op_base<LhsScalar, RhsScalar> {
 
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_conj_product_op>::ReturnType result_type;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return conj_helper<LhsScalar, RhsScalar, Conj, false>().pmul(a, b);
   }
 
@@ -141,7 +142,7 @@ struct functor_traits<scalar_conj_product_op<LhsScalar, RhsScalar>> {
 template <typename LhsScalar, typename RhsScalar, int NaNPropagation>
 struct scalar_min_op : binary_op_base<LhsScalar, RhsScalar> {
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_min_op>::ReturnType result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return internal::pmin<NaNPropagation>(a, b);
   }
   template <typename Packet>
@@ -170,7 +171,7 @@ struct functor_traits<scalar_min_op<LhsScalar, RhsScalar, NaNPropagation>> {
 template <typename LhsScalar, typename RhsScalar, int NaNPropagation>
 struct scalar_max_op : binary_op_base<LhsScalar, RhsScalar> {
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_max_op>::ReturnType result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return internal::pmax<NaNPropagation>(a, b);
   }
   template <typename Packet>
@@ -207,21 +208,10 @@ struct functor_traits<scalar_cmp_op<LhsScalar, RhsScalar, cmp, UseTypedComparato
   };
 };
 
-template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
-struct typed_cmp_helper {
-  static constexpr bool SameType = is_same<LhsScalar, RhsScalar>::value;
-  static constexpr bool IsNumeric = is_arithmetic<typename NumTraits<LhsScalar>::Real>::value;
-  static constexpr bool UseTyped = UseTypedComparators && SameType && IsNumeric;
-  using type = typename conditional<UseTyped, LhsScalar, bool>::type;
-};
-
-template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
-using cmp_return_t = typename typed_cmp_helper<LhsScalar, RhsScalar, UseTypedComparators>::type;
-
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_EQ, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a == b ? result_type(1) : result_type(0);
   }
   template <typename Packet>
@@ -233,8 +223,8 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_EQ, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LT, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a < b ? result_type(1) : result_type(0);
   }
   template <typename Packet>
@@ -246,8 +236,8 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LT, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LE, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a <= b ? result_type(1) : result_type(0);
   }
   template <typename Packet>
@@ -259,8 +249,8 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LE, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GT, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a > b ? result_type(1) : result_type(0);
   }
   template <typename Packet>
@@ -272,8 +262,8 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GT, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GE, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a >= b ? result_type(1) : result_type(0);
   }
   template <typename Packet>
@@ -285,8 +275,8 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GE, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_UNORD, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return !(a <= b || b <= a) ? result_type(1) : result_type(0);
   }
   template <typename Packet>
@@ -298,8 +288,8 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_UNORD, UseTypedComparators> : bin
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_NEQ, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a != b ? result_type(1) : result_type(0);
   }
   template <typename Packet>
@@ -316,7 +306,7 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_NEQ, UseTypedComparators> : binar
  */
 template <typename Scalar>
 struct scalar_hypot_op<Scalar, Scalar> : binary_op_base<Scalar, Scalar> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x, const Scalar& y) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& x, const Scalar& y) const {
     // This functor is used by hypotNorm only for which it is faster to first apply abs
     // on all coefficients prior to reduction through hypot.
     // This way we avoid calling abs on positive and real entries, and this also permits
@@ -348,12 +338,12 @@ struct scalar_pow_op : binary_op_base<Scalar, Exponent> {
   }
 #endif
 
-  EIGEN_DEVICE_FUNC inline result_type operator()(const Scalar& a, const Exponent& b) const {
+  EIGEN_DEVICE_FUNC constexpr inline result_type operator()(const Scalar& a, const Exponent& b) const {
     return numext::pow(a, b);
   }
 
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
     return generic_pow(a, b);
   }
 };
@@ -362,11 +352,7 @@ template <typename Scalar, typename Exponent>
 struct functor_traits<scalar_pow_op<Scalar, Exponent>> {
   enum {
     Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger && packet_traits<Scalar>::HasExp &&
-                    packet_traits<Scalar>::HasLog && packet_traits<Scalar>::HasRound && packet_traits<Scalar>::HasCmp &&
-                    // Temporarily disable packet access for half/bfloat16 until
-                    // accuracy is improved.
-                    !is_same<Scalar, half>::value && !is_same<Scalar, bfloat16>::value)
+    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger && packet_traits<Scalar>::HasPow)
   };
 };
 
@@ -383,12 +369,12 @@ struct scalar_difference_op : binary_op_base<LhsScalar, RhsScalar> {
 #ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
   scalar_difference_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type
   operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a - b;
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
     return internal::psub(a, b);
   }
 };
@@ -403,13 +389,13 @@ struct functor_traits<scalar_difference_op<LhsScalar, RhsScalar>> {
 
 template <typename Packet, bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
 struct maybe_raise_div_by_zero {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) { EIGEN_UNUSED_VARIABLE(x); }
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void run(Packet x) { EIGEN_UNUSED_VARIABLE(x); }
 };
 
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template <typename Packet>
 struct maybe_raise_div_by_zero<Packet, true> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE void run(Packet x) {
     if (EIGEN_PREDICT_FALSE(predux_any(pcmp_eq(x, pzero(x))))) {
       // Use volatile variables to force a division by zero, which will
       // result in the default platform behaviour (usually SIGFPE).
@@ -432,12 +418,12 @@ struct scalar_quotient_op : binary_op_base<LhsScalar, RhsScalar> {
 #ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
   scalar_quotient_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type
   operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a / b;
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
     return internal::pdiv(a, b);
   }
 };
@@ -461,7 +447,7 @@ struct scalar_boolean_and_op {
   using result_type = Scalar;
   // `false` any value `a` that satisfies `a == Scalar(0)`
   // `true` is the complement of `false`
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
     return (a != Scalar(0)) && (b != Scalar(0)) ? Scalar(1) : Scalar(0);
   }
   template <typename Packet>
@@ -489,7 +475,7 @@ struct scalar_boolean_or_op {
   using result_type = Scalar;
   // `false` any value `a` that satisfies `a == Scalar(0)`
   // `true` is the complement of `false`
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
     return (a != Scalar(0)) || (b != Scalar(0)) ? Scalar(1) : Scalar(0);
   }
   template <typename Packet>
@@ -516,7 +502,7 @@ struct scalar_boolean_xor_op {
   using result_type = Scalar;
   // `false` any value `a` that satisfies `a == Scalar(0)`
   // `true` is the complement of `false`
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
     return (a != Scalar(0)) != (b != Scalar(0)) ? Scalar(1) : Scalar(0);
   }
   template <typename Packet>
@@ -538,19 +524,19 @@ template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct bitwise_binary_impl {
   static constexpr size_t Size = sizeof(Scalar);
   using uint_t = typename numext::get_integer_by_size<Size>::unsigned_type;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_and(const Scalar& a, const Scalar& b) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_and(const Scalar& a, const Scalar& b) {
     uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
     uint_t b_as_uint = numext::bit_cast<uint_t, Scalar>(b);
     uint_t result = a_as_uint & b_as_uint;
     return numext::bit_cast<Scalar, uint_t>(result);
   }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_or(const Scalar& a, const Scalar& b) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_or(const Scalar& a, const Scalar& b) {
     uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
     uint_t b_as_uint = numext::bit_cast<uint_t, Scalar>(b);
     uint_t result = a_as_uint | b_as_uint;
     return numext::bit_cast<Scalar, uint_t>(result);
   }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_xor(const Scalar& a, const Scalar& b) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_xor(const Scalar& a, const Scalar& b) {
     uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
     uint_t b_as_uint = numext::bit_cast<uint_t, Scalar>(b);
     uint_t result = a_as_uint ^ b_as_uint;
@@ -561,17 +547,17 @@ struct bitwise_binary_impl {
 template <typename Scalar>
 struct bitwise_binary_impl<Scalar, true> {
   using Real = typename NumTraits<Scalar>::Real;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_and(const Scalar& a, const Scalar& b) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_and(const Scalar& a, const Scalar& b) {
     Real real_result = bitwise_binary_impl<Real>::run_and(numext::real(a), numext::real(b));
     Real imag_result = bitwise_binary_impl<Real>::run_and(numext::imag(a), numext::imag(b));
     return Scalar(real_result, imag_result);
   }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_or(const Scalar& a, const Scalar& b) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_or(const Scalar& a, const Scalar& b) {
     Real real_result = bitwise_binary_impl<Real>::run_or(numext::real(a), numext::real(b));
     Real imag_result = bitwise_binary_impl<Real>::run_or(numext::imag(a), numext::imag(b));
     return Scalar(real_result, imag_result);
   }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_xor(const Scalar& a, const Scalar& b) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_xor(const Scalar& a, const Scalar& b) {
     Real real_result = bitwise_binary_impl<Real>::run_xor(numext::real(a), numext::real(b));
     Real imag_result = bitwise_binary_impl<Real>::run_xor(numext::imag(a), numext::imag(b));
     return Scalar(real_result, imag_result);
@@ -589,7 +575,7 @@ struct scalar_bitwise_and_op {
                       BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
   EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
   using result_type = Scalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
     return bitwise_binary_impl<Scalar>::run_and(a, b);
   }
   template <typename Packet>
@@ -613,7 +599,7 @@ struct scalar_bitwise_or_op {
                       BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
   EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
   using result_type = Scalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
     return bitwise_binary_impl<Scalar>::run_or(a, b);
   }
   template <typename Packet>
@@ -637,7 +623,7 @@ struct scalar_bitwise_xor_op {
                       BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
   EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
   using result_type = Scalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
     return bitwise_binary_impl<Scalar>::run_xor(a, b);
   }
   template <typename Packet>
@@ -661,12 +647,12 @@ struct scalar_absolute_difference_op : binary_op_base<LhsScalar, RhsScalar> {
 #ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
   scalar_absolute_difference_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type
   operator()(const LhsScalar& a, const RhsScalar& b) const {
     return numext::absdiff(a, b);
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
     return internal::pabsdiff(a, b);
   }
 };
@@ -686,7 +672,7 @@ struct scalar_atan2_op {
       is_same<LhsScalar, RhsScalar>::value && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex;
   EIGEN_STATIC_ASSERT(Enable, "LhsScalar and RhsScalar must be the same non-integer, non-complex type")
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& y, const Scalar& x) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& y, const Scalar& x) const {
     return numext::atan2(y, x);
   }
   template <typename Packet>
@@ -708,23 +694,21 @@ struct functor_traits<scalar_atan2_op<LhsScalar, RhsScalar>> {
 //---------- binary functors bound to a constant, thus appearing as a unary functor ----------
 
 // The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant
-// value. They are analogues to std::binder1st/binder2nd but with the following differences:
-//  - they are compatible with packetOp
-//  - they are portable across C++ versions (the std::binder* are deprecated in C++11)
+// value. They are analogous to std::binder1st/binder2nd (removed in C++17) and are also compatible with packetOp.
 template <typename BinaryOp>
 struct bind1st_op : BinaryOp {
   typedef typename BinaryOp::first_argument_type first_argument_type;
   typedef typename BinaryOp::second_argument_type second_argument_type;
   typedef typename BinaryOp::result_type result_type;
 
-  EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type& val) : m_value(val) {}
+  EIGEN_DEVICE_FUNC constexpr explicit bind1st_op(const first_argument_type& val) : m_value(val) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const second_argument_type& b) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const second_argument_type& b) const {
     return BinaryOp::operator()(m_value, b);
   }
 
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& b) const {
     return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b);
   }
 
@@ -739,14 +723,14 @@ struct bind2nd_op : BinaryOp {
   typedef typename BinaryOp::second_argument_type second_argument_type;
   typedef typename BinaryOp::result_type result_type;
 
-  EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type& val) : m_value(val) {}
+  EIGEN_DEVICE_FUNC constexpr explicit bind2nd_op(const second_argument_type& val) : m_value(val) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const first_argument_type& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const first_argument_type& a) const {
     return BinaryOp::operator()(a, m_value);
   }
 
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return BinaryOp::packetOp(a, internal::pset1<Packet>(m_value));
   }
 
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index 35dc73869a6..d8fc78a6ce1 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -19,10 +19,10 @@ namespace internal {
 
 template <typename Scalar>
 struct scalar_constant_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()() const { return m_other; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()() const { return m_other; }
   template <typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetOp() const {
     return internal::pset1<PacketType>(m_other);
   }
   const Scalar m_other;
@@ -38,10 +38,10 @@ struct functor_traits<scalar_constant_op<Scalar>> {
 
 template <typename Scalar>
 struct scalar_zero_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_zero_op() = default;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()() const { return Scalar(0); }
+  EIGEN_DEVICE_FUNC scalar_zero_op() = default;
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()() const { return Scalar(0); }
   template <typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetOp() const {
     return internal::pzero<PacketType>(PacketType());
   }
 };
@@ -51,7 +51,7 @@ struct functor_traits<scalar_zero_op<Scalar>> : functor_traits<scalar_constant_o
 template <typename Scalar>
 struct scalar_identity_op {
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType row, IndexType col) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(IndexType row, IndexType col) const {
     return row == col ? Scalar(1) : Scalar(0);
   }
 };
@@ -67,7 +67,7 @@ template <typename Scalar>
 struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
-  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps)
+  EIGEN_DEVICE_FUNC constexpr linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps)
       : m_low(low),
         m_high(high),
         m_size1(num_steps == 1 ? 1 : num_steps - 1),
@@ -75,7 +75,7 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
         m_flip(numext::abs(high) < numext::abs(low)) {}
 
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(IndexType i) const {
     if (m_flip)
       return (i == 0) ? m_low : Scalar(m_high - RealScalar(m_size1 - i) * m_step);
     else
@@ -83,7 +83,7 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
   }
 
   template <typename Packet, typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(IndexType i) const {
     // Principle:
     // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
     Packet low = pset1<Packet>(m_low);
@@ -111,7 +111,7 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
 
 template <typename Scalar>
 struct linspaced_op_impl<Scalar, /*IsInteger*/ true> {
-  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps)
+  EIGEN_DEVICE_FUNC constexpr linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps)
       : m_low(low),
         m_multiplier((high - low) / convert_index<Scalar>(num_steps <= 1 ? 1 : num_steps - 1)),
         m_divisor(convert_index<Scalar>((high >= low ? num_steps : -num_steps) + (high - low)) /
@@ -119,7 +119,7 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ true> {
         m_use_divisor(num_steps > 1 && (numext::abs(high - low) + 1) < num_steps) {}
 
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(IndexType i) const {
     if (m_use_divisor)
       return m_low + convert_index<Scalar>(i) / m_divisor;
     else
@@ -151,16 +151,16 @@ struct functor_traits<linspaced_op<Scalar>> {
 };
 template <typename Scalar>
 struct linspaced_op {
-  EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
+  EIGEN_DEVICE_FUNC constexpr linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
       : impl((num_steps == 1 ? high : low), high, num_steps) {}
 
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(IndexType i) const {
     return impl(i);
   }
 
   template <typename Packet, typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(IndexType i) const {
     return impl.template packetOp<Packet>(i);
   }
 
@@ -173,9 +173,9 @@ template <typename Scalar>
 struct equalspaced_op {
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
-  EIGEN_DEVICE_FUNC equalspaced_op(const Scalar& start, const Scalar& step) : m_start(start), m_step(step) {}
+  EIGEN_DEVICE_FUNC constexpr equalspaced_op(const Scalar& start, const Scalar& step) : m_start(start), m_step(step) {}
   template <typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(IndexType i) const {
     return m_start + m_step * static_cast<Scalar>(i);
   }
   template <typename Packet, typename IndexType>
diff --git a/Eigen/src/Core/functors/TernaryFunctors.h b/Eigen/src/Core/functors/TernaryFunctors.h
index 745779a137d..9a4ea48a6c3 100644
--- a/Eigen/src/Core/functors/TernaryFunctors.h
+++ b/Eigen/src/Core/functors/TernaryFunctors.h
@@ -21,12 +21,13 @@ namespace internal {
 
 template <typename ThenScalar, typename ElseScalar, typename ConditionScalar>
 struct scalar_boolean_select_op {
-  static constexpr bool ThenElseAreSame = is_same<ThenScalar, ElseScalar>::value;
+  static constexpr bool ThenElseAreSame =
+      is_same<std::remove_const_t<ThenScalar>, std::remove_const_t<ElseScalar>>::value;
   EIGEN_STATIC_ASSERT(ThenElseAreSame, THEN AND ELSE MUST BE SAME TYPE)
   using Scalar = ThenScalar;
   using result_type = Scalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const ThenScalar& a, const ElseScalar& b,
-                                                          const ConditionScalar& cond) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const ThenScalar& a, const ElseScalar& b,
+                                                                    const ConditionScalar& cond) const {
     return cond == ConditionScalar(0) ? b : a;
   }
   template <typename Packet>
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index ba7d97a038d..b92607d29a6 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -24,9 +24,9 @@ namespace internal {
  */
 template <typename Scalar>
 struct scalar_opposite_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::negate(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return numext::negate(a); }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return internal::pnegate(a);
   }
 };
@@ -43,9 +43,11 @@ struct functor_traits<scalar_opposite_op<Scalar>> {
 template <typename Scalar>
 struct scalar_abs_op {
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a) const { return numext::abs(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+    return numext::abs(a);
+  }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return internal::pabs(a);
   }
 };
@@ -71,7 +73,7 @@ template <typename Scalar, typename = void>
 struct abs_knowing_score {
   typedef typename NumTraits<Scalar>::Real result_type;
   template <typename Score>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a, const Score&) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scalar& a, const Score&) const {
     return numext::abs(a);
   }
 };
@@ -79,7 +81,7 @@ template <typename Scalar>
 struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs> {
   typedef typename NumTraits<Scalar>::Real result_type;
   template <typename Scal>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scal&, const result_type& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scal&, const result_type& a) const {
     return a;
   }
 };
@@ -92,35 +94,20 @@ struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_i
 template <typename Scalar>
 struct scalar_abs2_op {
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a) const { return numext::abs2(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+    return numext::abs2(a);
+  }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return internal::pmul(a, a);
   }
 };
 template <typename Scalar>
 struct functor_traits<scalar_abs2_op<Scalar>> {
-  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 };
-};
-
-template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct squared_norm_functor {
-  typedef Scalar result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
-    return Scalar(numext::real(a) * numext::real(a), numext::imag(a) * numext::imag(a));
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
-    return Packet(pmul(a.v, a.v));
-  }
-};
-template <typename Scalar>
-struct squared_norm_functor<Scalar, false> : scalar_abs2_op<Scalar> {};
-
-template <typename Scalar>
-struct functor_traits<squared_norm_functor<Scalar>> {
-  using Real = typename NumTraits<Scalar>::Real;
-  enum { Cost = NumTraits<Real>::MulCost, PacketAccess = packet_traits<Real>::HasMul };
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasMul && !NumTraits<Scalar>::IsComplex
+  };
 };
 
 /** \internal
@@ -130,9 +117,9 @@ struct functor_traits<squared_norm_functor<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_conjugate_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::conj(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return numext::conj(a); }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return internal::pconj(a);
   }
 };
@@ -160,9 +147,11 @@ struct functor_traits<scalar_conjugate_op<Scalar>> {
 template <typename Scalar>
 struct scalar_arg_op {
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a) const { return numext::arg(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+    return numext::arg(a);
+  }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return internal::parg(a);
   }
 };
@@ -182,11 +171,11 @@ struct functor_traits<scalar_arg_op<Scalar>> {
 template <typename Scalar>
 struct scalar_carg_op {
   using result_type = Scalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
     return Scalar(numext::arg(a));
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return pcarg(a);
   }
 };
@@ -204,7 +193,7 @@ struct functor_traits<scalar_carg_op<Scalar>> {
 template <typename Scalar, typename NewType>
 struct scalar_cast_op {
   typedef NewType result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE NewType operator()(const Scalar& a) const {
     return cast<Scalar, NewType>(a);
   }
 };
@@ -239,11 +228,11 @@ struct functor_traits<core_cast_op<SrcType, DstType>> {
  */
 template <typename Scalar, int N>
 struct scalar_shift_right_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
     return numext::arithmetic_shift_right(a);
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return internal::parithmetic_shift_right<N>(a);
   }
 };
@@ -259,11 +248,11 @@ struct functor_traits<scalar_shift_right_op<Scalar, N>> {
  */
 template <typename Scalar, int N>
 struct scalar_shift_left_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
     return numext::logical_shift_left(a);
   }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
     return internal::plogical_shift_left<N>(a);
   }
 };
@@ -280,7 +269,9 @@ struct functor_traits<scalar_shift_left_op<Scalar, N>> {
 template <typename Scalar>
 struct scalar_real_op {
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const { return numext::real(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+    return numext::real(a);
+  }
 };
 template <typename Scalar>
 struct functor_traits<scalar_real_op<Scalar>> {
@@ -295,7 +286,9 @@ struct functor_traits<scalar_real_op<Scalar>> {
 template <typename Scalar>
 struct scalar_imag_op {
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const { return numext::imag(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+    return numext::imag(a);
+  }
 };
 template <typename Scalar>
 struct functor_traits<scalar_imag_op<Scalar>> {
@@ -310,10 +303,12 @@ struct functor_traits<scalar_imag_op<Scalar>> {
 template <typename Scalar>
 struct scalar_real_ref_op {
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type& operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const result_type& operator()(const Scalar& a) const {
+    return numext::real_ref(a);
+  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type& operator()(Scalar& a) const {
     return numext::real_ref(a);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator()(Scalar& a) const { return numext::real_ref(a); }
 };
 template <typename Scalar>
 struct functor_traits<scalar_real_ref_op<Scalar>> {
@@ -328,8 +323,10 @@ struct functor_traits<scalar_real_ref_op<Scalar>> {
 template <typename Scalar>
 struct scalar_imag_ref_op {
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator()(Scalar& a) const { return numext::imag_ref(a); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type& operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type& operator()(Scalar& a) const {
+    return numext::imag_ref(a);
+  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const result_type& operator()(const Scalar& a) const {
     return numext::imag_ref(a);
   }
 };
@@ -346,7 +343,7 @@ struct functor_traits<scalar_imag_ref_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_exp_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return internal::pexp(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return internal::pexp(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pexp(a);
@@ -378,7 +375,7 @@ struct functor_traits<scalar_exp_op<Scalar>> {
 
 template <typename Scalar>
 struct scalar_exp2_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return internal::pexp2(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return internal::pexp2(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pexp2(a);
@@ -388,7 +385,7 @@ template <typename Scalar>
 struct functor_traits<scalar_exp2_op<Scalar>> {
   enum {
     PacketAccess = packet_traits<Scalar>::HasExp,
-    Cost = functor_traits<scalar_exp_op<Scalar>>::Cost  // TODO measure cost of exp2
+    Cost = functor_traits<scalar_exp_op<Scalar>>::Cost  // TODO: measure cost of exp2
   };
 };
 
@@ -400,7 +397,7 @@ struct functor_traits<scalar_exp2_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_expm1_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::expm1(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::expm1(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pexpm1(a);
@@ -410,7 +407,7 @@ template <typename Scalar>
 struct functor_traits<scalar_expm1_op<Scalar>> {
   enum {
     PacketAccess = packet_traits<Scalar>::HasExpm1,
-    Cost = functor_traits<scalar_exp_op<Scalar>>::Cost  // TODO measure cost of expm1
+    Cost = functor_traits<scalar_exp_op<Scalar>>::Cost  // TODO: measure cost of expm1
   };
 };
 
@@ -422,7 +419,7 @@ struct functor_traits<scalar_expm1_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_log_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::log(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::log(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::plog(a);
@@ -454,7 +451,7 @@ struct functor_traits<scalar_log_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_log1p_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::log1p(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::log1p(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::plog1p(a);
@@ -464,7 +461,7 @@ template <typename Scalar>
 struct functor_traits<scalar_log1p_op<Scalar>> {
   enum {
     PacketAccess = packet_traits<Scalar>::HasLog1p,
-    Cost = functor_traits<scalar_log_op<Scalar>>::Cost  // TODO measure cost of log1p
+    Cost = functor_traits<scalar_log_op<Scalar>>::Cost  // TODO: measure cost of log1p
   };
 };
 
@@ -476,7 +473,9 @@ struct functor_traits<scalar_log1p_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_log10_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const {
+    EIGEN_USING_STD(log10) return log10(a);
+  }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::plog10(a);
@@ -496,7 +495,7 @@ struct functor_traits<scalar_log10_op<Scalar>> {
 template <typename Scalar>
 struct scalar_log2_op {
   using RealScalar = typename NumTraits<Scalar>::Real;
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const {
     return Scalar(RealScalar(EIGEN_LOG2E)) * numext::log(a);
   }
   template <typename Packet>
@@ -515,7 +514,7 @@ struct functor_traits<scalar_log2_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_sqrt_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sqrt(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::sqrt(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::psqrt(a);
@@ -557,7 +556,7 @@ struct functor_traits<scalar_sqrt_op<bool>> {
  */
 template <typename Scalar>
 struct scalar_cbrt_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pcbrt(a);
@@ -575,7 +574,7 @@ struct functor_traits<scalar_cbrt_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_rsqrt_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::rsqrt(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::rsqrt(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::prsqrt(a);
@@ -593,7 +592,7 @@ struct functor_traits<scalar_rsqrt_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_cos_op {
-  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return numext::cos(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::cos(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pcos(a);
@@ -610,7 +609,7 @@ struct functor_traits<scalar_cos_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_sin_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sin(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::sin(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::psin(a);
@@ -627,7 +626,7 @@ struct functor_traits<scalar_sin_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_tan_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tan(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::tan(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::ptan(a);
@@ -644,7 +643,7 @@ struct functor_traits<scalar_tan_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_acos_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acos(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::acos(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pacos(a);
@@ -661,7 +660,7 @@ struct functor_traits<scalar_acos_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_asin_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asin(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::asin(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pasin(a);
@@ -678,7 +677,7 @@ struct functor_traits<scalar_asin_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_atan_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atan(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::atan(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::patan(a);
@@ -695,7 +694,7 @@ struct functor_traits<scalar_atan_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_tanh_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
     return ptanh(x);
@@ -730,7 +729,7 @@ struct functor_traits<scalar_tanh_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_atanh_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
     return patanh(x);
@@ -748,7 +747,7 @@ struct functor_traits<scalar_atanh_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_sinh_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sinh(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::sinh(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::psinh(a);
@@ -765,12 +764,16 @@ struct functor_traits<scalar_sinh_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_asinh_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pasinh(a);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<scalar_asinh_op<Scalar>> {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasASinh };
 };
 
 /** \internal
@@ -779,7 +782,7 @@ struct functor_traits<scalar_asinh_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_cosh_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cosh(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::cosh(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pcosh(a);
@@ -796,12 +799,16 @@ struct functor_traits<scalar_cosh_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_acosh_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pacosh(a);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<scalar_acosh_op<Scalar>> {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasACosh };
 };
 
 /** \internal
@@ -810,9 +817,9 @@ struct functor_traits<scalar_acosh_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_inverse_op {
-  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return Scalar(1) / a; }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return Scalar(1) / a; }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::preciprocal(a);
   }
 };
@@ -834,9 +841,9 @@ struct functor_traits<scalar_inverse_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_square_op {
-  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return a * a; }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return a * a; }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pmul(a, a);
   }
 };
@@ -850,7 +857,7 @@ template <>
 struct scalar_square_op<bool> {
   EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator()(const bool& a) const { return a; }
   template <typename Packet>
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return a;
   }
 };
@@ -865,9 +872,9 @@ struct functor_traits<scalar_square_op<bool>> {
  */
 template <typename Scalar>
 struct scalar_cube_op {
-  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return a * a * a; }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return a * a * a; }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pmul(a, pmul(a, a));
   }
 };
@@ -881,7 +888,7 @@ template <>
 struct scalar_cube_op<bool> {
   EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator()(const bool& a) const { return a; }
   template <typename Packet>
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return a;
   }
 };
@@ -896,7 +903,7 @@ struct functor_traits<scalar_cube_op<bool>> {
  */
 template <typename Scalar>
 struct scalar_round_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::round(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return numext::round(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pround(a);
@@ -916,7 +923,7 @@ struct functor_traits<scalar_round_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_floor_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::floor(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return numext::floor(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pfloor(a);
@@ -936,7 +943,7 @@ struct functor_traits<scalar_floor_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_rint_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::rint(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return numext::rint(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::print(a);
@@ -956,7 +963,7 @@ struct functor_traits<scalar_rint_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_ceil_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::ceil(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return numext::ceil(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::pceil(a);
@@ -976,7 +983,7 @@ struct functor_traits<scalar_ceil_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_trunc_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::trunc(a); }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return numext::trunc(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
     return internal::ptrunc(a);
@@ -996,7 +1003,7 @@ struct functor_traits<scalar_trunc_op<Scalar>> {
  */
 template <typename Scalar, bool UseTypedPredicate = false>
 struct scalar_isnan_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return numext::isnan(a);
 #else
@@ -1007,7 +1014,7 @@ struct scalar_isnan_op {
 
 template <typename Scalar>
 struct scalar_isnan_op<Scalar, true> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return (numext::isnan(a) ? ptrue(a) : pzero(a));
 #else
@@ -1031,7 +1038,7 @@ struct functor_traits<scalar_isnan_op<Scalar, UseTypedPredicate>> {
  */
 template <typename Scalar, bool UseTypedPredicate = false>
 struct scalar_isinf_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return numext::isinf(a);
 #else
@@ -1042,7 +1049,7 @@ struct scalar_isinf_op {
 
 template <typename Scalar>
 struct scalar_isinf_op<Scalar, true> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return (numext::isinf(a) ? ptrue(a) : pzero(a));
 #else
@@ -1065,7 +1072,7 @@ struct functor_traits<scalar_isinf_op<Scalar, UseTypedPredicate>> {
  */
 template <typename Scalar, bool UseTypedPredicate = false>
 struct scalar_isfinite_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return numext::isfinite(a);
 #else
@@ -1076,7 +1083,7 @@ struct scalar_isfinite_op {
 
 template <typename Scalar>
 struct scalar_isfinite_op<Scalar, true> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return (numext::isfinite(a) ? ptrue(a) : pzero(a));
 #else
@@ -1104,7 +1111,7 @@ struct scalar_boolean_not_op {
   using result_type = Scalar;
   // `false` any value `a` that satisfies `a == Scalar(0)`
   // `true` is the complement of `false`
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
     return a == Scalar(0) ? Scalar(1) : Scalar(0);
   }
   template <typename Packet>
@@ -1123,7 +1130,7 @@ template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct bitwise_unary_impl {
   static constexpr size_t Size = sizeof(Scalar);
   using uint_t = typename numext::get_integer_by_size<Size>::unsigned_type;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_not(const Scalar& a) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_not(const Scalar& a) {
     uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
     uint_t result = ~a_as_uint;
     return numext::bit_cast<Scalar, uint_t>(result);
@@ -1133,7 +1140,7 @@ struct bitwise_unary_impl {
 template <typename Scalar>
 struct bitwise_unary_impl<Scalar, true> {
   using Real = typename NumTraits<Scalar>::Real;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_not(const Scalar& a) {
+  static EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar run_not(const Scalar& a) {
     Real real_result = bitwise_unary_impl<Real>::run_not(numext::real(a));
     Real imag_result = bitwise_unary_impl<Real>::run_not(numext::imag(a));
     return Scalar(real_result, imag_result);
@@ -1151,7 +1158,7 @@ struct scalar_bitwise_not_op {
                       BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
   EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
   using result_type = Scalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
     return bitwise_unary_impl<Scalar>::run_not(a);
   }
   template <typename Packet>
@@ -1170,7 +1177,7 @@ struct functor_traits<scalar_bitwise_not_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_sign_op {
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sign(a); }
+  EIGEN_DEVICE_FUNC constexpr inline Scalar operator()(const Scalar& a) const { return numext::sign(a); }
 
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
@@ -1205,7 +1212,7 @@ struct scalar_logistic_op_impl {
 // Complex-valud implementation.
 template <typename T>
 struct scalar_logistic_op_impl<T, std::enable_if_t<NumTraits<T>::IsComplex>> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE T operator()(const T& x) const {
     const T e = numext::exp(x);
     return (numext::isinf)(numext::real(e)) ? T(1) : e / (e + T(1));
   }
@@ -1332,8 +1339,9 @@ struct scalar_unary_pow_op {
       internal::has_ReturnType<ScalarBinaryOpTraits<Scalar, ExponentScalar, scalar_unary_pow_op>>::value>::type
       PromotedExponent;
   typedef typename ScalarBinaryOpTraits<Scalar, PromotedExponent, scalar_unary_pow_op>::ReturnType result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent) : m_exponent(exponent) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent)
+      : m_exponent(exponent) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
     EIGEN_USING_STD(pow);
     return static_cast<result_type>(pow(a, m_exponent));
   }
@@ -1371,7 +1379,7 @@ struct scalar_unary_pow_op<Scalar, ExponentScalar, false, false, false, false> {
     check_is_representable();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
     EIGEN_USING_STD(pow);
     return static_cast<Scalar>(pow(a, m_exponent));
   }
@@ -1387,9 +1395,10 @@ struct scalar_unary_pow_op<Scalar, ExponentScalar, false, false, false, false> {
 
 template <typename Scalar, typename ExponentScalar, bool BaseIsInteger>
 struct scalar_unary_pow_op<Scalar, ExponentScalar, BaseIsInteger, true, false, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent) : m_exponent(exponent) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent)
+      : m_exponent(exponent) {}
   // TODO: error handling logic for complex^real_integer
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
     return unary_pow_impl<Scalar, ExponentScalar>::run(a, m_exponent);
   }
   template <typename Packet>
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index e72c6b48e41..8ed3d92f84a 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -13,6 +13,13 @@
 // IWYU pragma: private
 #include "../InternalHeaderCheck.h"
 
+// C4804: unsafe use of type 'bool' in operation. Unavoidable in generic code
+// instantiated with bool scalars (e.g. += and * on bool).
+#if EIGEN_COMP_MSVC
+#pragma warning(push)
+#pragma warning(disable : 4804)
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -57,6 +64,10 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 10
 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
 #endif
+#elif EIGEN_ARCH_ARM_OR_ARM64
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(1024 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
 #else
 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
@@ -70,7 +81,7 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512 *
 /** \internal */
 struct CacheSizes {
   CacheSizes() : m_l1(-1), m_l2(-1), m_l3(-1) {
-    int l1CacheSize, l2CacheSize, l3CacheSize;
+    std::ptrdiff_t l1CacheSize, l2CacheSize, l3CacheSize;
     queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
     m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
     m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
@@ -126,6 +137,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
 #ifdef EIGEN_VECTORIZE_AVX512
+  const std::ptrdiff_t phys_l1 = l1;
   // We need to find a rationale for that, but without this adjustment,
   // performance with AVX512 is pretty bad, like -20% slower.
   // One reason is that with increasing packet-size, the blocking size k
@@ -150,13 +162,13 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     // increasing the value of k, so we'll cap it at 320 (value determined
     // experimentally).
     // To avoid that k vanishes, we make k_cache at least as big as kr
-    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1 - ksub) / kdiv, 320));
+    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)(static_cast<Index>((l1 - ksub) / kdiv), 320));
     if (k_cache < k) {
       k = k_cache - (k_cache % kr);
       eigen_internal_assert(k > 0);
     }
 
-    const Index n_cache = (l2 - l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_cache = static_cast<Index>((l2 - l1) / (nr * sizeof(RhsScalar) * k));
     const Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
@@ -169,7 +181,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      const Index m_cache = (l3 - l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_cache = static_cast<Index>((l3 - l2) / (sizeof(LhsScalar) * k * num_threads));
       const Index m_per_thread = numext::div_ceil(m, num_threads);
       if (m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache - (m_cache % mr);
@@ -188,7 +200,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 #endif
 
     // Early return for small problems because the computation below are time consuming for small problems.
-    // Perhaps it would make more sense to consider k*n*m??
+    // Perhaps it would make more sense to consider k*n*m?
     // Note that for very tiny problem, this function should be bypassed anyway
     // because we use the coefficient-based implementation for them.
     if ((numext::maxi)(k, (numext::maxi)(m, n)) < 48) return;
@@ -207,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     // We also include a register-level block of the result (mx x nr).
     // (In an ideal world only the lhs panel would stay in L1)
     // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
-    const Index max_kc = numext::maxi<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1)), 1);
+    const Index max_kc = numext::maxi<Index>(static_cast<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1))), 1);
     const Index old_k = k;
     if (k > max_kc) {
       // We are really blocking on the third dimension:
@@ -219,33 +231,49 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       eigen_internal_assert(((old_k / k) == (old_k / max_kc)) && "the number of sweeps has to remain the same");
     }
 
+#ifdef EIGEN_VECTORIZE_AVX512
+    // The l1 *= 4 inflation above allows larger kc for better accumulator reuse,
+    // but can overfill the physical L1. Recompute max_kc using 85% of actual L1
+    // to leave headroom for RHS streaming, prefetch buffers, and stack.
+    {
+      const Index phys_l1_eff = phys_l1 * 85 / 100;
+      const Index max_kc_phys = numext::maxi<Index>(((phys_l1_eff - k_sub) / k_div) & (~(k_peeling - 1)), k_peeling);
+      if (max_kc_phys < k) {
+        k = (old_k % max_kc_phys) == 0 ? max_kc_phys
+                                       : max_kc_phys - k_peeling * ((max_kc_phys - 1 - (old_k % max_kc_phys)) /
+                                                                    (k_peeling * (old_k / max_kc_phys + 1)));
+      }
+    }
+#endif
+
 // ---- 2nd level of blocking on max(L2,L3), yields nc ----
 
-// TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
-//      actual_l2 = max(l2, l3/nb_core_sharing_l3)
-// The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
-// For instance, it corresponds to 6MB of L3 shared among 4 cores.
+// Estimate the effective per-core L2 capacity for 2nd-level blocking.
+// Use 1.5x the runtime-detected L2 size. The extra 50% accounts for data
+// that spills to L3 but remains accessible with low latency. This matches
+// the empirically-tuned constant (1.5MB) previously used when L2 was 1MB.
 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    const Index actual_l2 = l3;
+    const Index actual_l2 = static_cast<Index>(l3);
 #else
-    const Index actual_l2 = 1572864;  // == 1.5 MB
+    const Index actual_l2 = static_cast<Index>(l2 * 3 / 2);
 #endif
 
     // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
     // The second half is implicitly reserved to access the result and lhs coefficients.
-    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
-    // to limit this growth: we bound nc to growth by a factor x1.5.
+    // When k<max_kc, then nc can grow without bound. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc growth to a factor of 1.5x.
     // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
     // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
     Index max_nc;
     const Index lhs_bytes = m * k * sizeof(LhsScalar);
-    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
+    const Index remaining_l1 = static_cast<Index>(l1 - k_sub - lhs_bytes);
     if (remaining_l1 >= Index(Traits::nr * sizeof(RhsScalar)) * k) {
       // L1 blocking
       max_nc = remaining_l1 / (k * sizeof(RhsScalar));
     } else {
-      // L2 blocking
-      max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
+      // L2 blocking: use actual kc (k) rather than max_kc so that nc is not
+      // unnecessarily squeezed when k < max_kc (e.g. on CPUs with large L1).
+      max_nc = (3 * actual_l2) / (2 * 2 * k * sizeof(RhsScalar));
     }
     // WARNING Below, we assume that Traits::nr is a power of two.
     Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
@@ -256,21 +284,28 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
       n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1))));
     } else if (old_k == k) {
-      // So far, no blocking at all, i.e., kc==k, and nc==n.
-      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
-      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
-      // here should be obsolete.
+      // No k- or n-blocking happened yet (kc==depth, nc>=n). gebp already
+      // strip-chunks the packed lhs via its own `actual_panel_rows` budget,
+      // so cache residency is honored whatever mc we pick here. What this
+      // branch actually governs is the size of the `mc * kc` packing buffer
+      // (blockA) that the caller allocates — capping mc keeps it bounded for
+      // tall-m / small-k shapes, where leaving mc=m would allocate up to
+      // `rows * depth * sizeof(LhsScalar)`. A budget-based alternative
+      // (e.g. cap blockA at ~L3/4) is no faster in benchmarks and increases
+      // heap use, so the original L1/L2-residency tuning is kept.
       Index problem_size = k * n * sizeof(LhsScalar);
       Index actual_lm = actual_l2;
       Index max_mc = m;
       if (problem_size <= 1024) {
         // problem is small enough to keep in L1
         // Let's choose m such that lhs's block fit in 1/3 of L1
-        actual_lm = l1;
-      } else if (l3 != 0 && problem_size <= 32768) {
-        // we have both L2 and L3, and problem is small enough to be kept in L2
-        // Let's choose m such that lhs's block fit in 1/3 of L2
-        actual_lm = l2;
+        actual_lm = static_cast<Index>(l1);
+      } else if (l3 != 0 && problem_size <= l1) {
+        // We have both L2 and L3, and the rhs panel still fits in L1. Choose mc so the
+        // lhs block fits in 1/3 of L2 and avoid spilling into the L2+50% fallback band.
+        // The 32768 byte threshold previously used here was a stand-in for typical x86
+        // L1 size; using the runtime-detected l1 generalizes this to current cache sizes.
+        actual_lm = static_cast<Index>(l2);
         max_mc = (numext::mini<Index>)(576, max_mc);
       }
       Index mc = (numext::mini<Index>)(actual_lm / (3 * k * sizeof(LhsScalar)), max_mc);
@@ -293,9 +328,9 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) {
     return true;
   }
 #else
-  EIGEN_UNUSED_VARIABLE(k)
-  EIGEN_UNUSED_VARIABLE(m)
-  EIGEN_UNUSED_VARIABLE(n)
+  EIGEN_UNUSED_VARIABLE(k);
+  EIGEN_UNUSED_VARIABLE(m);
+  EIGEN_UNUSED_VARIABLE(n);
 #endif
   return false;
 }
@@ -443,7 +478,7 @@ class gebp_traits {
   typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
 
-  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) const { p = pset1<ResPacket>(ResScalar(0)); }
 
   template <typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
@@ -548,7 +583,7 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, P
 
   typedef ResPacket AccPacket;
 
-  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) const { p = pset1<ResPacket>(ResScalar(0)); }
 
   template <typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
@@ -571,8 +606,7 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, P
   }
 
   EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const {
-    // FIXME we can do better!
-    // what we want here is a ploadheight
+    // FIXME: replace with a dedicated ploadheight operation for more efficient quad loading.
     RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]};
     dest = ploadquad<RhsPacket>(tmp);
   }
@@ -642,25 +676,23 @@ DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Pack
   return res;
 }
 
-// note that for DoublePacket<RealPacket> the "4" in "downto4"
-// corresponds to the number of complexes, so it means "8"
-// it terms of real coefficients.
-
 template <typename Packet>
-const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
-                                               std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
+const DoublePacket<Packet>& predux_half(const DoublePacket<Packet>& a,
+                                        std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
   return a;
 }
 
 template <typename Packet>
-DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
-    const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) {
-  // yes, that's pretty hackish :(
+DoublePacket<typename unpacket_traits<Packet>::half> predux_half(
+    const DoublePacket<Packet>& a,
+    std::enable_if_t<unpacket_traits<Packet>::size >= 16 &&
+                     !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex>* = 0) {
+  // Workaround: reduce real packets to half size by reinterpreting as complex.
   DoublePacket<typename unpacket_traits<Packet>::half> res;
   typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
   typedef typename packet_traits<Cplx>::type CplxPacket;
-  res.first = predux_half_dowto4(CplxPacket(a.first)).v;
-  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+  res.first = predux_half(CplxPacket(a.first)).v;
+  res.second = predux_half(CplxPacket(a.second)).v;
   return res;
 }
 
@@ -675,7 +707,7 @@ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
 template <typename Scalar, typename RealPacket>
 void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                             std::enable_if_t<unpacket_traits<RealPacket>::size == 16>* = 0) {
-  // yes, that's pretty hackish too :(
+  // Workaround: load quad elements by reinterpreting real packets as complex.
   typedef typename NumTraits<Scalar>::Real RealScalar;
   RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
   RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
@@ -684,7 +716,7 @@ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
 }
 
 template <typename Packet>
-struct unpacket_traits<DoublePacket<Packet> > {
+struct unpacket_traits<DoublePacket<Packet>> {
   typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
   enum { size = 2 * unpacket_traits<Packet>::size };
 };
@@ -739,9 +771,9 @@ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_,
   // this actually holds 8 packets!
   typedef QuadPacket<RhsPacket> RhsPacketx4;
 
-  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+  EIGEN_STRONG_INLINE void initAcc(Scalar& p) const { p = Scalar(0); }
 
-  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) {
+  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) const {
     p.first = pset1<RealPacket>(RealScalar(0));
     p.second = pset1<RealPacket>(RealScalar(0));
   }
@@ -883,7 +915,7 @@ class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, P
   typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
 
-  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) const { p = pset1<ResPacket>(ResScalar(0)); }
 
   template <typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
@@ -1008,7 +1040,7 @@ struct gebp_kernel {
 
   EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows,
                                     Index depth, Index cols, ResScalar alpha, Index strideA = -1, Index strideB = -1,
-                                    Index offsetA = 0, Index offsetB = 0);
+                                    Index offsetA = 0, Index offsetB = 0) const;
 };
 
 template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
@@ -1027,7 +1059,7 @@ struct last_row_process_16_packets {
 
   EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
                                       const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
-                                      ResScalar alpha, SAccPacket& C0) {
+                                      ResScalar alpha, SAccPacket& C0) const {
     EIGEN_UNUSED_VARIABLE(res);
     EIGEN_UNUSED_VARIABLE(straits);
     EIGEN_UNUSED_VARIABLE(blA);
@@ -1055,7 +1087,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
 
   EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
                                       const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
-                                      ResScalar alpha, SAccPacket& C0) {
+                                      ResScalar alpha, SAccPacket& C0) const {
     typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
     typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
     typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
@@ -1067,7 +1099,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
     if (depth - endk > 0) {
       // We have to handle the last row(s) of the rhs, which
       // correspond to a half-packet
-      SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+      SAccPacketQuarter c0 = predux_half(predux_half(C0));
 
       for (Index kk = endk; kk < depth; kk++) {
         SLhsPacketQuarter a0;
@@ -1080,345 +1112,270 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
       }
       straits.acc(c0, alphav, R);
     } else {
-      straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+      straits.acc(predux_half(predux_half(C0)), alphav, R);
     }
     res.scatterPacket(i, j2, R);
   }
 };
 
-template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
-          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
-          typename LinearMapper, typename DataMapper>
-struct lhs_process_one_packet {
-  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
-
-  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
-                                             LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0, AccPacket* C0,
-                                             AccPacket* C1, AccPacket* C2, AccPacket* C3) {
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
-    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
-    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
-    traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
-    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
-    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
-    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
-    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
-#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
-    __asm__("" : "+x,m"(*A0));
-#endif
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+// Compile-time recursive helper: processes RHS columns J..NrCols-1 for gebp_micro_step.
+// For each column, loads/updates the RHS panel and does madd for all MrPackets LHS packets.
+// The bool partial specialization terminates the recursion without requiring if constexpr.
+template <int J, int MrPackets, int NrCols, bool Continue = (J < NrCols)>
+struct gebp_rhs_cols;
+
+// Base case: J >= NrCols, do nothing.
+template <int J, int MrPackets, int NrCols>
+struct gebp_rhs_cols<J, MrPackets, NrCols, false> {
+  template <typename GEBPTraits, typename LhsArray, typename RhsPanelType, typename RhsPacketType, typename AccArray,
+            typename RhsScalar>
+  static EIGEN_ALWAYS_INLINE void run(GEBPTraits&, const RhsScalar*, Index, LhsArray&, RhsPanelType&, RhsPacketType&,
+                                      AccArray&) {}
+};
+
+// Active case: J < NrCols.
+template <int J, int MrPackets, int NrCols>
+struct gebp_rhs_cols<J, MrPackets, NrCols, true> {
+  template <typename GEBPTraits, typename LhsArray, typename RhsPanelType, typename RhsPacketType, typename AccArray,
+            typename RhsScalar>
+  static EIGEN_ALWAYS_INLINE void run(GEBPTraits& traits, const RhsScalar* blB, Index rhs_offset, LhsArray& A,
+                                      RhsPanelType& rhs_panel, RhsPacketType& T0, AccArray& C) {
+    constexpr int lane = J % 4;
+    EIGEN_IF_CONSTEXPR(lane == 0)
+    traits.loadRhs(blB + (J + rhs_offset) * GEBPTraits::RhsProgress, rhs_panel);
+    else traits.updateRhs(blB + (J + rhs_offset) * GEBPTraits::RhsProgress, rhs_panel);
+
+    EIGEN_IF_CONSTEXPR(MrPackets >= 1) traits.madd(A[0], rhs_panel, C[J + 0 * NrCols], T0, fix<lane>);
+    EIGEN_IF_CONSTEXPR(MrPackets >= 2) traits.madd(A[1], rhs_panel, C[J + 1 * NrCols], T0, fix<lane>);
+    EIGEN_IF_CONSTEXPR(MrPackets >= 3) traits.madd(A[2], rhs_panel, C[J + 2 * NrCols], T0, fix<lane>);
+
+    gebp_rhs_cols<J + 1, MrPackets, NrCols>::run(traits, blB, rhs_offset, A, rhs_panel, T0, C);
   }
+};
 
-  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
-                                      ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB,
-                                      Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk,
-                                      Index cols, Index depth, Index packet_cols4) {
-    GEBPTraits traits;
-    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
-    // loops on each largest micro horizontal panel of lhs
-    // (LhsProgress x depth)
-    for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
-#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
-      EIGEN_IF_CONSTEXPR(nr >= 8) {
-        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
-          const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
-          prefetch(&blA[0]);
+// One step of the micro-kernel: loads MrPackets LHS packets at step K,
+// then processes NrCols RHS columns via gebp_rhs_cols.
+template <int K, int MrPackets, int NrCols>
+struct gebp_micro_step {
+  template <typename GEBPTraits, typename LhsScalar_, typename RhsScalar_, typename LhsArray, typename RhsPanelType,
+            typename RhsPacketType, typename AccArray>
+  static EIGEN_ALWAYS_INLINE void run(GEBPTraits& traits, const LhsScalar_* blA, const RhsScalar_* blB, LhsArray& A,
+                                      RhsPanelType& rhs_panel, RhsPacketType& T0, AccArray& C) {
+    constexpr int LhsProg = GEBPTraits::LhsProgress;
 
-          // gets res block as register
-          AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
-          traits.initAcc(C0);
-          traits.initAcc(C1);
-          traits.initAcc(C2);
-          traits.initAcc(C3);
-          traits.initAcc(C4);
-          traits.initAcc(C5);
-          traits.initAcc(C6);
-          traits.initAcc(C7);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-          LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
-          LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
-          LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
-          LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
-          r0.prefetch(prefetch_res_offset);
-          r1.prefetch(prefetch_res_offset);
-          r2.prefetch(prefetch_res_offset);
-          r3.prefetch(prefetch_res_offset);
-          r4.prefetch(prefetch_res_offset);
-          r5.prefetch(prefetch_res_offset);
-          r6.prefetch(prefetch_res_offset);
-          r7.prefetch(prefetch_res_offset);
-          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
-          prefetch(&blB[0]);
-
-          LhsPacket A0;
-          for (Index k = 0; k < peeled_kc; k += pk) {
-            RhsPacketx4 rhs_panel;
-            RhsPacket T0;
-#define EIGEN_GEBGP_ONESTEP(K)                                    \
-  do {                                                            \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8");    \
-    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0);          \
-    traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel);   \
-    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                   \
-    traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
-    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                   \
-    traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
-    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                   \
-    traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
-    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                   \
-    traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel);   \
-    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                   \
-    traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
-    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                   \
-    traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
-    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                   \
-    traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
-    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                   \
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8");      \
-  } while (false)
+    EIGEN_IF_CONSTEXPR(MrPackets >= 1) traits.loadLhs(&blA[(0 + MrPackets * K) * LhsProg], A[0]);
+    EIGEN_IF_CONSTEXPR(MrPackets >= 2) traits.loadLhs(&blA[(1 + MrPackets * K) * LhsProg], A[1]);
+    EIGEN_IF_CONSTEXPR(MrPackets >= 3) traits.loadLhs(&blA[(2 + MrPackets * K) * LhsProg], A[2]);
+
+    gebp_rhs_cols<0, MrPackets, NrCols>::run(traits, blB, Index(NrCols * K), A, rhs_panel, T0, C);
+  }
+};
+// Compiler register allocation workarounds for the GEBP micro-kernel.
+// GCC can fail to keep array-based SIMD values in vector registers, causing
+// excessive spilling. These helpers use inline asm constraints to pin values.
+// Only applied when the scalar type is actually vectorizable (not custom types).
+// See Eigen bugs 935, 1637, and 3059.
+
+// ARM64 NEON: pin 3 LHS packets in vector registers.
+// Old GCC (< 9) misallocates registers for 3-packet paths without this hint.
+template <int MrPackets, typename GEBPTraits_, typename FullLhsPacket_, typename LhsArray_>
+EIGEN_ALWAYS_INLINE void gebp_neon_3p_workaround(LhsArray_& A) {
+#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
+  using LhsElement = std::remove_all_extents_t<std::remove_reference_t<LhsArray_>>;
+  constexpr bool apply = GEBPTraits_::Vectorizable && MrPackets == 3 && std::is_same<LhsElement, FullLhsPacket_>::value;
+  EIGEN_IF_CONSTEXPR(apply) { __asm__("" : "+w,m"(A[0]), "+w,m"(A[1]), "+w,m"(A[2])); }
+#else
+  EIGEN_UNUSED_VARIABLE(A);
+#endif
+}
 
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");
+// GCC SSE: prevent register spilling for LHS packets and accumulators.
+// C++17: pin accumulators with strict "+x" (if constexpr discards dead branches).
+// C++14: pin LHS packets with relaxed "+x,m" (memory fallback for non-SSE types).
+template <int MrPackets, int NrCols, typename GEBPTraits_, typename FullLhsPacket_, typename LhsArray_,
+          typename AccArray_>
+EIGEN_ALWAYS_INLINE void gebp_sse_spilling_workaround(LhsArray_& A, AccArray_& ACC) {
+  EIGEN_UNUSED_VARIABLE(A);
+  EIGEN_UNUSED_VARIABLE(ACC);
+#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
+  using LhsElement = std::remove_all_extents_t<std::remove_reference_t<LhsArray_>>;
+  constexpr bool apply =
+      GEBPTraits_::Vectorizable && MrPackets <= 2 && NrCols >= 4 && std::is_same<LhsElement, FullLhsPacket_>::value;
+  EIGEN_IF_CONSTEXPR(apply) {
+#ifdef EIGEN_HAS_CXX17_IFCONSTEXPR
+    using AccElement = std::decay_t<decltype(ACC[0])>;
+    constexpr bool pin_acc = std::is_same<AccElement, FullLhsPacket_>::value && MrPackets == 2 && NrCols == 4;
+    if constexpr (pin_acc) {
+      __asm__(""
+              : "+x"(ACC[0]), "+x"(ACC[1]), "+x"(ACC[2]), "+x"(ACC[3]), "+x"(ACC[4]), "+x"(ACC[5]), "+x"(ACC[6]),
+                "+x"(ACC[7]));
+    }
+#else
+    EIGEN_IF_CONSTEXPR(MrPackets == 2) { __asm__("" : "+x,m"(A[0]), "+x,m"(A[1])); }
+#endif
+  }
+#endif
+}
 
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
+// Unrolled peeled loop body: calls gebp_micro_step for K=0..7, handling
+// double-accumulation for 1pX4, prefetches, and compiler workarounds.
+template <int MrPackets, int NrCols>
+struct gebp_peeled_loop {
+  template <typename GEBPTraits, typename LhsScalar_, typename RhsScalar_, typename LhsArray, typename RhsPanelType,
+            typename RhsPacketType, typename AccArray, typename AccArrayD, typename FullLhsPacket>
+  static EIGEN_ALWAYS_INLINE void run(GEBPTraits& traits, const LhsScalar_* blA, const RhsScalar_* blB, LhsArray& A,
+                                      RhsPanelType& rhs_panel, RhsPacketType& T0, AccArray& C, AccArrayD& D) {
+    constexpr bool use_double_accum = (MrPackets == 1 && NrCols == 4);
+
+    // Prefetch for 4-col paths
+    EIGEN_IF_CONSTEXPR(NrCols == 4) { internal::prefetch(blB + (48 + 0)); }
+
+    // Helper to do one step with workarounds
+#define EIGEN_GEBP_DO_STEP(KVAL, ACC)                                                       \
+  do {                                                                                      \
+    gebp_micro_step<KVAL, MrPackets, NrCols>::run(traits, blA, blB, A, rhs_panel, T0, ACC); \
+    gebp_neon_3p_workaround<MrPackets, GEBPTraits, FullLhsPacket>(A);                       \
+    gebp_sse_spilling_workaround<MrPackets, NrCols, GEBPTraits, FullLhsPacket>(A, ACC);     \
+    /* LHS prefetch for 2pX4 and 3pX4 */                                                    \
+    EIGEN_IF_CONSTEXPR((MrPackets == 2 || MrPackets == 3) && NrCols == 4) {                 \
+      internal::prefetch(blA + (MrPackets * KVAL + 16) * GEBPTraits::LhsProgress);          \
+      if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                                              \
+        internal::prefetch(blB + (NrCols * KVAL + 16) * GEBPTraits::RhsProgress);           \
+      }                                                                                     \
+    }                                                                                       \
+  } while (false)
 
-            blB += pk * 8 * RhsProgress;
-            blA += pk * (1 * LhsProgress);
+    EIGEN_IF_CONSTEXPR(use_double_accum) {
+      EIGEN_GEBP_DO_STEP(0, C);
+      EIGEN_GEBP_DO_STEP(1, D);
+      EIGEN_GEBP_DO_STEP(2, C);
+      EIGEN_GEBP_DO_STEP(3, D);
+      EIGEN_IF_CONSTEXPR(NrCols == 4) { internal::prefetch(blB + (48 + 16)); }
+      EIGEN_GEBP_DO_STEP(4, C);
+      EIGEN_GEBP_DO_STEP(5, D);
+      EIGEN_GEBP_DO_STEP(6, C);
+      EIGEN_GEBP_DO_STEP(7, D);
+    }
+    else {
+      EIGEN_GEBP_DO_STEP(0, C);
+      EIGEN_GEBP_DO_STEP(1, C);
+      EIGEN_GEBP_DO_STEP(2, C);
+      EIGEN_GEBP_DO_STEP(3, C);
+      EIGEN_IF_CONSTEXPR(NrCols == 4 && MrPackets == 2) { internal::prefetch(blB + (48 + 16)); }
+      EIGEN_GEBP_DO_STEP(4, C);
+      EIGEN_GEBP_DO_STEP(5, C);
+      EIGEN_GEBP_DO_STEP(6, C);
+      EIGEN_GEBP_DO_STEP(7, C);
+    }
 
-            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
-          }
-          // process remaining peeled loop
-          for (Index k = peeled_kc; k < depth; k++) {
-            RhsPacketx4 rhs_panel;
-            RhsPacket T0;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += 8 * RhsProgress;
-            blA += 1 * LhsProgress;
-          }
+#undef EIGEN_GEBP_DO_STEP
+  }
+};
 
-#undef EIGEN_GEBGP_ONESTEP
-
-          ResPacket R0, R1;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = r0.template loadPacket<ResPacket>(0);
-          R1 = r1.template loadPacket<ResPacket>(0);
-          traits.acc(C0, alphav, R0);
-          traits.acc(C1, alphav, R1);
-          r0.storePacket(0, R0);
-          r1.storePacket(0, R1);
-
-          R0 = r2.template loadPacket<ResPacket>(0);
-          R1 = r3.template loadPacket<ResPacket>(0);
-          traits.acc(C2, alphav, R0);
-          traits.acc(C3, alphav, R1);
-          r2.storePacket(0, R0);
-          r3.storePacket(0, R1);
-
-          R0 = r4.template loadPacket<ResPacket>(0);
-          R1 = r5.template loadPacket<ResPacket>(0);
-          traits.acc(C4, alphav, R0);
-          traits.acc(C5, alphav, R1);
-          r4.storePacket(0, R0);
-          r5.storePacket(0, R1);
-
-          R0 = r6.template loadPacket<ResPacket>(0);
-          R1 = r7.template loadPacket<ResPacket>(0);
-          traits.acc(C6, alphav, R0);
-          traits.acc(C7, alphav, R1);
-          r6.storePacket(0, R0);
-          r7.storePacket(0, R1);
-        }
-      }
+// Unified micro-panel function: handles a MrPackets x NrCols register block.
+// GEBPTraits determines the packet types (supports full/half/quarter sizes).
+// Accumulator layout: C[j + p * NrCols] for column j, LHS packet p.
+template <int MrPackets, int NrCols, typename GEBPTraits, typename LhsScalar_, typename RhsScalar_, typename ResScalar_,
+          typename Index_, typename DataMapper_, typename LinearMapper_, typename FullLhsPacket>
+EIGEN_ALWAYS_INLINE void gebp_micro_panel_impl(GEBPTraits& traits, const DataMapper_& res, const LhsScalar_* blockA,
+                                               const RhsScalar_* blockB, ResScalar_ alpha, Index_ i, Index_ j2,
+                                               Index_ depth, Index_ strideA, Index_ strideB, Index_ offsetA,
+                                               Index_ offsetB, int prefetch_res_offset, Index_ peeled_kc, int pk) {
+  using LhsPacketLocal = typename GEBPTraits::LhsPacket;
+  using RhsPacketLocal = typename GEBPTraits::RhsPacket;
+  using ResPacketLocal = typename GEBPTraits::ResPacket;
+  using AccPacketLocal = typename GEBPTraits::AccPacket;
+  using RhsPacketx4Local = typename GEBPTraits::RhsPacketx4;
+  constexpr int LhsProg = GEBPTraits::LhsProgress;
+  constexpr int RhsProg = GEBPTraits::RhsProgress;
+  constexpr int ResPacketSz = GEBPTraits::ResPacketSize;
+
+  // Determine RhsPanel type based on register pressure
+  using RhsPanelType = std::conditional_t<
+      NrCols == 1, RhsPacketLocal,
+      typename RhsPanelHelper<RhsPacketLocal, RhsPacketx4Local, MrPackets * NrCols + MrPackets>::type>;
+
+  const LhsScalar_* blA = &blockA[i * strideA + offsetA * (MrPackets * LhsProg)];
+  prefetch(&blA[0]);
+
+  // Accumulators: C[j + p * NrCols] for column j, LHS packet p.
+  // With if constexpr (C++17) we use exact sizes; with plain if (C++14) we pad
+  // to 3*NrCols so dead-branch array accesses in gebp_rhs_cols remain valid.
+#ifdef EIGEN_HAS_CXX17_IFCONSTEXPR
+  constexpr int CSize = MrPackets * NrCols;
+#else
+  constexpr int CSize = 3 * NrCols > MrPackets * NrCols ? 3 * NrCols : MrPackets * NrCols;
 #endif
+  alignas(AccPacketLocal) AccPacketLocal C[CSize];
+  for (int n = 0; n < MrPackets * NrCols; ++n) traits.initAcc(C[n]);
 
-      // loops on each largest micro vertical panel of rhs (depth * nr)
-      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
-        // We select a LhsProgress x nr micro block of res
-        // which is entirely stored into 1 x nr registers.
+  // Double accumulation for the 1pX4 path: the peeled loop alternates between C and D to break FMA dependency chains.
+  constexpr bool use_double_accum = (MrPackets == 1 && NrCols == 4);
+#ifdef EIGEN_HAS_CXX17_IFCONSTEXPR
+  alignas(AccPacketLocal) AccPacketLocal D[use_double_accum ? NrCols : 1];
+#else
+  // Without if constexpr, we must allocate a larger array to satisfy the
+  // compiler that D[n] is always in bounds for the use_double_accum path.
+  alignas(AccPacketLocal) AccPacketLocal D[CSize];
+#endif
+  EIGEN_IF_CONSTEXPR(use_double_accum) {
+    for (int n = 0; n < NrCols; ++n) traits.initAcc(D[n]);
+  }
 
-        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
-        prefetch(&blA[0]);
+  // Prefetch result memory
+  for (int j = 0; j < NrCols; ++j) res.getLinearMapper(i, j2 + j).prefetch(NrCols > 1 ? prefetch_res_offset : 0);
 
-        // gets res block as register
-        AccPacket C0, C1, C2, C3;
-        traits.initAcc(C0);
-        traits.initAcc(C1);
-        traits.initAcc(C2);
-        traits.initAcc(C3);
-        // To improve instruction pipelining, let's double the accumulation registers:
-        //  even k will accumulate in C*, while odd k will accumulate in D*.
-        // This trick is crucial to get good performance with FMA, otherwise it is
-        // actually faster to perform separated MUL+ADD because of a naturally
-        // better instruction-level parallelism.
-        AccPacket D0, D1, D2, D3;
-        traits.initAcc(D0);
-        traits.initAcc(D1);
-        traits.initAcc(D2);
-        traits.initAcc(D3);
-
-        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-
-        r0.prefetch(prefetch_res_offset);
-        r1.prefetch(prefetch_res_offset);
-        r2.prefetch(prefetch_res_offset);
-        r3.prefetch(prefetch_res_offset);
-
-        // performs "inner" products
-        const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
-        prefetch(&blB[0]);
-        LhsPacket A0, A1;
-
-        for (Index k = 0; k < peeled_kc; k += pk) {
-          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
-          RhsPacketx4 rhs_panel;
-          RhsPacket T0;
-
-          internal::prefetch(blB + (48 + 0));
-          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-          internal::prefetch(blB + (48 + 16));
-          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-
-          blB += pk * 4 * RhsProgress;
-          blA += pk * LhsProgress;
-
-          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
-        }
-        C0 = padd(C0, D0);
-        C1 = padd(C1, D1);
-        C2 = padd(C2, D2);
-        C3 = padd(C3, D3);
-
-        // process remaining peeled loop
-        for (Index k = peeled_kc; k < depth; k++) {
-          RhsPacketx4 rhs_panel;
-          RhsPacket T0;
-          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          blB += 4 * RhsProgress;
-          blA += LhsProgress;
-        }
+  // RHS pointer
+  const RhsScalar_* blB = &blockB[j2 * strideB + offsetB * NrCols];
+  prefetch(&blB[0]);
 
-        ResPacket R0, R1;
-        ResPacket alphav = pset1<ResPacket>(alpha);
-
-        R0 = r0.template loadPacket<ResPacket>(0);
-        R1 = r1.template loadPacket<ResPacket>(0);
-        traits.acc(C0, alphav, R0);
-        traits.acc(C1, alphav, R1);
-        r0.storePacket(0, R0);
-        r1.storePacket(0, R1);
-
-        R0 = r2.template loadPacket<ResPacket>(0);
-        R1 = r3.template loadPacket<ResPacket>(0);
-        traits.acc(C2, alphav, R0);
-        traits.acc(C3, alphav, R1);
-        r2.storePacket(0, R0);
-        r3.storePacket(0, R1);
-      }
+  // LHS packet staging area: exact size with if constexpr (C++17), otherwise 3 slots, mirroring the padding of C.
+#ifdef EIGEN_HAS_CXX17_IFCONSTEXPR
+  alignas(LhsPacketLocal) LhsPacketLocal A[MrPackets];
+#else
+  alignas(LhsPacketLocal) LhsPacketLocal A[3];
+#endif
 
-      // Deal with remaining columns of the rhs
-      for (Index j2 = packet_cols4; j2 < cols; j2++) {
-        // One column at a time
-        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
-        prefetch(&blA[0]);
+  // ---- Peeled k-loop (pk=8 unrolled) ----
+  for (Index_ k = 0; k < peeled_kc; k += pk) {
+    alignas(RhsPanelType) RhsPanelType rhs_panel;
+    alignas(RhsPacketLocal) RhsPacketLocal T0;
 
-        // gets res block as register
-        AccPacket C0;
-        traits.initAcc(C0);
+    gebp_peeled_loop<MrPackets, NrCols>::template run<GEBPTraits, LhsScalar_, RhsScalar_, decltype(A), RhsPanelType,
+                                                      RhsPacketLocal, decltype(C), decltype(D), FullLhsPacket>(
+        traits, blA, blB, A, rhs_panel, T0, C, D);
 
-        LinearMapper r0 = res.getLinearMapper(i, j2);
+    blB += pk * NrCols * RhsProg;
+    blA += pk * MrPackets * LhsProg;
+  }
 
-        // performs "inner" products
-        const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
-        LhsPacket A0;
-
-        for (Index k = 0; k < peeled_kc; k += pk) {
-          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
-          RhsPacket B_0;
-
-#define EIGEN_GEBGP_ONESTEP(K)                                             \
-  do {                                                                     \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
-    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");    \
-    /* FIXME: why unaligned???? */                                         \
-    traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0);          \
-    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                      \
-    traits.madd(A0, B_0, C0, B_0, fix<0>);                                 \
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1");   \
-  } while (false);
-
-          EIGEN_GEBGP_ONESTEP(0);
-          EIGEN_GEBGP_ONESTEP(1);
-          EIGEN_GEBGP_ONESTEP(2);
-          EIGEN_GEBGP_ONESTEP(3);
-          EIGEN_GEBGP_ONESTEP(4);
-          EIGEN_GEBGP_ONESTEP(5);
-          EIGEN_GEBGP_ONESTEP(6);
-          EIGEN_GEBGP_ONESTEP(7);
-
-          blB += pk * RhsProgress;
-          blA += pk * LhsProgress;
-
-          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
-        }
+  // Merge double accumulators
+  EIGEN_IF_CONSTEXPR(use_double_accum) {
+    for (int n = 0; n < NrCols; ++n) C[n] = padd(C[n], D[n]);
+  }
 
-        // process remaining peeled loop
-        for (Index k = peeled_kc; k < depth; k++) {
-          RhsPacket B_0;
-          EIGEN_GEBGP_ONESTEP(0);
-          blB += RhsProgress;
-          blA += LhsProgress;
-        }
-#undef EIGEN_GEBGP_ONESTEP
-        ResPacket R0;
-        ResPacket alphav = pset1<ResPacket>(alpha);
-        R0 = r0.template loadPacket<ResPacket>(0);
-        traits.acc(C0, alphav, R0);
-        r0.storePacket(0, R0);
-      }
-    }
+  // ---- Remainder k-loop ----
+  for (Index_ k = peeled_kc; k < depth; k++) {
+    alignas(RhsPanelType) RhsPanelType rhs_panel;
+    alignas(RhsPacketLocal) RhsPacketLocal T0;
+
+    gebp_micro_step<0, MrPackets, NrCols>::run(traits, blA, blB, A, rhs_panel, T0, C);
+
+    blB += NrCols * RhsProg;
+    blA += MrPackets * LhsProg;
   }
-};
 
-template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
-          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
-          typename LinearMapper, typename DataMapper>
-struct lhs_process_fraction_of_packet
-    : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
-                             RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> {
-  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
-                                             LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
-                                             AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3) {
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
-    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
-    traits.loadLhsUnaligned(&blA[(0 + 1 * K) * (LhsProgress)], *A0);
-    traits.broadcastRhs(&blB[(0 + 4 * K) * RhsProgress], *B_0, *B1, *B2, *B3);
-    traits.madd(*A0, *B_0, *C0, *B_0);
-    traits.madd(*A0, *B1, *C1, *B1);
-    traits.madd(*A0, *B2, *C2, *B2);
-    traits.madd(*A0, *B3, *C3, *B3);
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  // ---- Store results: C[j + p * NrCols] -> res(i + p*ResPacketSz, j2 + j) ----
+  alignas(ResPacketLocal) ResPacketLocal alphav = pset1<ResPacketLocal>(alpha);
+  for (int j = 0; j < NrCols; ++j) {
+    LinearMapper_ r = res.getLinearMapper(i, j2 + j);
+    for (int p = 0; p < MrPackets; ++p) {
+      alignas(ResPacketLocal) ResPacketLocal R = r.template loadPacket<ResPacketLocal>(p * ResPacketSz);
+      traits.acc(C[j + p * NrCols], alphav, R);
+      r.storePacket(p * ResPacketSz, R);
+    }
   }
-};
+}
 
 template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
           bool ConjugateLhs, bool ConjugateRhs>
@@ -1426,7 +1383,7 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
                                    ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,
                                                              const RhsScalar* blockB, Index rows, Index depth,
                                                              Index cols, ResScalar alpha, Index strideA, Index strideB,
-                                                             Index offsetA, Index offsetB) {
+                                                             Index offsetA, Index offsetB) const {
   Traits traits;
   SwappedTraits straits;
 
@@ -1449,885 +1406,168 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
   enum { pk = 8 };  // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
   const Index peeled_kc = depth & ~(pk - 1);
   const int prefetch_res_offset = 32 / sizeof(ResScalar);
-  //     const Index depth2     = depth & ~1;
+
+  // Helper to invoke gebp_micro_panel_impl with the right types.
+  // The always_inline attribute is critical: without it GCC outlines each
+  // template instantiation of this generic lambda as a separate function,
+  // adding call overhead that causes 10-17 % regressions in LLT/TRSM
+  // for small-to-medium matrix sizes.
+  auto micro_panel = [&](auto mrp_tag, auto nrc_tag, auto& local_traits, Index i, Index j2) EIGEN_LAMBDA_ALWAYS_INLINE {
+    constexpr int MrP = decltype(mrp_tag)::value;
+    constexpr int NrC = decltype(nrc_tag)::value;
+    using LTraits = std::remove_reference_t<decltype(local_traits)>;
+    gebp_micro_panel_impl<MrP, NrC, LTraits, LhsScalar, RhsScalar, ResScalar, Index, DataMapper, LinearMapper,
+                          LhsPacket>(local_traits, res, blockA, blockB, alpha, i, j2, depth, strideA, strideB, offsetA,
+                                     offsetB, prefetch_res_offset, peeled_kc, pk);
+  };
+
+  // Budget (in bytes) for co-residency of the RHS block and a strip of the
+  // LHS panel. On most architectures this is L1: the LHS streams sequentially
+  // through L1 and we only need room for one micro-panel strip at a time.
+  // Sub-blocking trades cache misses for extra passes over the RHS columns,
+  // which hurts IPC and loop overhead. On modern x86, L1→L2 traffic is cheap
+  // (~5 cycles) and hardware prefetchers absorb the LHS misses, so we use
+  // half of L2 instead — effectively disabling sub-blocking when the LHS
+  // panel already fits in the remaining budget.
+  Index lhs_budget;
+  {
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
+#if EIGEN_ARCH_i386_OR_x86_64
+    lhs_budget = static_cast<Index>(l2 / 2);
+#else
+    lhs_budget = static_cast<Index>(l1);
+#endif
+  }
 
   //---------- Process 3 * LhsProgress rows at once ----------
-  // This corresponds to 3*LhsProgress x nr register blocks.
-  // Usually, make sense only with FMA
-  if (mr >= 3 * Traits::LhsProgress) {
-    // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x
-    // depth) and on each largest micro vertical panel of the rhs (depth * nr). Blocking sizes, i.e., 'depth' has been
-    // computed so that the micro horizontal panel of the lhs fit in L1. However, if depth is too small, we can extend
-    // the number of rows of these horizontal panels. This actual number of rows is computed as follow:
-    const Index l1 = defaultL1CacheSize;  // in Bytes, TODO, l1 should be passed to this function.
-    // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
-    // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
-    // guess), or because we are testing specific blocking sizes.
-    const Index actual_panel_rows =
-        (3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
-                                                (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
+  EIGEN_IF_CONSTEXPR(mr >= 3 * Traits::LhsProgress) {
+    const Index rhs_block = sizeof(ResScalar) * mr * nr + depth * nr * sizeof(RhsScalar);
+    const Index lhs_strip = depth * sizeof(LhsScalar) * 3 * LhsProgress;
+    const Index lhs_avail = (lhs_budget > rhs_block) ? (lhs_budget - rhs_block) : 0;
+    const Index actual_panel_rows = (lhs_avail >= peeled_mc3 * depth * static_cast<Index>(sizeof(LhsScalar)))
+                                        ? peeled_mc3
+                                        : (3 * LhsProgress) * std::max<Index>(1, lhs_avail / lhs_strip);
     for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
       const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
-#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
       EIGEN_IF_CONSTEXPR(nr >= 8) {
         for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
           for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
-            const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
-            prefetch(&blA[0]);
-            // gets res block as register
-            AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
-                C21, C22, C23;
-            traits.initAcc(C0);
-            traits.initAcc(C1);
-            traits.initAcc(C2);
-            traits.initAcc(C3);
-            traits.initAcc(C4);
-            traits.initAcc(C5);
-            traits.initAcc(C6);
-            traits.initAcc(C7);
-            traits.initAcc(C8);
-            traits.initAcc(C9);
-            traits.initAcc(C10);
-            traits.initAcc(C11);
-            traits.initAcc(C12);
-            traits.initAcc(C13);
-            traits.initAcc(C14);
-            traits.initAcc(C15);
-            traits.initAcc(C16);
-            traits.initAcc(C17);
-            traits.initAcc(C18);
-            traits.initAcc(C19);
-            traits.initAcc(C20);
-            traits.initAcc(C21);
-            traits.initAcc(C22);
-            traits.initAcc(C23);
-
-            LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-            LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-            LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-            LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-            LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
-            LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
-            LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
-            LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
-
-            r0.prefetch(0);
-            r1.prefetch(0);
-            r2.prefetch(0);
-            r3.prefetch(0);
-            r4.prefetch(0);
-            r5.prefetch(0);
-            r6.prefetch(0);
-            r7.prefetch(0);
-
-            // performs "inner" products
-            const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
-            prefetch(&blB[0]);
-            LhsPacket A0, A1;
-            for (Index k = 0; k < peeled_kc; k += pk) {
-              EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
-              // 27 registers are taken (24 for acc, 3 for lhs).
-              RhsPanel27 rhs_panel;
-              RhsPacket T0;
-              LhsPacket A2;
-#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
-// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
-// without this workaround A0, A1, and A2 are loaded in the same register,
-// which is not good for pipelining
-#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
-#else
-#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
-#endif
-
-#define EIGEN_GEBP_ONESTEP(K)                                                                                     \
-  do {                                                                                                            \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8");                                                    \
-    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                                                          \
-    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                                                          \
-    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                                                          \
-    EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
-    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                                                                   \
-    traits.madd(A1, rhs_panel, C8, T0, fix<0>);                                                                   \
-    traits.madd(A2, rhs_panel, C16, T0, fix<0>);                                                                  \
-    traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
-    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                                                                   \
-    traits.madd(A1, rhs_panel, C9, T0, fix<1>);                                                                   \
-    traits.madd(A2, rhs_panel, C17, T0, fix<1>);                                                                  \
-    traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
-    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                                                                   \
-    traits.madd(A1, rhs_panel, C10, T0, fix<2>);                                                                  \
-    traits.madd(A2, rhs_panel, C18, T0, fix<2>);                                                                  \
-    traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
-    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                                                                   \
-    traits.madd(A1, rhs_panel, C11, T0, fix<3>);                                                                  \
-    traits.madd(A2, rhs_panel, C19, T0, fix<3>);                                                                  \
-    traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel);                                           \
-    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                                                                   \
-    traits.madd(A1, rhs_panel, C12, T0, fix<0>);                                                                  \
-    traits.madd(A2, rhs_panel, C20, T0, fix<0>);                                                                  \
-    traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
-    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                                                                   \
-    traits.madd(A1, rhs_panel, C13, T0, fix<1>);                                                                  \
-    traits.madd(A2, rhs_panel, C21, T0, fix<1>);                                                                  \
-    traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
-    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                                                                   \
-    traits.madd(A1, rhs_panel, C14, T0, fix<2>);                                                                  \
-    traits.madd(A2, rhs_panel, C22, T0, fix<2>);                                                                  \
-    traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
-    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                                                                   \
-    traits.madd(A1, rhs_panel, C15, T0, fix<3>);                                                                  \
-    traits.madd(A2, rhs_panel, C23, T0, fix<3>);                                                                  \
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8");                                                      \
-  } while (false)
-
-              EIGEN_GEBP_ONESTEP(0);
-              EIGEN_GEBP_ONESTEP(1);
-              EIGEN_GEBP_ONESTEP(2);
-              EIGEN_GEBP_ONESTEP(3);
-              EIGEN_GEBP_ONESTEP(4);
-              EIGEN_GEBP_ONESTEP(5);
-              EIGEN_GEBP_ONESTEP(6);
-              EIGEN_GEBP_ONESTEP(7);
-
-              blB += pk * 8 * RhsProgress;
-              blA += pk * 3 * Traits::LhsProgress;
-              EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
-            }
-
-            // process remaining peeled loop
-            for (Index k = peeled_kc; k < depth; k++) {
-              RhsPanel27 rhs_panel;
-              RhsPacket T0;
-              LhsPacket A2;
-              EIGEN_GEBP_ONESTEP(0);
-              blB += 8 * RhsProgress;
-              blA += 3 * Traits::LhsProgress;
-            }
-
-#undef EIGEN_GEBP_ONESTEP
-
-            ResPacket R0, R1, R2;
-            ResPacket alphav = pset1<ResPacket>(alpha);
-
-            R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C0, alphav, R0);
-            traits.acc(C8, alphav, R1);
-            traits.acc(C16, alphav, R2);
-            r0.storePacket(0 * Traits::ResPacketSize, R0);
-            r0.storePacket(1 * Traits::ResPacketSize, R1);
-            r0.storePacket(2 * Traits::ResPacketSize, R2);
-
-            R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C1, alphav, R0);
-            traits.acc(C9, alphav, R1);
-            traits.acc(C17, alphav, R2);
-            r1.storePacket(0 * Traits::ResPacketSize, R0);
-            r1.storePacket(1 * Traits::ResPacketSize, R1);
-            r1.storePacket(2 * Traits::ResPacketSize, R2);
-
-            R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C2, alphav, R0);
-            traits.acc(C10, alphav, R1);
-            traits.acc(C18, alphav, R2);
-            r2.storePacket(0 * Traits::ResPacketSize, R0);
-            r2.storePacket(1 * Traits::ResPacketSize, R1);
-            r2.storePacket(2 * Traits::ResPacketSize, R2);
-
-            R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C3, alphav, R0);
-            traits.acc(C11, alphav, R1);
-            traits.acc(C19, alphav, R2);
-            r3.storePacket(0 * Traits::ResPacketSize, R0);
-            r3.storePacket(1 * Traits::ResPacketSize, R1);
-            r3.storePacket(2 * Traits::ResPacketSize, R2);
-
-            R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C4, alphav, R0);
-            traits.acc(C12, alphav, R1);
-            traits.acc(C20, alphav, R2);
-            r4.storePacket(0 * Traits::ResPacketSize, R0);
-            r4.storePacket(1 * Traits::ResPacketSize, R1);
-            r4.storePacket(2 * Traits::ResPacketSize, R2);
-
-            R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C5, alphav, R0);
-            traits.acc(C13, alphav, R1);
-            traits.acc(C21, alphav, R2);
-            r5.storePacket(0 * Traits::ResPacketSize, R0);
-            r5.storePacket(1 * Traits::ResPacketSize, R1);
-            r5.storePacket(2 * Traits::ResPacketSize, R2);
-
-            R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C6, alphav, R0);
-            traits.acc(C14, alphav, R1);
-            traits.acc(C22, alphav, R2);
-            r6.storePacket(0 * Traits::ResPacketSize, R0);
-            r6.storePacket(1 * Traits::ResPacketSize, R1);
-            r6.storePacket(2 * Traits::ResPacketSize, R2);
-
-            R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-            traits.acc(C7, alphav, R0);
-            traits.acc(C15, alphav, R1);
-            traits.acc(C23, alphav, R2);
-            r7.storePacket(0 * Traits::ResPacketSize, R0);
-            r7.storePacket(1 * Traits::ResPacketSize, R1);
-            r7.storePacket(2 * Traits::ResPacketSize, R2);
+            micro_panel(fix<3>, fix<8>, traits, i, j2);
           }
         }
       }
-#endif
       for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
         for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
-          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
-          // stored into 3 x nr registers.
-
-          const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
-          prefetch(&blA[0]);
-
-          // gets res block as register
-          AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
-          traits.initAcc(C0);
-          traits.initAcc(C1);
-          traits.initAcc(C2);
-          traits.initAcc(C3);
-          traits.initAcc(C4);
-          traits.initAcc(C5);
-          traits.initAcc(C6);
-          traits.initAcc(C7);
-          traits.initAcc(C8);
-          traits.initAcc(C9);
-          traits.initAcc(C10);
-          traits.initAcc(C11);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-
-          r0.prefetch(0);
-          r1.prefetch(0);
-          r2.prefetch(0);
-          r3.prefetch(0);
-
-          // performs "inner" products
-          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
-          prefetch(&blB[0]);
-          LhsPacket A0, A1;
-
-          for (Index k = 0; k < peeled_kc; k += pk) {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
-            // 15 registers are taken (12 for acc, 3 for lhs).
-            RhsPanel15 rhs_panel;
-            RhsPacket T0;
-            LhsPacket A2;
-#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
-// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
-// without this workaround A0, A1, and A2 are loaded in the same register,
-// which is not good for pipelining
-#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
-#else
-#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
-#endif
-#define EIGEN_GEBP_ONESTEP(K)                                             \
-  do {                                                                    \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");            \
-    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");   \
-    internal::prefetch(blA + (3 * K + 16) * LhsProgress);                 \
-    if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                              \
-      internal::prefetch(blB + (4 * K + 16) * RhsProgress);               \
-    } /* Bug 953 */                                                       \
-    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                  \
-    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                  \
-    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                  \
-    EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND                             \
-    traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel);   \
-    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                           \
-    traits.madd(A1, rhs_panel, C4, T0, fix<0>);                           \
-    traits.madd(A2, rhs_panel, C8, T0, fix<0>);                           \
-    traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
-    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                           \
-    traits.madd(A1, rhs_panel, C5, T0, fix<1>);                           \
-    traits.madd(A2, rhs_panel, C9, T0, fix<1>);                           \
-    traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
-    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                           \
-    traits.madd(A1, rhs_panel, C6, T0, fix<2>);                           \
-    traits.madd(A2, rhs_panel, C10, T0, fix<2>);                          \
-    traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
-    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                           \
-    traits.madd(A1, rhs_panel, C7, T0, fix<3>);                           \
-    traits.madd(A2, rhs_panel, C11, T0, fix<3>);                          \
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");              \
-  } while (false)
-
-            internal::prefetch(blB);
-            EIGEN_GEBP_ONESTEP(0);
-            EIGEN_GEBP_ONESTEP(1);
-            EIGEN_GEBP_ONESTEP(2);
-            EIGEN_GEBP_ONESTEP(3);
-            EIGEN_GEBP_ONESTEP(4);
-            EIGEN_GEBP_ONESTEP(5);
-            EIGEN_GEBP_ONESTEP(6);
-            EIGEN_GEBP_ONESTEP(7);
-
-            blB += pk * 4 * RhsProgress;
-            blA += pk * 3 * Traits::LhsProgress;
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
-          }
-          // process remaining peeled loop
-          for (Index k = peeled_kc; k < depth; k++) {
-            RhsPanel15 rhs_panel;
-            RhsPacket T0;
-            LhsPacket A2;
-            EIGEN_GEBP_ONESTEP(0);
-            blB += 4 * RhsProgress;
-            blA += 3 * Traits::LhsProgress;
-          }
-
-#undef EIGEN_GEBP_ONESTEP
-
-          ResPacket R0, R1, R2;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          traits.acc(C4, alphav, R1);
-          traits.acc(C8, alphav, R2);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-          r0.storePacket(1 * Traits::ResPacketSize, R1);
-          r0.storePacket(2 * Traits::ResPacketSize, R2);
-
-          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-          traits.acc(C1, alphav, R0);
-          traits.acc(C5, alphav, R1);
-          traits.acc(C9, alphav, R2);
-          r1.storePacket(0 * Traits::ResPacketSize, R0);
-          r1.storePacket(1 * Traits::ResPacketSize, R1);
-          r1.storePacket(2 * Traits::ResPacketSize, R2);
-
-          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-          traits.acc(C2, alphav, R0);
-          traits.acc(C6, alphav, R1);
-          traits.acc(C10, alphav, R2);
-          r2.storePacket(0 * Traits::ResPacketSize, R0);
-          r2.storePacket(1 * Traits::ResPacketSize, R1);
-          r2.storePacket(2 * Traits::ResPacketSize, R2);
-
-          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-          traits.acc(C3, alphav, R0);
-          traits.acc(C7, alphav, R1);
-          traits.acc(C11, alphav, R2);
-          r3.storePacket(0 * Traits::ResPacketSize, R0);
-          r3.storePacket(1 * Traits::ResPacketSize, R1);
-          r3.storePacket(2 * Traits::ResPacketSize, R2);
+          micro_panel(fix<3>, fix<4>, traits, i, j2);
         }
       }
-
-      // Deal with remaining columns of the rhs
       for (Index j2 = packet_cols4; j2 < cols; j2++) {
         for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
-          // One column at a time
-          const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];
-          prefetch(&blA[0]);
-
-          // gets res block as register
-          AccPacket C0, C4, C8;
-          traits.initAcc(C0);
-          traits.initAcc(C4);
-          traits.initAcc(C8);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2);
-          r0.prefetch(0);
-
-          // performs "inner" products
-          const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
-          LhsPacket A0, A1, A2;
-
-          for (Index k = 0; k < peeled_kc; k += pk) {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
-            RhsPacket B_0;
-#define EIGEN_GEBGP_ONESTEP(K)                                          \
-  do {                                                                  \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");          \
-    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
-    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
-    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
-    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
-    traits.madd(A0, B_0, C0, B_0, fix<0>);                              \
-    traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
-    traits.madd(A2, B_0, C8, B_0, fix<0>);                              \
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");            \
-  } while (false)
-
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
-
-            blB += int(pk) * int(RhsProgress);
-            blA += int(pk) * 3 * int(Traits::LhsProgress);
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
-          }
-
-          // process remaining peeled loop
-          for (Index k = peeled_kc; k < depth; k++) {
-            RhsPacket B_0;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += RhsProgress;
-            blA += 3 * Traits::LhsProgress;
-          }
-#undef EIGEN_GEBGP_ONESTEP
-          ResPacket R0, R1, R2;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          traits.acc(C4, alphav, R1);
-          traits.acc(C8, alphav, R2);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-          r0.storePacket(1 * Traits::ResPacketSize, R1);
-          r0.storePacket(2 * Traits::ResPacketSize, R2);
+          micro_panel(fix<3>, fix<1>, traits, i, j2);
         }
       }
     }
   }
 
   //---------- Process 2 * LhsProgress rows at once ----------
-  if (mr >= 2 * Traits::LhsProgress) {
-    const Index l1 = defaultL1CacheSize;  // in Bytes, TODO, l1 should be passed to this function.
-    // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
-    // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
-    // guess), or because we are testing specific blocking sizes.
-    Index actual_panel_rows =
-        (2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
-                                                (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
-
+  EIGEN_IF_CONSTEXPR(mr >= 2 * Traits::LhsProgress) {
+    const Index rhs_block2 = sizeof(ResScalar) * mr * nr + depth * nr * sizeof(RhsScalar);
+    const Index lhs_strip2 = depth * sizeof(LhsScalar) * 2 * LhsProgress;
+    const Index lhs_avail2 = (lhs_budget > rhs_block2) ? (lhs_budget - rhs_block2) : 0;
+    const Index mc2_range = peeled_mc2 - peeled_mc3;
+    Index actual_panel_rows = (lhs_avail2 >= mc2_range * depth * static_cast<Index>(sizeof(LhsScalar)))
+                                  ? mc2_range
+                                  : (2 * LhsProgress) * std::max<Index>(1, lhs_avail2 / lhs_strip2);
     for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
       Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
-#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
       EIGEN_IF_CONSTEXPR(nr >= 8) {
         for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
           for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
-            const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
-            prefetch(&blA[0]);
-
-            AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
-            traits.initAcc(C0);
-            traits.initAcc(C1);
-            traits.initAcc(C2);
-            traits.initAcc(C3);
-            traits.initAcc(C4);
-            traits.initAcc(C5);
-            traits.initAcc(C6);
-            traits.initAcc(C7);
-            traits.initAcc(C8);
-            traits.initAcc(C9);
-            traits.initAcc(C10);
-            traits.initAcc(C11);
-            traits.initAcc(C12);
-            traits.initAcc(C13);
-            traits.initAcc(C14);
-            traits.initAcc(C15);
-
-            LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-            LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-            LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-            LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-            LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
-            LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
-            LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
-            LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
-            r0.prefetch(prefetch_res_offset);
-            r1.prefetch(prefetch_res_offset);
-            r2.prefetch(prefetch_res_offset);
-            r3.prefetch(prefetch_res_offset);
-            r4.prefetch(prefetch_res_offset);
-            r5.prefetch(prefetch_res_offset);
-            r6.prefetch(prefetch_res_offset);
-            r7.prefetch(prefetch_res_offset);
-
-            const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
-            prefetch(&blB[0]);
-            LhsPacket A0, A1;
-            for (Index k = 0; k < peeled_kc; k += pk) {
-              RhsPacketx4 rhs_panel;
-              RhsPacket T0;
-// NOTE: the begin/end asm comments below work around bug 935!
-// but they are not enough for gcc>=6 without FMA (bug 1637)
-#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
-#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
-#else
-#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
-#endif
-#define EIGEN_GEBGP_ONESTEP(K)                                                                   \
-  do {                                                                                           \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8");                                   \
-    traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);                                         \
-    traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);                                         \
-    traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel);                                  \
-    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                                                  \
-    traits.madd(A1, rhs_panel, C8, T0, fix<0>);                                                  \
-    traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel);                                \
-    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                                                  \
-    traits.madd(A1, rhs_panel, C9, T0, fix<1>);                                                  \
-    traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel);                                \
-    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                                                  \
-    traits.madd(A1, rhs_panel, C10, T0, fix<2>);                                                 \
-    traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel);                                \
-    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                                                  \
-    traits.madd(A1, rhs_panel, C11, T0, fix<3>);                                                 \
-    traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel);                                  \
-    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                                                  \
-    traits.madd(A1, rhs_panel, C12, T0, fix<0>);                                                 \
-    traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel);                                \
-    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                                                  \
-    traits.madd(A1, rhs_panel, C13, T0, fix<1>);                                                 \
-    traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel);                                \
-    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                                                  \
-    traits.madd(A1, rhs_panel, C14, T0, fix<2>);                                                 \
-    traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel);                                \
-    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                                                  \
-    traits.madd(A1, rhs_panel, C15, T0, fix<3>);                                                 \
-    EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
-  } while (false)
-
-              EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
-
-              EIGEN_GEBGP_ONESTEP(0);
-              EIGEN_GEBGP_ONESTEP(1);
-              EIGEN_GEBGP_ONESTEP(2);
-              EIGEN_GEBGP_ONESTEP(3);
-              EIGEN_GEBGP_ONESTEP(4);
-              EIGEN_GEBGP_ONESTEP(5);
-              EIGEN_GEBGP_ONESTEP(6);
-              EIGEN_GEBGP_ONESTEP(7);
-
-              blB += pk * 8 * RhsProgress;
-              blA += pk * (2 * Traits::LhsProgress);
-
-              EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
-            }
-            // process remaining peeled loop
-            for (Index k = peeled_kc; k < depth; k++) {
-              RhsPacketx4 rhs_panel;
-              RhsPacket T0;
-              EIGEN_GEBGP_ONESTEP(0);
-              blB += 8 * RhsProgress;
-              blA += 2 * Traits::LhsProgress;
-            }
-
-#undef EIGEN_GEBGP_ONESTEP
-
-            ResPacket R0, R1, R2, R3;
-            ResPacket alphav = pset1<ResPacket>(alpha);
-
-            R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            traits.acc(C0, alphav, R0);
-            traits.acc(C8, alphav, R1);
-            traits.acc(C1, alphav, R2);
-            traits.acc(C9, alphav, R3);
-            r0.storePacket(0 * Traits::ResPacketSize, R0);
-            r0.storePacket(1 * Traits::ResPacketSize, R1);
-            r1.storePacket(0 * Traits::ResPacketSize, R2);
-            r1.storePacket(1 * Traits::ResPacketSize, R3);
-
-            R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            traits.acc(C2, alphav, R0);
-            traits.acc(C10, alphav, R1);
-            traits.acc(C3, alphav, R2);
-            traits.acc(C11, alphav, R3);
-            r2.storePacket(0 * Traits::ResPacketSize, R0);
-            r2.storePacket(1 * Traits::ResPacketSize, R1);
-            r3.storePacket(0 * Traits::ResPacketSize, R2);
-            r3.storePacket(1 * Traits::ResPacketSize, R3);
-
-            R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            traits.acc(C4, alphav, R0);
-            traits.acc(C12, alphav, R1);
-            traits.acc(C5, alphav, R2);
-            traits.acc(C13, alphav, R3);
-            r4.storePacket(0 * Traits::ResPacketSize, R0);
-            r4.storePacket(1 * Traits::ResPacketSize, R1);
-            r5.storePacket(0 * Traits::ResPacketSize, R2);
-            r5.storePacket(1 * Traits::ResPacketSize, R3);
-
-            R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-            R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-            traits.acc(C6, alphav, R0);
-            traits.acc(C14, alphav, R1);
-            traits.acc(C7, alphav, R2);
-            traits.acc(C15, alphav, R3);
-            r6.storePacket(0 * Traits::ResPacketSize, R0);
-            r6.storePacket(1 * Traits::ResPacketSize, R1);
-            r7.storePacket(0 * Traits::ResPacketSize, R2);
-            r7.storePacket(1 * Traits::ResPacketSize, R3);
+            micro_panel(fix<2>, fix<8>, traits, i, j2);
           }
         }
       }
-#endif
       for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
         for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
-          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
-          // stored into 2 x nr registers.
-
-          const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
-          prefetch(&blA[0]);
-
-          // gets res block as register
-          AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
-          traits.initAcc(C0);
-          traits.initAcc(C1);
-          traits.initAcc(C2);
-          traits.initAcc(C3);
-          traits.initAcc(C4);
-          traits.initAcc(C5);
-          traits.initAcc(C6);
-          traits.initAcc(C7);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-
-          r0.prefetch(prefetch_res_offset);
-          r1.prefetch(prefetch_res_offset);
-          r2.prefetch(prefetch_res_offset);
-          r3.prefetch(prefetch_res_offset);
-
-          // performs "inner" products
-          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
-          prefetch(&blB[0]);
-          LhsPacket A0, A1;
-
-          for (Index k = 0; k < peeled_kc; k += pk) {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
-            RhsPacketx4 rhs_panel;
-            RhsPacket T0;
-
-// NOTE: the begin/end asm comments below work around bug 935!
-// but they are not enough for gcc>=6 without FMA (bug 1637)
-#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
-#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
-#else
-#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
-#endif
-#define EIGEN_GEBGP_ONESTEP(K)                                  \
-  do {                                                          \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");  \
-    traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);        \
-    traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);        \
-    traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
-    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                 \
-    traits.madd(A1, rhs_panel, C4, T0, fix<0>);                 \
-    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                 \
-    traits.madd(A1, rhs_panel, C5, T0, fix<1>);                 \
-    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                 \
-    traits.madd(A1, rhs_panel, C6, T0, fix<2>);                 \
-    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                 \
-    traits.madd(A1, rhs_panel, C7, T0, fix<3>);                 \
-    EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                         \
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");    \
-  } while (false)
-
-            internal::prefetch(blB + (48 + 0));
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            internal::prefetch(blB + (48 + 16));
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
-
-            blB += pk * 4 * RhsProgress;
-            blA += pk * (2 * Traits::LhsProgress);
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
-          }
-          // process remaining peeled loop
-          for (Index k = peeled_kc; k < depth; k++) {
-            RhsPacketx4 rhs_panel;
-            RhsPacket T0;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += 4 * RhsProgress;
-            blA += 2 * Traits::LhsProgress;
-          }
-#undef EIGEN_GEBGP_ONESTEP
-
-          ResPacket R0, R1, R2, R3;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          traits.acc(C4, alphav, R1);
-          traits.acc(C1, alphav, R2);
-          traits.acc(C5, alphav, R3);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-          r0.storePacket(1 * Traits::ResPacketSize, R1);
-          r1.storePacket(0 * Traits::ResPacketSize, R2);
-          r1.storePacket(1 * Traits::ResPacketSize, R3);
-
-          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          traits.acc(C2, alphav, R0);
-          traits.acc(C6, alphav, R1);
-          traits.acc(C3, alphav, R2);
-          traits.acc(C7, alphav, R3);
-          r2.storePacket(0 * Traits::ResPacketSize, R0);
-          r2.storePacket(1 * Traits::ResPacketSize, R1);
-          r3.storePacket(0 * Traits::ResPacketSize, R2);
-          r3.storePacket(1 * Traits::ResPacketSize, R3);
+          micro_panel(fix<2>, fix<4>, traits, i, j2);
         }
       }
-
-      // Deal with remaining columns of the rhs
       for (Index j2 = packet_cols4; j2 < cols; j2++) {
         for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
-          // One column at a time
-          const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
-          prefetch(&blA[0]);
-
-          // gets res block as register
-          AccPacket C0, C4;
-          traits.initAcc(C0);
-          traits.initAcc(C4);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2);
-          r0.prefetch(prefetch_res_offset);
-
-          // performs "inner" products
-          const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
-          LhsPacket A0, A1;
-
-          for (Index k = 0; k < peeled_kc; k += pk) {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
-            RhsPacket B_0, B1;
-
-#define EIGEN_GEBGP_ONESTEP(K)                                          \
-  do {                                                                  \
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
-    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-    traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);                \
-    traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);                \
-    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
-    traits.madd(A0, B_0, C0, B1, fix<0>);                               \
-    traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
-  } while (false)
-
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
-
-            blB += int(pk) * int(RhsProgress);
-            blA += int(pk) * 2 * int(Traits::LhsProgress);
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
-          }
-
-          // process remaining peeled loop
-          for (Index k = peeled_kc; k < depth; k++) {
-            RhsPacket B_0, B1;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += RhsProgress;
-            blA += 2 * Traits::LhsProgress;
-          }
-#undef EIGEN_GEBGP_ONESTEP
-          ResPacket R0, R1;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          traits.acc(C4, alphav, R1);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          micro_panel(fix<2>, fix<1>, traits, i, j2);
         }
       }
     }
   }
+
   //---------- Process 1 * LhsProgress rows at once ----------
-  if (mr >= 1 * Traits::LhsProgress) {
-    lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
-                           RhsPacket, ResPacket, Traits, LinearMapper, DataMapper>
-        p;
-    p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
-      peeled_kc, pk, cols, depth, packet_cols4);
+  EIGEN_IF_CONSTEXPR(mr >= 1 * Traits::LhsProgress) {
+    for (Index i = peeled_mc2; i < peeled_mc1; i += LhsProgress) {
+      EIGEN_IF_CONSTEXPR(nr >= 8) {
+        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+          micro_panel(fix<1>, fix<8>, traits, i, j2);
+        }
+      }
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        micro_panel(fix<1>, fix<4>, traits, i, j2);
+      }
+      for (Index j2 = packet_cols4; j2 < cols; j2++) {
+        micro_panel(fix<1>, fix<1>, traits, i, j2);
+      }
+    }
   }
+
   //---------- Process LhsProgressHalf rows at once ----------
-  if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
-    lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf,
-                                   LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper>
-        p;
-    p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
-      peeled_kc, pk, cols, depth, packet_cols4);
+  EIGEN_IF_CONSTEXPR((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
+    HalfTraits half_traits;
+    for (Index i = peeled_mc1; i < peeled_mc_half; i += LhsProgressHalf) {
+      EIGEN_IF_CONSTEXPR(nr >= 8) {
+        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+          gebp_micro_panel_impl<1, 8, HalfTraits, LhsScalar, RhsScalar, ResScalar, Index, DataMapper, LinearMapper,
+                                LhsPacket>(half_traits, res, blockA, blockB, alpha, i, j2, depth, strideA, strideB,
+                                           offsetA, offsetB, prefetch_res_offset, peeled_kc, pk);
+        }
+      }
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        gebp_micro_panel_impl<1, 4, HalfTraits, LhsScalar, RhsScalar, ResScalar, Index, DataMapper, LinearMapper,
+                              LhsPacket>(half_traits, res, blockA, blockB, alpha, i, j2, depth, strideA, strideB,
+                                         offsetA, offsetB, prefetch_res_offset, peeled_kc, pk);
+      }
+      for (Index j2 = packet_cols4; j2 < cols; j2++) {
+        gebp_micro_panel_impl<1, 1, HalfTraits, LhsScalar, RhsScalar, ResScalar, Index, DataMapper, LinearMapper,
+                              LhsPacket>(half_traits, res, blockA, blockB, alpha, i, j2, depth, strideA, strideB,
+                                         offsetA, offsetB, prefetch_res_offset, peeled_kc, pk);
+      }
+    }
   }
+
   //---------- Process LhsProgressQuarter rows at once ----------
-  if ((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
-    lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar,
-                                   AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter,
-                                   QuarterTraits, LinearMapper, DataMapper>
-        p;
-    p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
-      prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+  EIGEN_IF_CONSTEXPR((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
+    QuarterTraits quarter_traits;
+    for (Index i = peeled_mc_half; i < peeled_mc_quarter; i += LhsProgressQuarter) {
+      EIGEN_IF_CONSTEXPR(nr >= 8) {
+        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+          gebp_micro_panel_impl<1, 8, QuarterTraits, LhsScalar, RhsScalar, ResScalar, Index, DataMapper, LinearMapper,
+                                LhsPacket>(quarter_traits, res, blockA, blockB, alpha, i, j2, depth, strideA, strideB,
+                                           offsetA, offsetB, prefetch_res_offset, peeled_kc, pk);
+        }
+      }
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        gebp_micro_panel_impl<1, 4, QuarterTraits, LhsScalar, RhsScalar, ResScalar, Index, DataMapper, LinearMapper,
+                              LhsPacket>(quarter_traits, res, blockA, blockB, alpha, i, j2, depth, strideA, strideB,
+                                         offsetA, offsetB, prefetch_res_offset, peeled_kc, pk);
+      }
+      for (Index j2 = packet_cols4; j2 < cols; j2++) {
+        gebp_micro_panel_impl<1, 1, QuarterTraits, LhsScalar, RhsScalar, ResScalar, Index, DataMapper, LinearMapper,
+                              LhsPacket>(quarter_traits, res, blockA, blockB, alpha, i, j2, depth, strideA, strideB,
+                                         offsetA, offsetB, prefetch_res_offset, peeled_kc, pk);
+      }
+    }
   }
+
   //---------- Process remaining rows, 1 at once ----------
   if (peeled_mc_quarter < rows) {
-#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
     EIGEN_IF_CONSTEXPR(nr >= 8) {
       // loop on each panel of the rhs
       for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
@@ -2379,7 +1619,6 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
         }
       }
     }
-#endif
 
     for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
       // loop on each row of the lhs (1*LhsProgress x depth)
@@ -2473,11 +1712,11 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
               SRhsPacketHalf b0;
               straits.loadLhsUnaligned(blB, a0);
               straits.loadRhs(blA, b0);
-              SAccPacketHalf c0 = predux_half_dowto4(C0);
+              SAccPacketHalf c0 = predux_half(C0);
               straits.madd(a0, b0, c0, b0, fix<0>);
               straits.acc(c0, alphav, R);
             } else {
-              straits.acc(predux_half_dowto4(C0), alphav, R);
+              straits.acc(predux_half(C0), alphav, R);
             }
             res.scatterPacket(i, j2, R);
           } else if (SwappedTraits::LhsProgress == 16) {
@@ -2562,14 +1801,14 @@ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int P
 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
-                                    Index offset = 0);
+                                    Index offset = 0) const;
 };
 
 template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
           bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
                                      PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
-                                                            Index rows, Index stride, Index offset) {
+                                                            Index rows, Index stride, Index offset) const {
   typedef typename unpacket_traits<Packet>::half HalfPacket;
   typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
   enum {
@@ -2584,7 +1823,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
   EIGEN_UNUSED_VARIABLE(stride);
   EIGEN_UNUSED_VARIABLE(offset);
   eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
-  eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4));
+  eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4) || (Pack1 < PacketSize));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   Index count = 0;
 
@@ -2688,14 +1927,23 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
   // address both real & imaginary parts on the rhs. This portion will
   // pack those half ones until they match the number expected on the
   // last peeling loop at this point (for the rhs).
+  //
+  // When there are no half/quarter packet types (HasHalf and HasQuarter
+  // are both false), last_lhs_progress can exceed Pack2, producing
+  // interleaved groups that the GEBP micro-kernel cannot consume.  In
+  // that case we use exactly Pack2 rows per group so the kernel's main
+  // loop (which reads Pack2 = LhsProgress values via ploaddup) can
+  // handle them; remaining rows fall through to the scalar loop below.
   if (Pack2 < PacketSize && Pack2 > 1) {
-    for (; i < peeled_mc0; i += last_lhs_progress) {
-      if (PanelMode) count += last_lhs_progress * offset;
+    const Index pack2_progress = (HasHalf || HasQuarter) ? last_lhs_progress : Pack2;
+    const Index peeled = (HasHalf || HasQuarter) ? peeled_mc0 : (rows / Pack2) * Pack2;
+    for (; i < peeled; i += pack2_progress) {
+      if (PanelMode) count += pack2_progress * offset;
 
       for (Index k = 0; k < depth; k++)
-        for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k));
+        for (Index w = 0; w < pack2_progress; w++) blockA[count++] = cj(lhs(i + w, k));
 
-      if (PanelMode) count += last_lhs_progress * (stride - offset - depth);
+      if (PanelMode) count += pack2_progress * (stride - offset - depth);
     }
   }
   // Pack scalars
@@ -2711,14 +1959,14 @@ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int P
 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
-                                    Index offset = 0);
+                                    Index offset = 0) const;
 };
 
 template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
           bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
                                      PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
-                                                            Index rows, Index stride, Index offset) {
+                                                            Index rows, Index stride, Index offset) const {
   typedef typename unpacket_traits<Packet>::half HalfPacket;
   typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
   enum {
@@ -2810,9 +2058,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
       // address both real & imaginary parts on the rhs. This portion will
       // pack those half ones until they match the number expected on the
       // last peeling loop at this point (for the rhs).
+      //
+      // When there are no half/quarter packet types (HasHalf and HasQuarter
+      // are both false), last_lhs_progress can exceed Pack2, producing
+      // interleaved groups that the GEBP micro-kernel cannot consume.  In
+      // that case we use exactly Pack2 rows per group so the kernel's main
+      // loop (which reads Pack2 = LhsProgress values via ploaddup) can
+      // handle them; remaining rows fall through to the scalar loop below.
       if (Pack2 < PacketSize && !gone_last) {
         gone_last = true;
-        psize = pack = left & ~1;
+        psize = pack = (HasHalf || HasQuarter) ? (left & ~1) : Pack2;
       }
     }
   }
@@ -2837,12 +2092,12 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMo
   typedef typename DataMapper::LinearMapper LinearMapper;
   enum { PacketSize = packet_traits<Scalar>::size };
   EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0);
+                                    Index offset = 0) const;
 };
 
 template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
-    Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+    Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) const {
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
   EIGEN_UNUSED_VARIABLE(stride);
   EIGEN_UNUSED_VARIABLE(offset);
@@ -2853,7 +2108,6 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
   Index count = 0;
   const Index peeled_k = (depth / PacketSize) * PacketSize;
 
-#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
   EIGEN_IF_CONSTEXPR(nr >= 8) {
     for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
       // skip what we have before
@@ -2959,7 +2213,6 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
       if (PanelMode) count += 8 * (stride - offset - depth);
     }
   }
-#endif
 
   EIGEN_IF_CONSTEXPR(nr >= 4) {
     for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
@@ -2971,19 +2224,37 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
       const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
 
       Index k = 0;
-      if ((PacketSize % 4) == 0)  // TODO enable vectorized transposition for PacketSize==2 ??
-      {
+      EIGEN_IF_CONSTEXPR((PacketSize % 4) == 0 || PacketSize == 2) {
         for (; k < peeled_k; k += PacketSize) {
-          PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
+          PacketBlock<Packet, 4> kernel;
           kernel.packet[0] = dm0.template loadPacket<Packet>(k);
-          kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
-          kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
-          kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
-          ptranspose(kernel);
+          kernel.packet[1] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3] = dm3.template loadPacket<Packet>(k);
+          EIGEN_IF_CONSTEXPR(PacketSize == 2) {
+            // For PacketSize==2 we cannot ptranspose 4 packets directly; compose two
+            // 2-packet transposes and re-interleave so the 4 stores produce the
+            // packed-rhs layout (each store writing one half-row of the panel).
+            PacketBlock<Packet, 2> tmp01;
+            tmp01.packet[0] = kernel.packet[0];
+            tmp01.packet[1] = kernel.packet[1];
+            ptranspose(tmp01);
+            PacketBlock<Packet, 2> tmp23;
+            tmp23.packet[0] = kernel.packet[2];
+            tmp23.packet[1] = kernel.packet[3];
+            ptranspose(tmp23);
+            kernel.packet[0] = tmp01.packet[0];
+            kernel.packet[1] = tmp23.packet[0];
+            kernel.packet[2] = tmp01.packet[1];
+            kernel.packet[3] = tmp23.packet[1];
+          }
+          else {
+            ptranspose(kernel);
+          }
           pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
-          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
-          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
-          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3]));
           count += 4 * PacketSize;
         }
       }
@@ -3024,7 +2295,7 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMo
     QuarterPacketSize = unpacket_traits<QuarterPacket>::size
   };
   EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) {
+                                    Index offset = 0) const {
     EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
     EIGEN_UNUSED_VARIABLE(stride);
     EIGEN_UNUSED_VARIABLE(offset);
@@ -3036,7 +2307,6 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMo
     Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
     Index count = 0;
 
-#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
     EIGEN_IF_CONSTEXPR(nr >= 8) {
       for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
         // skip what we have before
@@ -3069,7 +2339,6 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMo
         if (PanelMode) count += 8 * (stride - offset - depth);
       }
     }
-#endif
 
     if (nr >= 4) {
       for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
@@ -3150,4 +2419,8 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_
 
 }  // end namespace Eigen
 
+#if EIGEN_COMP_MSVC
+#pragma warning(pop)
+#endif
+
 #endif  // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ebfac0146be..915f1304095 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -52,6 +52,9 @@ struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, Conjugat
   static void run(Index rows, Index cols, Index depth, const LhsScalar* lhs_, Index lhsStride, const RhsScalar* rhs_,
                   Index rhsStride, ResScalar* res_, Index resIncr, Index resStride, ResScalar alpha,
                   level3_blocking<LhsScalar, RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) {
+    // BLAS contract: if alpha == 0, the result is unchanged (and lhs/rhs need not be read).
+    if (numext::is_exactly_zero(alpha)) return;
+
     typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
     typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
     typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
@@ -383,8 +386,8 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemmProduct>
     // to determine the following heuristic.
     // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
     // unless it has been specialized by the user or for a given architecture.
-    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
-    // I'm not sure it is still required.
+    // Note that the condition rhs.rows()>0 was required because lazy product did not handle empty inputs
+    // correctly. It is unclear whether this guard is still necessary.
     if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
       lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar, Scalar>());
     else {
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index bf275675726..5a8599ab7eb 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -87,7 +87,7 @@ struct general_matrix_matrix_triangular_product<Index, LhsScalar, LhsStorageOrde
 
     // !!! mc must be a multiple of nr
     if (mc > Traits::nr) {
-      using UnsignedIndex = typename make_unsigned<Index>::type;
+      using UnsignedIndex = std::make_unsigned_t<Index>;
       mc = (UnsignedIndex(mc) / Traits::nr) * Traits::nr;
     }
 
@@ -154,7 +154,7 @@ struct tribb_kernel {
 
   enum { BlockSize = meta_least_common_multiple<plain_enum_max(mr, nr), plain_enum_min(mr, nr)>::ret };
   void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB,
-                  Index size, Index depth, const ResScalar& alpha) {
+                  Index size, Index depth, const ResScalar& alpha) const {
     typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
     typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
     ResMapper res(res_, resStride, resIncr);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 6817cc09f7c..e14aa61c6dd 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -67,7 +67,7 @@ struct general_matrix_matrix_rankupdate
 
 EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
 EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
-// TODO handle complex cases
+// TODO: handle complex cases
 // EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
 // EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)
 
@@ -137,10 +137,12 @@ EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
 EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
 #endif
 
-// TODO handle complex cases
+// TODO: handle complex cases
 // EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
 // EIGEN_BLAS_RANKUPDATE_C(scomplex, float,  float, cherk_)
 
+#undef EIGEN_BLAS_RANKUPDATE_SPECIALIZE
+#undef EIGEN_BLAS_RANKUPDATE_R
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index 913beb69680..11c29b604cc 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -49,7 +49,7 @@ namespace internal {
 
 // gemm specialization
 
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC)                                                 \
+#define EIGEN_BLAS_GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC)                                      \
   template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>         \
   struct general_matrix_matrix_product<Index, EIGTYPE, LhsStorageOrder, ConjugateLhs, EIGTYPE, RhsStorageOrder,     \
                                        ConjugateRhs, ColMajor, 1> {                                                 \
@@ -105,15 +105,15 @@ namespace internal {
   };
 
 #ifdef EIGEN_USE_MKL
-GEMM_SPECIALIZATION(double, d, double, dgemm)
-GEMM_SPECIALIZATION(float, f, float, sgemm)
-GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
-GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
+EIGEN_BLAS_GEMM_SPECIALIZATION(double, d, double, dgemm)
+EIGEN_BLAS_GEMM_SPECIALIZATION(float, f, float, sgemm)
+EIGEN_BLAS_GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+EIGEN_BLAS_GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
 #else
-GEMM_SPECIALIZATION(double, d, double, dgemm_)
-GEMM_SPECIALIZATION(float, f, float, sgemm_)
-GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
-GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
+EIGEN_BLAS_GEMM_SPECIALIZATION(double, d, double, dgemm_)
+EIGEN_BLAS_GEMM_SPECIALIZATION(float, f, float, sgemm_)
+EIGEN_BLAS_GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+EIGEN_BLAS_GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
 #endif
 
 // If OpenBLAS with BUILD_BFLOAT16=1 support is available,
@@ -198,6 +198,7 @@ struct general_matrix_matrix_product<Index, Eigen::bfloat16, LhsStorageOrder, Co
 
 #endif  // EIGEN_USE_OPENBLAS_SBGEMM
 
+#undef EIGEN_BLAS_GEMM_SPECIALIZATION
 }  // namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index ba72a8a4fbe..0fc9a747ef9 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -13,6 +13,13 @@
 // IWYU pragma: private
 #include "../InternalHeaderCheck.h"
 
+// C4804: unsafe use of type 'bool' in operation. Unavoidable in generic code
+// instantiated with bool scalars (e.g. += and * on bool).
+#if EIGEN_COMP_MSVC
+#pragma warning(push)
+#pragma warning(disable : 4804)
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -97,20 +104,95 @@ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, Conj
   typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
   typedef typename QuarterTraits::ResPacket ResPacketQuarter;
 
-  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
-                                                      const RhsMapper& rhs, ResScalar* res, Index resIncr,
-                                                      RhsScalar alpha);
+  EIGEN_DEVICE_FUNC inline static void run(Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs,
+                                           ResScalar* res, Index resIncr, RhsScalar alpha);
+
+  template <int N>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE void process_rows(
+      Index i, Index j2, Index jend, const LhsMapper& lhs, const RhsMapper& rhs, ResScalar* res,
+      const ResPacket& palpha, conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>& pcj);
+};
+
+// Compile-time recursive unroller for the col-major GEMV full-packet row blocks.
+// Level K first recurses to level K - 1, then handles accumulator c[K] at offset Stride * K;
+// the <0, N> specialization terminates the recursion.
+template <int K, int N>
+struct gemv_colmajor_unroller {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void init_zero(Packet* c) {
+    gemv_colmajor_unroller<K - 1, N>::init_zero(c);  // zero c[0..K-1] first
+    c[K] = pzero(Packet{});                          // then zero c[K]
+  }
+
+  template <typename LhsPacket, int LhsStride, int Alignment, typename AccPacket, typename RhsPacket,
+            typename ConjHelper, typename LhsMapper, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void madd(AccPacket* c, const LhsMapper& lhs, Index i, Index j,
+                                                         const RhsPacket& b0, ConjHelper& pcj) {
+    gemv_colmajor_unroller<K - 1, N>::template madd<LhsPacket, LhsStride, Alignment>(c, lhs, i, j, b0, pcj);
+    c[K] = pcj.pmadd(lhs.template load<LhsPacket, Alignment>(i + LhsStride * K, j), b0, c[K]);
+  }
+
+  template <typename ResPacket, int ResStride, typename ResScalar>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void store(const ResPacket* c, ResScalar* res, Index i,
+                                                          const ResPacket& palpha) {
+    gemv_colmajor_unroller<K - 1, N>::template store<ResPacket, ResStride>(c, res, i, palpha);
+    pstoreu(res + i + ResStride * K, pmadd(c[K], palpha, ploadu<ResPacket>(res + i + ResStride * K)));
+  }
+};
+
+template <int N>
+struct gemv_colmajor_unroller<0, N> {  // K == 0: end of the recursion, only c[0] is touched
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void init_zero(Packet* c) {
+    c[0] = pzero(Packet{});  // zero the first accumulator
+  }
+
+  template <typename LhsPacket, int LhsStride, int Alignment, typename AccPacket, typename RhsPacket,
+            typename ConjHelper, typename LhsMapper, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void madd(AccPacket* c, const LhsMapper& lhs, Index i, Index j,
+                                                         const RhsPacket& b0, ConjHelper& pcj) {
+    c[0] = pcj.pmadd(lhs.template load<LhsPacket, Alignment>(i, j), b0, c[0]);  // packet at row i, column j
+  }
+
+  template <typename ResPacket, int ResStride, typename ResScalar>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void store(const ResPacket* c, ResScalar* res, Index i,
+                                                          const ResPacket& palpha) {
+    pstoreu(res + i, pmadd(c[0], palpha, ploadu<ResPacket>(res + i)));  // read-modify-write of res at i
+  }
 };
 
 template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
           typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void general_matrix_vector_product<
+    Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
+    Version>::process_rows(Index i, Index j2, Index jend, const LhsMapper& lhs, const RhsMapper& rhs, ResScalar* res,
+                           const ResPacket& palpha,
+                           conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>& pcj) {
+  enum { LhsAlignment = Unaligned, LhsPacketSize = Traits::LhsPacketSize, ResPacketSize = Traits::ResPacketSize };
+  using Unroller = gemv_colmajor_unroller<N - 1, N>;
+
+  ResPacket c[N];
+  Unroller::init_zero(c);
+  for (Index j = j2; j < jend; ++j) {
+    RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+    Unroller::template madd<LhsPacket, LhsPacketSize, LhsAlignment>(c, lhs, i, j, b0, pcj);
+  }
+  Unroller::template store<ResPacket, ResPacketSize>(c, res, i, palpha);
+}
+
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC inline void
 general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
                               Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
                                             ResScalar* res, Index resIncr, RhsScalar alpha) {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr == 1);
 
+  // BLAS contract: if alpha == 0, the result is unchanged (and lhs/rhs need not be read).
+  if (numext::is_exactly_zero(alpha)) return;
+
   // The following copy tells the compiler that lhs's attributes are not modified outside this function
   // This helps GCC to generate proper code.
   LhsMapper lhs(alhs);
@@ -121,7 +203,9 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLh
   conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
 
   const Index lhsStride = lhs.stride();
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template, and on
+  // modern x86 aligned/unaligned packet loads are equivalent anyway.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
@@ -140,8 +224,12 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLh
   const Index n_half = rows - 1 * ResPacketSizeHalf + 1;
   const Index n_quarter = rows - 1 * ResPacketSizeQuarter + 1;
 
-  // TODO: improve the following heuristic:
-  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 32000 ? 16 : 4);
+  // Use a single block when cols < 128. Otherwise batch 16 columns when one LHS column stride (in bytes)
+  // is below the L1 size, and only 4 columns when it is not, to keep cache pressure down.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  const Index block_cols =
+      cols < 128 ? cols : (lhsStride * Index(sizeof(LhsScalar)) < Index(l1) ? Index(16) : Index(4));
   ResPacket palpha = pset1<ResPacket>(alpha);
   ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
   ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
@@ -149,89 +237,25 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLh
   for (Index j2 = 0; j2 < cols; j2 += block_cols) {
     Index jend = numext::mini(j2 + block_cols, cols);
     Index i = 0;
-    for (; i < n8; i += ResPacketSize * 8) {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
-                c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
-                c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
-                c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
-
-      for (Index j = j2; j < jend; j += 1) {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
-        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
-        c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
-        c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 4, j), b0, c4);
-        c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 5, j), b0, c5);
-        c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 6, j), b0, c6);
-        c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 7, j), b0, c7);
-      }
-      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
-      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
-      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
-      pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
-      pstoreu(res + i + ResPacketSize * 4, pmadd(c4, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 4)));
-      pstoreu(res + i + ResPacketSize * 5, pmadd(c5, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 5)));
-      pstoreu(res + i + ResPacketSize * 6, pmadd(c6, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 6)));
-      pstoreu(res + i + ResPacketSize * 7, pmadd(c7, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 7)));
-    }
+    for (; i < n8; i += ResPacketSize * 8) process_rows<8>(i, j2, jend, lhs, rhs, res, palpha, pcj);
     if (i < n4) {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
-                c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
-
-      for (Index j = j2; j < jend; j += 1) {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
-        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
-        c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
-      }
-      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
-      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
-      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
-      pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
-
+      process_rows<4>(i, j2, jend, lhs, rhs, res, palpha, pcj);
       i += ResPacketSize * 4;
     }
     if (i < n3) {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
-                c2 = pset1<ResPacket>(ResScalar(0));
-
-      for (Index j = j2; j < jend; j += 1) {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
-        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
-      }
-      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
-      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
-      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
-
+      process_rows<3>(i, j2, jend, lhs, rhs, res, palpha, pcj);
       i += ResPacketSize * 3;
     }
     if (i < n2) {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
-
-      for (Index j = j2; j < jend; j += 1) {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
-      }
-      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
-      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      process_rows<2>(i, j2, jend, lhs, rhs, res, palpha, pcj);
       i += ResPacketSize * 2;
     }
     if (i < n1) {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
-      for (Index j = j2; j < jend; j += 1) {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
-      }
-      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      process_rows<1>(i, j2, jend, lhs, rhs, res, palpha, pcj);
       i += ResPacketSize;
     }
     if (HasHalf && i < n_half) {
-      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+      ResPacketHalf c0 = pzero(ResPacketHalf{});
       for (Index j = j2; j < jend; j += 1) {
         RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j, 0));
         c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i + 0, j), b0, c0);
@@ -241,7 +265,7 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLh
       i += ResPacketSizeHalf;
     }
     if (HasQuarter && i < n_quarter) {
-      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+      ResPacketQuarter c0 = pzero(ResPacketQuarter{});
       for (Index j = j2; j < jend; j += 1) {
         RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j, 0));
         c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i + 0, j), b0, c0);
@@ -290,17 +314,53 @@ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, Conj
   typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
   typedef typename QuarterTraits::ResPacket ResPacketQuarter;
 
-  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
-                                                      const RhsMapper& rhs, ResScalar* res, Index resIncr,
-                                                      ResScalar alpha);
+  EIGEN_DEVICE_FUNC static inline void run(Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs,
+                                           ResScalar* res, Index resIncr, ResScalar alpha);
+
+  // Specialized path for when cols < full packet size. Kept noinline to avoid
+  // bloating the main run() function and causing icache pressure.
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run_small_cols(Index rows, Index cols, const LhsMapper& lhs,
+                                                                 const RhsMapper& rhs, ResScalar* res, Index resIncr,
+                                                                 ResScalar alpha);
+
+  // Templated helper that processes N rows in run_small_cols. N is a compile-time
+  // constant; row-dimension unrolling is done via recursive templates to guarantee
+  // full unrolling regardless of compiler heuristics.
+  template <int N>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE void process_rows_small_cols(Index i, Index cols, const LhsMapper& lhs,
+                                                                            const RhsMapper& rhs, ResScalar* res,
+                                                                            Index resIncr, ResScalar alpha,
+                                                                            Index halfColBlockEnd,
+                                                                            Index quarterColBlockEnd);
 };
 
 template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
           typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
+EIGEN_DEVICE_FUNC inline void
 general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
                               Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
                                             ResScalar* res, Index resIncr, ResScalar alpha) {
+  // BLAS contract: if alpha == 0, the result is unchanged (and lhs/rhs need not be read).
+  if (numext::is_exactly_zero(alpha)) return;
+
+  // When cols < full packet size, the main vectorized loops are empty.
+  // Dispatch to a separate noinline function to avoid polluting the icache.
+  // Only dispatch when cols is large enough that half or quarter packets can be used;
+  // otherwise the helper would just do scalar work with extra function call overhead.
+  enum {
+    LhsPacketSize_ = Traits::LhsPacketSize,
+    MinUsefulCols_ =
+        ((int)QuarterTraits::LhsPacketSize < (int)HalfTraits::LhsPacketSize)
+            ? (int)QuarterTraits::LhsPacketSize
+            : (((int)HalfTraits::LhsPacketSize < (int)Traits::LhsPacketSize) ? (int)HalfTraits::LhsPacketSize
+                                                                             : (int)Traits::LhsPacketSize),
+    HasSubPackets_ = (int)MinUsefulCols_ < (int)LhsPacketSize_
+  };
+  if (HasSubPackets_ && cols >= MinUsefulCols_ && cols < LhsPacketSize_) {
+    run_small_cols(rows, cols, alhs, rhs, res, resIncr, alpha);
+    return;
+  }
+
   // The following copy tells the compiler that lhs's attributes are not modified outside this function
   // This helps GCC to generate proper code.
   LhsMapper lhs(alhs);
@@ -311,13 +371,17 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLh
   conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
   conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
 
-  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
-  //       processing 8 rows at once might be counter productive wrt cache.
-  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 0 : rows - 7;
+  // Disable the 8-row unroll once one LHS stride (lhs.stride() * sizeof(LhsScalar) bytes) exceeds the L1 size
+  // reported by manage_caching_sizes; with such large strides each unrolled iteration evicts earlier rows.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  const Index n8 = lhs.stride() * Index(sizeof(LhsScalar)) > Index(l1) ? 0 : rows - 7;
   const Index n4 = rows - 3;
   const Index n2 = rows - 1;
 
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template, and on
+  // modern x86 aligned/unaligned packet loads are equivalent anyway.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
@@ -330,17 +394,15 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLh
     HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
   };
 
-  using UnsignedIndex = typename make_unsigned<Index>::type;
+  using UnsignedIndex = std::make_unsigned_t<Index>;
   const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
   const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
   const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
 
   Index i = 0;
   for (; i < n8; i += 8) {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
-              c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
-              c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
-              c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
+    ResPacket c0 = pzero(ResPacket{}), c1 = pzero(ResPacket{}), c2 = pzero(ResPacket{}), c3 = pzero(ResPacket{}),
+              c4 = pzero(ResPacket{}), c5 = pzero(ResPacket{}), c6 = pzero(ResPacket{}), c7 = pzero(ResPacket{});
 
     for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
       RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
@@ -385,8 +447,7 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLh
     res[(i + 7) * resIncr] += alpha * cc7;
   }
   for (; i < n4; i += 4) {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
-              c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
+    ResPacket c0 = pzero(ResPacket{}), c1 = pzero(ResPacket{}), c2 = pzero(ResPacket{}), c3 = pzero(ResPacket{});
 
     for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
       RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
@@ -415,7 +476,7 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLh
     res[(i + 3) * resIncr] += alpha * cc3;
   }
   for (; i < n2; i += 2) {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
+    ResPacket c0 = pzero(ResPacket{}), c1 = pzero(ResPacket{});
 
     for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
       RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
@@ -436,9 +497,9 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLh
     res[(i + 1) * resIncr] += alpha * cc1;
   }
   for (; i < rows; ++i) {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
-    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
-    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
+    ResPacket c0 = pzero(ResPacket{});
+    ResPacketHalf c0_h = pzero(ResPacketHalf{});
+    ResPacketQuarter c0_q = pzero(ResPacketQuarter{});
 
     for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
       RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
@@ -466,8 +527,180 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLh
   }
 }
 
+// Compile-time recursive unroller used by process_rows_small_cols. Instantiated as <N - 1, N>: each level K
+// first recurses into <K - 1, N> and then handles row offset K, so offsets 0..K are processed in ascending
+// order without relying on compiler loop-unrolling heuristics. Accumulators are arrays indexed by the
+// compile-time constant K, which is intended to let each one live in its own register.
+template <int K, int N>
+struct gemv_small_cols_unroller {
+  template <typename LhsPacket, typename AccPacket, int Alignment, typename RhsType, typename ConjHelper,
+            typename LhsMapper, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void madd(AccPacket* acc, const LhsMapper& lhs, Index i, Index j,
+                                                         const RhsType& b0, ConjHelper& pcj) {
+    gemv_small_cols_unroller<K - 1, N>::template madd<LhsPacket, AccPacket, Alignment>(acc, lhs, i, j, b0, pcj);
+    acc[K] = pcj.pmadd(lhs.template load<LhsPacket, Alignment>(i + K, j), b0, acc[K]);  // row i + K -> acc[K]
+  }
+
+  template <typename ResScalar, typename RhsScalar, typename ConjHelper, typename LhsMapper, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void scalar_madd(ResScalar* cc, const LhsMapper& lhs, Index i, Index j,
+                                                                const RhsScalar& b0, ConjHelper& cj) {
+    gemv_small_cols_unroller<K - 1, N>::scalar_madd(cc, lhs, i, j, b0, cj);
+    cc[K] += cj.pmul(lhs(i + K, j), b0);  // single coefficient (i + K, j) times the scalar rhs value b0
+  }
+
+  template <typename Scalar, typename Packet>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void predux_accum(Scalar* cc, const Packet* acc) {
+    gemv_small_cols_unroller<K - 1, N>::predux_accum(cc, acc);
+    cc[K] += predux(acc[K]);  // fold the packet accumulator into the scalar accumulator for row offset K
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void init_zero(Packet* acc) {
+    gemv_small_cols_unroller<K - 1, N>::init_zero(acc);
+    acc[K] = pzero(Packet{});  // start the accumulator for row offset K at zero
+  }
+
+  template <typename Scalar, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void write_result(Scalar* res, Index resIncr, Index i, Scalar alpha,
+                                                                 const Scalar* cc) {
+    gemv_small_cols_unroller<K - 1, N>::write_result(res, resIncr, i, alpha, cc);
+    res[(i + K) * resIncr] += alpha * cc[K];  // res is addressed with stride resIncr; the update is additive
+  }
+};
+
+template <int N>
+struct gemv_small_cols_unroller<0, N> {  // Recursion base case (K == 0): handles row offset 0 and stops.
+  template <typename LhsPacket, typename AccPacket, int Alignment, typename RhsType, typename ConjHelper,
+            typename LhsMapper, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void madd(AccPacket* acc, const LhsMapper& lhs, Index i, Index j,
+                                                         const RhsType& b0, ConjHelper& pcj) {
+    acc[0] = pcj.pmadd(lhs.template load<LhsPacket, Alignment>(i, j), b0, acc[0]);  // row i -> acc[0]
+  }
+
+  template <typename ResScalar, typename RhsScalar, typename ConjHelper, typename LhsMapper, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void scalar_madd(ResScalar* cc, const LhsMapper& lhs, Index i, Index j,
+                                                                const RhsScalar& b0, ConjHelper& cj) {
+    cc[0] += cj.pmul(lhs(i, j), b0);  // single coefficient (i, j) times the scalar rhs value b0
+  }
+
+  template <typename Scalar, typename Packet>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void predux_accum(Scalar* cc, const Packet* acc) {
+    cc[0] += predux(acc[0]);  // fold the packet accumulator into the scalar accumulator for row offset 0
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void init_zero(Packet* acc) {
+    acc[0] = pzero(Packet{});  // start the accumulator for row offset 0 at zero
+  }
+
+  template <typename Scalar, typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void write_result(Scalar* res, Index resIncr, Index i, Scalar alpha,
+                                                                 const Scalar* cc) {
+    res[i * resIncr] += alpha * cc[0];  // res is addressed with stride resIncr; the update is additive
+  }
+};
+
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
+general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
+                              Version>::process_rows_small_cols(Index i, Index cols, const LhsMapper& lhs,
+                                                                const RhsMapper& rhs, ResScalar* res, Index resIncr,
+                                                                ResScalar alpha, Index halfColBlockEnd,
+                                                                Index quarterColBlockEnd) {
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;  // scalar helper for the tail columns
+  conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
+
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSizeHalf = HalfTraits::ResPacketSize,
+    ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+    LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+    LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+    HasHalf = (int)ResPacketSizeHalf < (int)Traits::ResPacketSize,  // half packets are narrower than full ones
+    HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf  // quarter packets are narrower than half
+  };
+
+  using Unroll = gemv_small_cols_unroller<N - 1, N>;  // compile-time unroll over row offsets 0..N-1
+
+  ResScalar cc[N] = {};  // one value-initialized accumulator per processed row (rows i .. i + N - 1)
+  if (HasHalf) {  // half-packet pass over columns [0, halfColBlockEnd)
+    ResPacketHalf h[N];
+    Unroll::init_zero(h);
+    for (Index j = 0; j < halfColBlockEnd; j += LhsPacketSizeHalf) {
+      RhsPacketHalf b0 = rhs.template load<RhsPacketHalf, Unaligned>(j, 0);  // one rhs load shared by all N rows
+      Unroll::template madd<LhsPacketHalf, ResPacketHalf, LhsAlignment>(h, lhs, i, j, b0, pcj_half);
+    }
+    Unroll::predux_accum(cc, h);
+  }
+  if (HasQuarter) {  // quarter-packet pass over columns [halfColBlockEnd, quarterColBlockEnd)
+    ResPacketQuarter q[N];
+    Unroll::init_zero(q);
+    for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter) {
+      RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter, Unaligned>(j, 0);
+      Unroll::template madd<LhsPacketQuarter, ResPacketQuarter, LhsAlignment>(q, lhs, i, j, b0, pcj_quarter);
+    }
+    Unroll::predux_accum(cc, q);
+  }
+  for (Index j = quarterColBlockEnd; j < cols; ++j) {  // scalar pass over the remaining columns
+    RhsScalar b0 = rhs(j, 0);
+    Unroll::scalar_madd(cc, lhs, i, j, b0, cj);
+  }
+  Unroll::write_result(res, resIncr, i, alpha, cc);  // res[(i + k) * resIncr] += alpha * cc[k], k = 0..N-1
+}
+
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
+general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
+                              Version>::run_small_cols(Index rows, Index cols, const LhsMapper& alhs,
+                                                       const RhsMapper& rhs, ResScalar* res, Index resIncr,
+                                                       ResScalar alpha) {
+  LhsMapper lhs(alhs);  // work on a local copy of the mapper
+  eigen_internal_assert(rhs.stride() == 1);
+
+  enum {
+    LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+    LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+  };
+
+  using UnsignedIndex = std::make_unsigned_t<Index>;  // the two bounds below round cols down to a packet multiple
+  const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
+  const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
+
+  // Disable the 8-row unroll once one LHS stride (lhs.stride() * sizeof(LhsScalar) bytes) exceeds the L1 size
+  // reported by manage_caching_sizes; with such large strides each unrolled iteration evicts earlier rows.
+  std::ptrdiff_t l1, l2, l3;  // only l1 is used below
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  const Index n8 = lhs.stride() * Index(sizeof(LhsScalar)) > Index(l1) ? 0 : rows - 7;
+  const Index n4 = rows - 3;
+  const Index n2 = rows - 1;
+
+  Index i = 0;
+  for (; i < n8; i += 8) {
+    process_rows_small_cols<8>(i, cols, lhs, rhs, res, resIncr, alpha, halfColBlockEnd, quarterColBlockEnd);
+  }
+  // Remaining rows in groups of 4; this handles every full group of 4 when the 8-row loop is disabled (n8 == 0).
+  for (; i < n4; i += 4) {
+    process_rows_small_cols<4>(i, cols, lhs, rhs, res, resIncr, alpha, halfColBlockEnd, quarterColBlockEnd);
+  }
+  if (i < n2) {  // 2 or 3 rows left: process one pair
+    process_rows_small_cols<2>(i, cols, lhs, rhs, res, resIncr, alpha, halfColBlockEnd, quarterColBlockEnd);
+    i += 2;
+  }
+  if (i < rows) {  // exactly one row left
+    process_rows_small_cols<1>(i, cols, lhs, rhs, res, resIncr, alpha, halfColBlockEnd, quarterColBlockEnd);
+  }
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
 
+#if EIGEN_COMP_MSVC
+#pragma warning(pop)
+#endif
+
 #endif  // EIGEN_GENERAL_MATRIX_VECTOR_H
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
index 4010a0a6733..18adfd1520f 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -132,6 +132,8 @@ EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
 EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
 #endif
 
+#undef EIGEN_BLAS_GEMV_SPECIALIZE
+#undef EIGEN_BLAS_GEMV_SPECIALIZATION
 }  // namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 4f3668944fc..96cc237c31f 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -47,7 +47,7 @@ inline void manage_multi_threading(Action action, int* v);
 // Public APIs.
 
 /** Must be call first when calling Eigen from multiple threads */
-EIGEN_DEPRECATED inline void initParallel() {}
+EIGEN_DEPRECATED_WITH_REASON("Initialization is no longer needed.") inline void initParallel() {}
 
 /** \returns the max number of threads reserved for Eigen
  * \sa setNbThreads */
@@ -141,7 +141,7 @@ inline void manage_multi_threading(Action action, int* v) {
     // for OpenMP.
     eigen_internal_assert(*v >= 0);
     int omp_threads = omp_get_max_threads();
-    m_maxThreads = (*v == 0 ? omp_threads : std::min(*v, omp_threads));
+    m_maxThreads = (*v == 0 ? omp_threads : std::min<int>(*v, omp_threads));
 #elif defined(EIGEN_GEMM_THREADPOOL)
     // Calling action == SetAction and *v = 0 means
     // restoring m_maxThreads to the number of threads in the ThreadPool,
@@ -182,7 +182,7 @@ EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index
 
   // compute the maximal number of threads from the total amount of work:
   double work = static_cast<double>(rows) * static_cast<double>(cols) * static_cast<double>(depth);
-  double kMinTaskSize = 50000;  // FIXME improve this heuristic.
+  double kMinTaskSize = 50000;  // FIXME: tune this minimum task-size heuristic based on architecture and scalar type.
   pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>(work / kMinTaskSize)));
 
   // compute the number of threads we are going to use
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 899283dcc23..fd66b609348 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -22,7 +22,7 @@ template <typename Scalar, typename Index, int Pack1, int Pack2_dummy, int Stora
 struct symm_pack_lhs {
   template <int BlockRows>
   inline void pack(Scalar* blockA, const const_blas_data_mapper<Scalar, Index, StorageOrder>& lhs, Index cols, Index i,
-                   Index& count) {
+                   Index& count) const {
     // normal copy
     for (Index k = 0; k < i; k++)
       for (Index w = 0; w < BlockRows; w++) blockA[count++] = lhs(i + w, k);  // normal
@@ -40,7 +40,7 @@ struct symm_pack_lhs {
     for (Index k = i + BlockRows; k < cols; k++)
       for (Index w = 0; w < BlockRows; w++) blockA[count++] = numext::conj(lhs(k, i + w));  // transposed
   }
-  void operator()(Scalar* blockA, const Scalar* lhs_, Index lhsStride, Index cols, Index rows) {
+  void operator()(Scalar* blockA, const Scalar* lhs_, Index lhsStride, Index cols, Index rows) const {
     typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
     typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half
         QuarterPacket;
@@ -99,7 +99,7 @@ struct symm_pack_lhs {
 template <typename Scalar, typename Index, int nr, int StorageOrder>
 struct symm_pack_rhs {
   enum { PacketSize = packet_traits<Scalar>::size };
-  void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) {
+  void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) const {
     Index end_k = k2 + rows;
     Index count = 0;
     const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(rhs_, rhsStride);
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
index c0dbfd18795..684a90f8d49 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -270,6 +270,10 @@ EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
 EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
 EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
 #endif
+
+#undef EIGEN_BLAS_SYMM_L
+#undef EIGEN_BLAS_SYMM_R
+#undef EIGEN_BLAS_HEMM_R
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index f7387601ffb..f87509bb3bf 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -18,9 +18,11 @@ namespace Eigen {
 namespace internal {
 
 /* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at once that allows to both reduce
- * the number of load/stores of the result by a factor 2 and to reduce
- * the instruction dependency.
+ * This algorithm processes 4 columns at once to reduce the number of
+ * load/stores of the result vector by a factor of 4 compared to the
+ * naive approach, and to increase instruction-level parallelism.
+ * A 2-column cleanup handles the remaining even columns, and a
+ * 1-column loop handles any final odd column.
  */
 
 template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
@@ -61,84 +63,246 @@ selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLh
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-  Index bound = numext::maxi(Index(0), size - 8) & 0xfffffffe;
-  if (FirstTriangular) bound = size - bound;
-
-  for (Index j = FirstTriangular ? bound : 0; j < (FirstTriangular ? size : bound); j += 2) {
-    const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
-    const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
-
-    Scalar t0 = cjAlpha * rhs[j];
-    Packet ptmp0 = pset1<Packet>(t0);
-    Scalar t1 = cjAlpha * rhs[j + 1];
-    Packet ptmp1 = pset1<Packet>(t1);
-
-    Scalar t2(0);
-    Packet ptmp2 = pset1<Packet>(t2);
-    Scalar t3(0);
-    Packet ptmp3 = pset1<Packet>(t3);
-
-    Index starti = FirstTriangular ? 0 : j + 2;
-    Index endi = FirstTriangular ? j : size;
-    Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti);
-    Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize);
-
-    res[j] += cjd.pmul(numext::real(A0[j]), t0);
-    res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
-    if (FirstTriangular) {
-      res[j] += cj0.pmul(A1[j], t1);
-      t3 += cj1.pmul(A1[j], rhs[j]);
-    } else {
-      res[j + 1] += cj0.pmul(A0[j + 1], t0);
-      t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
+  // Compute column counts for the 4-col, 2-col, and 1-col phases. The 4-col phase leaves 8 to 11 columns
+  // (every column when size < 8) to the cleanup phases, because their off-diagonal ranges are short.
+  Index n4 = (numext::maxi(Index(0), size - 8) / 4) * 4;
+  Index n2 = ((size - n4) / 2) * 2;
+  // Remaining (size - n4 - n2) is 0 or 1 columns.
+
+  // For !FirstTriangular: 4-col [0, n4), 2-col [n4, n4+n2), 1-col [n4+n2, size)
+  // For FirstTriangular:  1-col [0, size-n4-n2), 2-col [size-n4-n2, size-n4), 4-col [size-n4, size)
+
+  // === Phase 1: 4 columns at a time ===
+  {
+    Index jStart = FirstTriangular ? (size - n4) : 0;
+    Index jEnd = FirstTriangular ? size : n4;
+
+    for (Index j = jStart; j < jEnd; j += 4) {
+      const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
+      const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
+      const Scalar* EIGEN_RESTRICT A2 = lhs + (j + 2) * lhsStride;
+      const Scalar* EIGEN_RESTRICT A3 = lhs + (j + 3) * lhsStride;
+
+      Scalar t0 = cjAlpha * rhs[j];
+      Scalar t1 = cjAlpha * rhs[j + 1];
+      Scalar t2 = cjAlpha * rhs[j + 2];
+      Scalar t3 = cjAlpha * rhs[j + 3];
+      Packet ptmp0 = pset1<Packet>(t0);
+      Packet ptmp1 = pset1<Packet>(t1);
+      Packet ptmp2 = pset1<Packet>(t2);
+      Packet ptmp3 = pset1<Packet>(t3);
+
+      Scalar t4(0), t5(0), t6(0), t7(0);
+      Packet ptmp4 = pzero(Packet{});
+      Packet ptmp5 = pzero(Packet{});
+      Packet ptmp6 = pzero(Packet{});
+      Packet ptmp7 = pzero(Packet{});
+
+      Index starti = FirstTriangular ? 0 : j + 4;
+      Index endi = FirstTriangular ? j : size;
+      Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
+      Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
+
+      // Handle the 4x4 diagonal block: diagonal elements
+      res[j] += cjd.pmul(numext::real(A0[j]), t0);
+      res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
+      res[j + 2] += cjd.pmul(numext::real(A2[j + 2]), t2);
+      res[j + 3] += cjd.pmul(numext::real(A3[j + 3]), t3);
+
+      // Handle the 4x4 diagonal block: off-diagonal cross terms
+      if (FirstTriangular) {
+        // Upper triangle stored (A_k[l] for l <= k)
+        res[j] += cj0.pmul(A1[j], t1) + cj0.pmul(A2[j], t2) + cj0.pmul(A3[j], t3);
+        res[j + 1] += cj0.pmul(A2[j + 1], t2) + cj0.pmul(A3[j + 1], t3);
+        res[j + 2] += cj0.pmul(A3[j + 2], t3);
+
+        t5 += cj1.pmul(A1[j], rhs[j]);
+        t6 += cj1.pmul(A2[j], rhs[j]) + cj1.pmul(A2[j + 1], rhs[j + 1]);
+        t7 += cj1.pmul(A3[j], rhs[j]) + cj1.pmul(A3[j + 1], rhs[j + 1]) + cj1.pmul(A3[j + 2], rhs[j + 2]);
+      } else {
+        // Lower triangle stored (A_k[l] for l >= k)
+        res[j + 1] += cj0.pmul(A0[j + 1], t0);
+        res[j + 2] += cj0.pmul(A0[j + 2], t0) + cj0.pmul(A1[j + 2], t1);
+        res[j + 3] += cj0.pmul(A0[j + 3], t0) + cj0.pmul(A1[j + 3], t1) + cj0.pmul(A2[j + 3], t2);
+
+        t4 += cj1.pmul(A0[j + 1], rhs[j + 1]) + cj1.pmul(A0[j + 2], rhs[j + 2]) + cj1.pmul(A0[j + 3], rhs[j + 3]);
+        t5 += cj1.pmul(A1[j + 2], rhs[j + 2]) + cj1.pmul(A1[j + 3], rhs[j + 3]);
+        t6 += cj1.pmul(A2[j + 3], rhs[j + 3]);
+      }
+
+      // Pre-alignment scalar loop
+      for (Index i = starti; i < alignedStart; ++i) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
+        t4 += cj1.pmul(A0[i], rhs[i]);
+        t5 += cj1.pmul(A1[i], rhs[i]);
+        t6 += cj1.pmul(A2[i], rhs[i]);
+        t7 += cj1.pmul(A3[i], rhs[i]);
+      }
+
+      // Main vectorized loop: 4 matrix column loads, 1 rhs load, 1 result load/store
+      const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a2It = A2 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a3It = A3 + alignedStart;
+      const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+      Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+      for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+        Packet A0i = ploadu<Packet>(a0It);
+        a0It += PacketSize;
+        Packet A1i = ploadu<Packet>(a1It);
+        a1It += PacketSize;
+        Packet A2i = ploadu<Packet>(a2It);
+        a2It += PacketSize;
+        Packet A3i = ploadu<Packet>(a3It);
+        a3It += PacketSize;
+        Packet Bi = ploadu<Packet>(rhsIt);
+        rhsIt += PacketSize;
+        Packet Xi = pload<Packet>(resIt);
+
+        Xi = pcj0.pmadd(A0i, ptmp0, Xi);
+        Xi = pcj0.pmadd(A1i, ptmp1, Xi);
+        Xi = pcj0.pmadd(A2i, ptmp2, Xi);
+        Xi = pcj0.pmadd(A3i, ptmp3, Xi);
+        pstore(resIt, Xi);
+        resIt += PacketSize;
+
+        ptmp4 = pcj1.pmadd(A0i, Bi, ptmp4);
+        ptmp5 = pcj1.pmadd(A1i, Bi, ptmp5);
+        ptmp6 = pcj1.pmadd(A2i, Bi, ptmp6);
+        ptmp7 = pcj1.pmadd(A3i, Bi, ptmp7);
+      }
+
+      // Post-alignment scalar loop
+      for (Index i = alignedEnd; i < endi; ++i) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1) + cj0.pmul(A2[i], t2) + cj0.pmul(A3[i], t3);
+        t4 += cj1.pmul(A0[i], rhs[i]);
+        t5 += cj1.pmul(A1[i], rhs[i]);
+        t6 += cj1.pmul(A2[i], rhs[i]);
+        t7 += cj1.pmul(A3[i], rhs[i]);
+      }
+
+      res[j] += alpha * (t4 + predux(ptmp4));
+      res[j + 1] += alpha * (t5 + predux(ptmp5));
+      res[j + 2] += alpha * (t6 + predux(ptmp6));
+      res[j + 3] += alpha * (t7 + predux(ptmp7));
     }
+  }
 
-    for (Index i = starti; i < alignedStart; ++i) {
-      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
-      t2 += cj1.pmul(A0[i], rhs[i]);
-      t3 += cj1.pmul(A1[i], rhs[i]);
-    }
-    // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
-    // gcc 4.2 does this optimization automatically.
-    const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
-    const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
-    const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
-    Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
-    for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
-      Packet A0i = ploadu<Packet>(a0It);
-      a0It += PacketSize;
-      Packet A1i = ploadu<Packet>(a1It);
-      a1It += PacketSize;
-      Packet Bi = ploadu<Packet>(rhsIt);
-      rhsIt += PacketSize;  // FIXME should be aligned in most cases
-      Packet Xi = pload<Packet>(resIt);
-
-      Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
-      ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
-      ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
-      pstore(resIt, Xi);
-      resIt += PacketSize;
+  // === Phase 2: 2 columns at a time ===
+  {
+    Index jStart = FirstTriangular ? (size - n4 - n2) : n4;
+    Index jEnd = FirstTriangular ? (size - n4) : (n4 + n2);
+
+    for (Index j = jStart; j < jEnd; j += 2) {
+      const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
+      const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
+
+      Scalar t0 = cjAlpha * rhs[j];
+      Packet ptmp0 = pset1<Packet>(t0);
+      Scalar t1 = cjAlpha * rhs[j + 1];
+      Packet ptmp1 = pset1<Packet>(t1);
+
+      Scalar t2(0);
+      Packet ptmp2 = pzero(Packet{});
+      Scalar t3(0);
+      Packet ptmp3 = pzero(Packet{});
+
+      Index starti = FirstTriangular ? 0 : j + 2;
+      Index endi = FirstTriangular ? j : size;
+      Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
+      Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
+
+      res[j] += cjd.pmul(numext::real(A0[j]), t0);
+      res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
+      if (FirstTriangular) {
+        res[j] += cj0.pmul(A1[j], t1);
+        t3 += cj1.pmul(A1[j], rhs[j]);
+      } else {
+        res[j + 1] += cj0.pmul(A0[j + 1], t0);
+        t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
+      }
+
+      for (Index i = starti; i < alignedStart; ++i) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+        t3 += cj1.pmul(A1[i], rhs[i]);
+      }
+      const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+      const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
+      const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+      Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+      for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+        Packet A0i = ploadu<Packet>(a0It);
+        a0It += PacketSize;
+        Packet A1i = ploadu<Packet>(a1It);
+        a1It += PacketSize;
+        Packet Bi = ploadu<Packet>(rhsIt);
+        rhsIt += PacketSize;
+        Packet Xi = pload<Packet>(resIt);
+
+        Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
+        ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+        ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
+        pstore(resIt, Xi);
+        resIt += PacketSize;
+      }
+      for (Index i = alignedEnd; i < endi; i++) {
+        res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+        t3 += cj1.pmul(A1[i], rhs[i]);
+      }
+
+      res[j] += alpha * (t2 + predux(ptmp2));
+      res[j + 1] += alpha * (t3 + predux(ptmp3));
     }
-    for (Index i = alignedEnd; i < endi; i++) {
-      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
-      t2 += cj1.pmul(A0[i], rhs[i]);
-      t3 += cj1.pmul(A1[i], rhs[i]);
-    }
-
-    res[j] += alpha * (t2 + predux(ptmp2));
-    res[j + 1] += alpha * (t3 + predux(ptmp3));
   }
-  for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) {
-    const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
-
-    Scalar t1 = cjAlpha * rhs[j];
-    Scalar t2(0);
-    res[j] += cjd.pmul(numext::real(A0[j]), t1);
-    for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) {
-      res[i] += cj0.pmul(A0[i], t1);
-      t2 += cj1.pmul(A0[i], rhs[i]);
+
+  // === Phase 3: 1 column at a time ===
+  {
+    Index jStart = FirstTriangular ? 0 : (n4 + n2);
+    Index jEnd = FirstTriangular ? (size - n4 - n2) : size;
+
+    for (Index j = jStart; j < jEnd; j++) {
+      const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
+
+      Scalar t1 = cjAlpha * rhs[j];
+      Scalar t2(0);
+      Packet ptmp1 = pset1<Packet>(t1);
+      Packet ptmp2 = pzero(Packet{});
+
+      res[j] += cjd.pmul(numext::real(A0[j]), t1);
+
+      Index starti = FirstTriangular ? 0 : j + 1;
+      Index endi = FirstTriangular ? j : size;
+      Index alignedStart = starti + internal::first_default_aligned(&res[starti], endi - starti);
+      Index alignedEnd = alignedStart + ((endi - alignedStart) / PacketSize) * PacketSize;
+
+      for (Index i = starti; i < alignedStart; ++i) {
+        res[i] += cj0.pmul(A0[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+      }
+      const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+      const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+      Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+      for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+        Packet A0i = ploadu<Packet>(a0It);
+        a0It += PacketSize;
+        Packet Bi = ploadu<Packet>(rhsIt);
+        rhsIt += PacketSize;
+        Packet Xi = pload<Packet>(resIt);
+
+        Xi = pcj0.pmadd(A0i, ptmp1, Xi);
+        pstore(resIt, Xi);
+        resIt += PacketSize;
+
+        ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+      }
+      for (Index i = alignedEnd; i < endi; i++) {
+        res[i] += cj0.pmul(A0[i], t1);
+        t2 += cj1.pmul(A0[i], rhs[i]);
+      }
+      res[j] += alpha * (t2 + predux(ptmp2));
     }
-    res[j] += alpha * t2;
   }
 }
 
@@ -164,6 +328,11 @@ struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
 
   enum { LhsUpLo = LhsMode & (Upper | Lower) };
 
+  // Verify that the Rhs is a vector in the correct orientation.
+  // Otherwise, we break the assumption that we are multiplying
+  // MxN * Nx1.
+  static_assert(Rhs::ColsAtCompileTime == 1, "The RHS must be a column vector.");
+
   template <typename Dest>
   static EIGEN_DEVICE_FUNC void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
     typedef typename Dest::Scalar ResScalar;
@@ -173,11 +342,6 @@ struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
 
     eigen_assert(dest.rows() == a_lhs.rows() && dest.cols() == a_rhs.cols());
 
-    if (a_lhs.rows() == 1) {
-      dest = (alpha * a_lhs.coeff(0, 0)) * a_rhs;
-      return;
-    }
-
     add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
     add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
 
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
index 187c9115a47..413d9490e94 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -108,6 +108,8 @@ EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
 EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
 #endif
 
+#undef EIGEN_BLAS_SYMV_SPECIALIZATION
+#undef EIGEN_BLAS_SYMV_SPECIALIZE
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index f1034655eb7..0e0f349061f 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -24,14 +24,104 @@ namespace Eigen {
 template <typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
 struct selfadjoint_rank1_update<Scalar, Index, ColMajor, UpLo, ConjLhs, ConjRhs> {
   static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) {
-    internal::conj_if<ConjRhs> cj;
-    typedef Map<const Matrix<Scalar, Dynamic, 1> > OtherMap;
-    typedef std::conditional_t<ConjLhs, typename OtherMap::ConjugateReturnType, const OtherMap&> ConjLhsType;
-    for (Index i = 0; i < size; ++i) {
-      Map<Matrix<Scalar, Dynamic, 1> >(mat + stride * i + (UpLo == Lower ? i : 0),
-                                       (UpLo == Lower ? size - i : (i + 1))) +=
-          (alpha * cj(vecY[i])) *
-          ConjLhsType(OtherMap(vecX + (UpLo == Lower ? i : 0), UpLo == Lower ? size - i : (i + 1)));
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+    const Index PacketSize = internal::unpacket_traits<Packet>::size;
+
+    internal::conj_if<ConjRhs> cjy;  // applied to each vecY entry
+    internal::conj_if<ConjLhs> cjx;  // applied to each vecX entry in the scalar code paths
+    internal::conj_helper<Packet, Packet, ConjLhs, false> pcj;  // used for the packet multiply-adds on vecX packets
+
+    // Main loop: process 2 columns (j and j+1) at a time to share vecX loads and reduce loop overhead.
+    Index j = 0;
+    for (; j + 1 < size; j += 2) {
+      Scalar s0 = alpha * cjy(vecY[j]);  // scale factor for column j
+      Scalar s1 = alpha * cjy(vecY[j + 1]);  // scale factor for column j+1
+      Packet ps0 = internal::pset1<Packet>(s0);
+      Packet ps1 = internal::pset1<Packet>(s1);
+
+      if (UpLo == Lower) {
+        Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;  // column j, starting at row j
+        Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);  // column j+1, starting at row j+1
+
+        // Diagonal and cross-diagonal scalar elements: entries (j,j), (j+1,j) and (j+1,j+1)
+        col0[0] += s0 * cjx(vecX[j]);
+        col0[1] += s0 * cjx(vecX[j + 1]);
+        col1[0] += s1 * cjx(vecX[j + 1]);
+
+        // Shared vectorized loop for rows j+2..size-1 (the rows both columns have in common)
+        Index len = size - j - 2;
+        const Scalar* EIGEN_RESTRICT xp = vecX + j + 2;
+        Scalar* EIGEN_RESTRICT d0 = col0 + 2;  // row j+2 of column j
+        Scalar* EIGEN_RESTRICT d1 = col1 + 1;  // row j+2 of column j+1
+
+        Index k = 0;
+        Index vectorizedEnd = (len / PacketSize) * PacketSize;  // largest multiple of PacketSize not exceeding len
+        for (; k < vectorizedEnd; k += PacketSize) {
+          Packet xi = internal::ploadu<Packet>(xp + k);
+          Packet m0 = internal::ploadu<Packet>(d0 + k);
+          m0 = pcj.pmadd(xi, ps0, m0);
+          internal::pstoreu(d0 + k, m0);
+          Packet m1 = internal::ploadu<Packet>(d1 + k);
+          m1 = pcj.pmadd(xi, ps1, m1);
+          internal::pstoreu(d1 + k, m1);
+        }
+        for (; k < len; ++k) {  // scalar tail: the remaining len - vectorizedEnd rows
+          Scalar cx = cjx(xp[k]);
+          d0[k] += s0 * cx;
+          d1[k] += s1 * cx;
+        }
+      } else {
+        // UpLo == Upper
+        Scalar* EIGEN_RESTRICT col0 = mat + stride * j;  // column j, starting at row 0
+        Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);  // column j+1, starting at row 0
+
+        // Shared vectorized loop for rows 0..j-1 (the rows both columns have in common)
+        const Scalar* EIGEN_RESTRICT xp = vecX;
+        Index len = j;
+        Index k = 0;
+        Index vectorizedEnd = (len / PacketSize) * PacketSize;  // largest multiple of PacketSize not exceeding len
+        for (; k < vectorizedEnd; k += PacketSize) {
+          Packet xi = internal::ploadu<Packet>(xp + k);
+          Packet m0 = internal::ploadu<Packet>(col0 + k);
+          Packet m1 = internal::ploadu<Packet>(col1 + k);
+          m0 = pcj.pmadd(xi, ps0, m0);
+          m1 = pcj.pmadd(xi, ps1, m1);
+          internal::pstoreu(col0 + k, m0);
+          internal::pstoreu(col1 + k, m1);
+        }
+        for (; k < len; ++k) {  // scalar tail: the remaining len - vectorizedEnd rows
+          Scalar cx = cjx(xp[k]);
+          col0[k] += s0 * cx;
+          col1[k] += s1 * cx;
+        }
+
+        // Diagonal and cross-diagonal scalar elements: entries (j,j), (j,j+1) and (j+1,j+1)
+        col0[j] += s0 * cjx(vecX[j]);
+        col1[j] += s1 * cjx(vecX[j]);
+        col1[j + 1] += s1 * cjx(vecX[j + 1]);
+      }
+    }
+
+    // Handle last column if size is odd (the 2-column loop above leaves j == size - 1 in that case)
+    if (j < size) {
+      Scalar s = alpha * cjy(vecY[j]);
+      Packet ps = internal::pset1<Packet>(s);
+      Index start = UpLo == Lower ? j : 0;  // first updated row: the diagonal (Lower) or row 0 (Upper)
+      Index len = UpLo == Lower ? size - j : j + 1;  // rows j..size-1 (Lower) or rows 0..j (Upper)
+      Scalar* EIGEN_RESTRICT dst = mat + stride * j + start;
+      const Scalar* EIGEN_RESTRICT xp = vecX + start;
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;  // largest multiple of PacketSize not exceeding len
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet xi = internal::ploadu<Packet>(xp + k);
+        Packet di = internal::ploadu<Packet>(dst + k);
+        di = pcj.pmadd(xi, ps, di);
+        internal::pstoreu(dst + k, di);
+      }
+      for (; k < len; ++k) {  // scalar tail
+        dst[k] += s * cjx(xp[k]);
+      }
     }
   }
 };
@@ -123,7 +213,7 @@ template <typename MatrixType, unsigned int UpLo>
 template <typename DerivedU>
 EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType, UpLo>& SelfAdjointView<MatrixType, UpLo>::rankUpdate(
     const MatrixBase<DerivedU>& u, const Scalar& alpha) {
-  selfadjoint_product_selector<MatrixType, DerivedU, UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
+  selfadjoint_product_selector<MatrixType, DerivedU, UpLo>::run(nestedExpression(), u.derived(), alpha);
 
   return *this;
 }
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index 9c234ec2a96..165875ff421 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -21,35 +21,174 @@ namespace internal {
  * It corresponds to the Level2 syr2 BLAS routine
  */
 
-template <typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+template <typename Scalar, typename Index, int UpLo>
 struct selfadjoint_rank2_update_selector;
 
-template <typename Scalar, typename Index, typename UType, typename VType>
-struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Lower> {
-  static EIGEN_DEVICE_FUNC void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
-    const Index size = u.size();
-    for (Index i = 0; i < size; ++i) {
-      Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i + i, size - i) +=
-          (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size - i) +
-          (alpha * numext::conj(v.coeff(i))) * u.tail(size - i);
+template <typename Scalar, typename Index>
+struct selfadjoint_rank2_update_selector<Scalar, Index, Lower> {
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
+    typedef typename packet_traits<Scalar>::type Packet;
+    const Index PacketSize = unpacket_traits<Packet>::size;
+    const Scalar cAlpha = numext::conj(alpha);  // conj(alpha) scales the v term, alpha scales the u term
+
+    // Process 2 columns (j and j+1) at a time to share u/v loads and reduce loop overhead.
+    Index j = 0;
+    for (; j + 1 < size; j += 2) {
+      // Scale factors: col[j:] += s0u * v[j:] + s0v * u[j:] (and likewise s1u/s1v for column j+1)
+      Scalar s0u = cAlpha * numext::conj(u[j]);
+      Scalar s0v = alpha * numext::conj(v[j]);
+      Scalar s1u = cAlpha * numext::conj(u[j + 1]);
+      Scalar s1v = alpha * numext::conj(v[j + 1]);
+
+      Packet ps0u = pset1<Packet>(s0u);
+      Packet ps0v = pset1<Packet>(s0v);
+      Packet ps1u = pset1<Packet>(s1u);
+      Packet ps1v = pset1<Packet>(s1v);
+
+      Scalar* EIGEN_RESTRICT col0 = mat + stride * j + j;  // column j, starting at row j
+      Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1) + (j + 1);  // column j+1, starting at row j+1
+
+      // Diagonal and cross-diagonal scalar elements: entries (j,j), (j+1,j) and (j+1,j+1)
+      col0[0] += s0u * v[j] + s0v * u[j];
+      col0[1] += s0u * v[j + 1] + s0v * u[j + 1];
+      col1[0] += s1u * v[j + 1] + s1v * u[j + 1];
+
+      // Shared vectorized loop for rows j+2..size-1 (the rows both columns have in common)
+      Index len = size - j - 2;
+      const Scalar* EIGEN_RESTRICT up = u + j + 2;
+      const Scalar* EIGEN_RESTRICT vp = v + j + 2;
+      Scalar* EIGEN_RESTRICT d0 = col0 + 2;  // row j+2 of column j
+      Scalar* EIGEN_RESTRICT d1 = col1 + 1;  // row j+2 of column j+1
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;  // largest multiple of PacketSize not exceeding len
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(up + k);
+        Packet vi = ploadu<Packet>(vp + k);
+        Packet m0 = ploadu<Packet>(d0 + k);
+        m0 = pmadd(vi, ps0u, m0);
+        m0 = pmadd(ui, ps0v, m0);
+        pstoreu(d0 + k, m0);
+        Packet m1 = ploadu<Packet>(d1 + k);
+        m1 = pmadd(vi, ps1u, m1);
+        m1 = pmadd(ui, ps1v, m1);
+        pstoreu(d1 + k, m1);
+      }
+      for (; k < len; ++k) {  // scalar tail: the remaining len - vectorizedEnd rows
+        d0[k] += s0u * vp[k] + s0v * up[k];
+        d1[k] += s1u * vp[k] + s1v * up[k];
+      }
+    }
+
+    // Handle last column if size is odd (the 2-column loop above leaves j == size - 1 in that case)
+    if (j < size) {
+      Scalar su = cAlpha * numext::conj(u[j]);
+      Scalar sv = alpha * numext::conj(v[j]);
+      Packet psu = pset1<Packet>(su);
+      Packet psv = pset1<Packet>(sv);
+
+      Scalar* EIGEN_RESTRICT dst = mat + stride * j + j;  // column j, starting at row j
+      const Scalar* EIGEN_RESTRICT up = u + j;
+      const Scalar* EIGEN_RESTRICT vp = v + j;
+      Index len = size - j;  // rows j..size-1
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;  // largest multiple of PacketSize not exceeding len
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(up + k);
+        Packet vi = ploadu<Packet>(vp + k);
+        Packet di = ploadu<Packet>(dst + k);
+        di = pmadd(vi, psu, di);
+        di = pmadd(ui, psv, di);
+        pstoreu(dst + k, di);
+      }
+      for (; k < len; ++k) {  // scalar tail
+        dst[k] += su * vp[k] + sv * up[k];
+      }
     }
   }
 };
 
-template <typename Scalar, typename Index, typename UType, typename VType>
-struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Upper> {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
-    const Index size = u.size();
-    for (Index i = 0; i < size; ++i)
-      Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i, i + 1) +=
-          (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i + 1) +
-          (alpha * numext::conj(v.coeff(i))) * u.head(i + 1);
+template <typename Scalar, typename Index>
+struct selfadjoint_rank2_update_selector<Scalar, Index, Upper> {
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, const Scalar& alpha) {
+    typedef typename packet_traits<Scalar>::type Packet;
+    const Index PacketSize = unpacket_traits<Packet>::size;
+    const Scalar cAlpha = numext::conj(alpha);  // conj(alpha) scales the v term, alpha scales the u term
+
+    // Process 2 columns (j and j+1) at a time to share u/v loads and reduce loop overhead.
+    Index j = 0;
+    for (; j + 1 < size; j += 2) {
+      Scalar s0u = cAlpha * numext::conj(u[j]);  // column j: factor multiplying v
+      Scalar s0v = alpha * numext::conj(v[j]);  // column j: factor multiplying u
+      Scalar s1u = cAlpha * numext::conj(u[j + 1]);  // column j+1: factor multiplying v
+      Scalar s1v = alpha * numext::conj(v[j + 1]);  // column j+1: factor multiplying u
+
+      Packet ps0u = pset1<Packet>(s0u);
+      Packet ps0v = pset1<Packet>(s0v);
+      Packet ps1u = pset1<Packet>(s1u);
+      Packet ps1v = pset1<Packet>(s1v);
+
+      Scalar* EIGEN_RESTRICT col0 = mat + stride * j;  // column j, starting at row 0
+      Scalar* EIGEN_RESTRICT col1 = mat + stride * (j + 1);  // column j+1, starting at row 0
+
+      // Shared vectorized loop for rows 0..j-1 (the rows both columns have in common)
+      Index len = j;
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;  // largest multiple of PacketSize not exceeding len
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(u + k);
+        Packet vi = ploadu<Packet>(v + k);
+        Packet m0 = ploadu<Packet>(col0 + k);
+        m0 = pmadd(vi, ps0u, m0);
+        m0 = pmadd(ui, ps0v, m0);
+        pstoreu(col0 + k, m0);
+        Packet m1 = ploadu<Packet>(col1 + k);
+        m1 = pmadd(vi, ps1u, m1);
+        m1 = pmadd(ui, ps1v, m1);
+        pstoreu(col1 + k, m1);
+      }
+      for (; k < len; ++k) {  // scalar tail: the remaining len - vectorizedEnd rows
+        col0[k] += s0u * v[k] + s0v * u[k];
+        col1[k] += s1u * v[k] + s1v * u[k];
+      }
+
+      // Diagonal and cross-diagonal scalar elements: entries (j,j), (j,j+1) and (j+1,j+1)
+      col0[j] += s0u * v[j] + s0v * u[j];
+      col1[j] += s1u * v[j] + s1v * u[j];
+      col1[j + 1] += s1u * v[j + 1] + s1v * u[j + 1];
+    }
+
+    // Handle last column if size is odd (the 2-column loop above leaves j == size - 1 in that case)
+    if (j < size) {
+      Scalar su = cAlpha * numext::conj(u[j]);
+      Scalar sv = alpha * numext::conj(v[j]);
+      Packet psu = pset1<Packet>(su);
+      Packet psv = pset1<Packet>(sv);
+
+      Scalar* EIGEN_RESTRICT dst = mat + stride * j;  // column j, starting at row 0
+      Index len = j + 1;  // rows 0..j, diagonal included
+
+      Index k = 0;
+      Index vectorizedEnd = (len / PacketSize) * PacketSize;  // largest multiple of PacketSize not exceeding len
+      for (; k < vectorizedEnd; k += PacketSize) {
+        Packet ui = ploadu<Packet>(u + k);
+        Packet vi = ploadu<Packet>(v + k);
+        Packet di = ploadu<Packet>(dst + k);
+        di = pmadd(vi, psu, di);
+        di = pmadd(ui, psv, di);
+        pstoreu(dst + k, di);
+      }
+      for (; k < len; ++k) {  // scalar tail
+        dst[k] += su * v[k] + sv * u[k];
+      }
+    }
   }
 };
 
 template <bool Cond, typename T>
 using conj_expr_if =
-    std::conditional<!Cond, const T&, CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>, T>>;
+    std::conditional_t<!Cond, const T&, CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>, T>>;
 
 }  // end namespace internal
 
@@ -69,23 +208,49 @@ EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType, UpLo>& SelfAdjointView<MatrixType,
 
   // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
   // vice versa, and take the complex conjugate of all coefficients and vector entries.
+  enum {
+    IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0,
+    // Conjugate only for complex scalars, and only when exactly one of IsRowMajor / NeedToConjugate is set
+    NeedConjU = (int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
+    NeedConjV = (int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate)) && NumTraits<Scalar>::IsComplex,
+    UseUDirectly = ActualUType_::InnerStrideAtCompileTime == 1 && !NeedConjU,
+    UseVDirectly = ActualVType_::InnerStrideAtCompileTime == 1 && !NeedConjV
+  };
 
-  enum { IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0 };
   Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) *
                        numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
   if (IsRowMajor) actualAlpha = numext::conj(actualAlpha);
 
-  typedef internal::remove_all_t<
-      typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), ActualUType_>::type>
-      UType;
-  typedef internal::remove_all_t<
-      typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), ActualVType_>::type>
-      VType;
-  internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
-                                              (IsRowMajor ? int(UpLo == Upper ? Lower : Upper)
-                                                          : UpLo)>::run(_expression().const_cast_derived().data(),
-                                                                        _expression().outerStride(), UType(actualU),
-                                                                        VType(actualV), actualAlpha);
+  const Index size = u.size();
+
+  // Copy u to contiguous buffer, applying conjugation if needed
+  internal::gemv_static_vector_if<Scalar, DerivedU::SizeAtCompileTime, DerivedU::MaxSizeAtCompileTime, !UseUDirectly>
+      static_u;
+  ei_declare_aligned_stack_constructed_variable(Scalar, uPtr, size,
+                                                (UseUDirectly ? const_cast<Scalar*>(actualU.data()) : static_u.data()));
+  if (!UseUDirectly) {
+    if (NeedConjU)
+      Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU.conjugate();
+    else
+      Map<typename ActualUType_::PlainObject>(uPtr, size) = actualU;
+  }
+
+  // Copy v to contiguous buffer, applying conjugation if needed
+  internal::gemv_static_vector_if<Scalar, DerivedV::SizeAtCompileTime, DerivedV::MaxSizeAtCompileTime, !UseVDirectly>
+      static_v;
+  ei_declare_aligned_stack_constructed_variable(Scalar, vPtr, size,
+                                                (UseVDirectly ? const_cast<Scalar*>(actualV.data()) : static_v.data()));
+  if (!UseVDirectly) {
+    if (NeedConjV)
+      Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV.conjugate();
+    else
+      Map<typename ActualVType_::PlainObject>(vPtr, size) = actualV;
+  }
+
+  internal::selfadjoint_rank2_update_selector<
+      Scalar, Index, (IsRowMajor ? int(UpLo == Upper ? Lower : Upper) : UpLo)>::run(size, nestedExpression().data(),
+                                                                                    nestedExpression().outerStride(),
+                                                                                    uPtr, vPtr, actualAlpha);
 
   return *this;
 }
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index a0d05ef8fdb..e4e446e4d81 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -17,30 +17,6 @@ namespace Eigen {
 
 namespace internal {
 
-// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
-// struct gemm_pack_lhs_triangular
-// {
-//   Matrix<Scalar,mr,mr,
-//   void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* lhs_, int lhsStride, int depth, int rows)
-//   {
-//     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-//     const_blas_data_mapper<Scalar, StorageOrder> lhs(lhs_,lhsStride);
-//     int count = 0;
-//     const int peeled_mc = (rows/mr)*mr;
-//     for(int i=0; i<peeled_mc; i+=mr)
-//     {
-//       for(int k=0; k<depth; k++)
-//         for(int w=0; w<mr; w++)
-//           blockA[count++] = cj(lhs(i+w, k));
-//     }
-//     for(int i=peeled_mc; i<rows; i++)
-//     {
-//       for(int k=0; k<depth; k++)
-//         blockA[count++] = cj(lhs(i, k));
-//     }
-//   }
-// };
-
 /* Optimized triangular matrix * matrix (_TRMM++) product built on top of
  * the general matrix matrix product.
  */
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
index 3d612b04d47..043011e23eb 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -318,6 +318,10 @@ EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
 EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
 EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
 #endif
+
+#undef EIGEN_BLAS_TRMM_SPECIALIZE
+#undef EIGEN_BLAS_TRMM_L
+#undef EIGEN_BLAS_TRMM_R
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index bef4cbaf88a..41395f951e1 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -43,43 +43,102 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
   Index rows = IsLower ? _rows : (std::min)(_rows, _cols);
   Index cols = IsLower ? (std::min)(_rows, _cols) : _cols;
 
-  typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > LhsMap;
-  const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
-  typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
-
-  typedef Map<const Matrix<RhsScalar, Dynamic, 1>, 0, InnerStride<> > RhsMap;
-  const RhsMap rhs(rhs_, cols, InnerStride<>(rhsIncr));
-  typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
-
-  typedef Map<Matrix<ResScalar, Dynamic, 1> > ResMap;
-  ResMap res(res_, rows);
-
   typedef const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
   typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
 
+  conj_if<ConjLhs> cjl;
+  conj_if<ConjRhs> cjr;
+
   for (Index pi = 0; pi < size; pi += PanelWidth) {
     Index actualPanelWidth = (std::min)(PanelWidth, size - pi);
-    for (Index k = 0; k < actualPanelWidth; ++k) {
-      Index i = pi + k;
-      Index s = IsLower ? ((HasUnitDiag || HasZeroDiag) ? i + 1 : i) : pi;
-      Index r = IsLower ? actualPanelWidth - k : k + 1;
-      if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
-        res.segment(s, r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s, r);
-      if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
+
+    // Process the triangular panel using raw pointer operations with 2-column batching
+    // to eliminate expression template overhead and share result loads/stores.
+    if (IsLower) {
+      Index k = 0;
+      for (; k + 1 < actualPanelWidth; k += 2) {
+        Index i0 = pi + k;
+        Index i1 = i0 + 1;
+        ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
+        ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
+        const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
+
+        // Diagonal of column 0
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i0] += s0 * cjl(c0[i0]);
+        // Row i1: contribution from column 0 + diagonal of column 1
+        {
+          ResScalar r1 = s0 * cjl(c0[i1]);
+          if (!(HasUnitDiag || HasZeroDiag)) r1 += s1 * cjl(c1[i1]);
+          res_[i1] += r1;
+        }
+        // Shared rows where both columns contribute
+        Index panelEnd = pi + actualPanelWidth;
+        for (Index j = i1 + 1; j < panelEnd; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
+
+        if (HasUnitDiag) {
+          res_[i0] += s0;
+          res_[i1] += s1;
+        }
+      }
+      if (k < actualPanelWidth) {
+        Index i = pi + k;
+        ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
+        if (HasUnitDiag) res_[i] += s;
+      }
+    } else {
+      // Upper triangular: process 2 columns at a time
+      Index k = 0;
+      for (; k + 1 < actualPanelWidth; k += 2) {
+        Index i0 = pi + k;
+        Index i1 = i0 + 1;
+        ResScalar s0 = alpha * cjr(rhs_[i0 * rhsIncr]);
+        ResScalar s1 = alpha * cjr(rhs_[i1 * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c0 = lhs_ + i0 * lhsStride;
+        const LhsScalar* EIGEN_RESTRICT c1 = lhs_ + i1 * lhsStride;
+
+        // Shared rows before the diagonal block
+        for (Index j = pi; j < i0; ++j) res_[j] += s0 * cjl(c0[j]) + s1 * cjl(c1[j]);
+
+        // Row i0: diagonal of col0 + contribution from col1
+        {
+          ResScalar r0 = s1 * cjl(c1[i0]);
+          if (!(HasUnitDiag || HasZeroDiag)) r0 += s0 * cjl(c0[i0]);
+          res_[i0] += r0;
+        }
+        // Diagonal of column 1
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i1] += s1 * cjl(c1[i1]);
+
+        if (HasUnitDiag) {
+          res_[i0] += s0;
+          res_[i1] += s1;
+        }
+      }
+      if (k < actualPanelWidth) {
+        Index i = pi + k;
+        ResScalar s = alpha * cjr(rhs_[i * rhsIncr]);
+        const LhsScalar* EIGEN_RESTRICT c = lhs_ + i * lhsStride;
+        for (Index j = pi; j < i; ++j) res_[j] += s * cjl(c[j]);
+        if (!(HasUnitDiag || HasZeroDiag)) res_[i] += s * cjl(c[i]);
+        if (HasUnitDiag) res_[i] += s;
+      }
     }
+
+    // Rectangular part: delegate to optimized GEMV
     Index r = IsLower ? rows - pi - actualPanelWidth : pi;
     if (r > 0) {
       Index s = IsLower ? pi + actualPanelWidth : 0;
       general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
-                                    BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs.coeffRef(s, pi), lhsStride),
-                                                  RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr,
-                                                  alpha);
+                                    BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
+                                                  RhsMapper(&rhs_[pi * rhsIncr], rhsIncr), &res_[s], resIncr, alpha);
     }
   }
   if ((!IsLower) && cols > size) {
     general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
-        rows, cols - size, LhsMapper(&lhs.coeffRef(0, size), lhsStride), RhsMapper(&rhs.coeffRef(size), rhsIncr), res_,
-        resIncr, alpha);
+        rows, cols - size, LhsMapper(&lhs_[size * lhsStride], lhsStride), RhsMapper(&rhs_[size * rhsIncr], rhsIncr),
+        res_, resIncr, alpha);
   }
 }
 
@@ -105,43 +164,48 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar,
   Index rows = IsLower ? _rows : diagSize;
   Index cols = IsLower ? diagSize : _cols;
 
-  typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, RowMajor>, 0, OuterStride<> > LhsMap;
-  const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
-  typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
-
-  typedef Map<const Matrix<RhsScalar, Dynamic, 1> > RhsMap;
-  const RhsMap rhs(rhs_, cols);
-  typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
-
-  typedef Map<Matrix<ResScalar, Dynamic, 1>, 0, InnerStride<> > ResMap;
-  ResMap res(res_, rows, InnerStride<>(resIncr));
-
   typedef const_blas_data_mapper<LhsScalar, Index, RowMajor> LhsMapper;
   typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
 
+  conj_if<ConjLhs> cjl;
+  conj_if<ConjRhs> cjr;
+
   for (Index pi = 0; pi < diagSize; pi += PanelWidth) {
     Index actualPanelWidth = (std::min)(PanelWidth, diagSize - pi);
+
+    // Process the triangular panel using raw dot products to eliminate
+    // the cwiseProduct().sum() expression template overhead.
     for (Index k = 0; k < actualPanelWidth; ++k) {
       Index i = pi + k;
-      Index s = IsLower ? pi : ((HasUnitDiag || HasZeroDiag) ? i + 1 : i);
-      Index r = IsLower ? k + 1 : actualPanelWidth - k;
-      if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
-        res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s, r).cwiseProduct(cjRhs.segment(s, r).transpose())).sum();
-      if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
+      const LhsScalar* EIGEN_RESTRICT row_i = lhs_ + i * lhsStride;
+      ResScalar dot = ResScalar(0);
+
+      if (IsLower) {
+        Index s = pi;
+        Index len = (HasUnitDiag || HasZeroDiag) ? k : k + 1;
+        for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
+      } else {
+        Index s = (HasUnitDiag || HasZeroDiag) ? i + 1 : i;
+        Index len = pi + actualPanelWidth - s;
+        for (Index j = 0; j < len; ++j) dot += cjl(row_i[s + j]) * cjr(rhs_[s + j]);
+      }
+      res_[i * resIncr] += alpha * dot;
+      if (HasUnitDiag) res_[i * resIncr] += alpha * cjr(rhs_[i]);
     }
+
+    // Rectangular part: delegate to optimized GEMV
     Index r = IsLower ? pi : cols - pi - actualPanelWidth;
     if (r > 0) {
       Index s = IsLower ? 0 : pi + actualPanelWidth;
       general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
-                                    BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs.coeffRef(pi, s), lhsStride),
-                                                  RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr,
-                                                  alpha);
+                                    BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs_[pi * lhsStride + s], lhsStride),
+                                                  RhsMapper(&rhs_[s], rhsIncr), &res_[pi * resIncr], resIncr, alpha);
     }
   }
   if (IsLower && rows > diagSize) {
     general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
-        rows - diagSize, cols, LhsMapper(&lhs.coeffRef(diagSize, 0), lhsStride), RhsMapper(&rhs.coeffRef(0), rhsIncr),
-        &res.coeffRef(diagSize), resIncr, alpha);
+        rows - diagSize, cols, LhsMapper(&lhs_[diagSize * lhsStride], lhsStride), RhsMapper(rhs_, rhsIncr),
+        &res_[diagSize * resIncr], resIncr, alpha);
   }
 }
 
@@ -212,7 +276,7 @@ struct trmv_selector<Mode, ColMajor> {
     ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
 
     // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
-    // on, the other hand it is good for the cache to pack the vector anyways...
+    // On the other hand, it is good for the cache to pack the vector anyways...
     constexpr bool EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime == 1;
     constexpr bool ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex);
     constexpr bool MightCannotUseDest = (Dest::InnerStrideAtCompileTime != 1) || ComplexByReal;
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
index 1de68803b0a..19ed81388aa 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -268,6 +268,9 @@ EIGEN_BLAS_TRMV_RM(float, float, f, s, _)
 EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c, _)
 #endif
 
+#undef EIGEN_BLAS_TRMV_RM
+#undef EIGEN_BLAS_TRMV_SPECIALIZE
+#undef EIGEN_BLAS_TRMV_CM
 }  // namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 8244758bd30..bee971d5051 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -52,7 +52,7 @@ EIGEN_STRONG_INLINE void trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageO
 
   // tr solve
   for (Index k = 0; k < size; ++k) {
-    // TODO write a small kernel handling this (can be shared with trsv)
+    // TODO: write a small kernel handling this (can be shared with trsv)
     Index i = IsLower ? k : -k - 1;
     Index rs = size - k - 1;  // remaining size
     Index s = TriStorageOrder == RowMajor ? (IsLower ? 0 : i + 1) : IsLower ? i + 1 : i - rs;
@@ -97,14 +97,72 @@ EIGEN_STRONG_INLINE void trsmKernelR<Scalar, Index, Mode, Conjugate, TriStorageO
     Index j = IsLower ? size - k - 1 : k;
 
     typename LhsMapper::LinearMapper r = lhs.getLinearMapper(0, j);
-    for (Index k3 = 0; k3 < k; ++k3) {
-      Scalar b = conj(rhs(IsLower ? j + 1 + k3 : k3, j));
-      typename LhsMapper::LinearMapper a = lhs.getLinearMapper(0, IsLower ? j + 1 + k3 : k3);
-      for (Index i = 0; i < otherSize; ++i) r(i) -= a(i) * b;
+    EIGEN_IF_CONSTEXPR(OtherInnerStride == 1 && packet_traits<Scalar>::Vectorizable) {
+      using Packet = typename packet_traits<Scalar>::type;
+      constexpr Index PS = unpacket_traits<Packet>::size;
+      // Unrolled k3 loop by 4 to reduce r load/store traffic.
+      Index k3 = 0;
+      for (; k3 + 3 < k; k3 += 4) {
+        Index col0 = IsLower ? j + 1 + k3 : k3;
+        Scalar b0 = conj(rhs(col0, j));
+        Scalar b1 = conj(rhs(col0 + 1, j));
+        Scalar b2 = conj(rhs(col0 + 2, j));
+        Scalar b3 = conj(rhs(col0 + 3, j));
+        Packet neg_pb0 = pset1<Packet>(-b0);
+        Packet neg_pb1 = pset1<Packet>(-b1);
+        Packet neg_pb2 = pset1<Packet>(-b2);
+        Packet neg_pb3 = pset1<Packet>(-b3);
+        typename LhsMapper::LinearMapper a0 = lhs.getLinearMapper(0, col0);
+        typename LhsMapper::LinearMapper a1 = lhs.getLinearMapper(0, col0 + 1);
+        typename LhsMapper::LinearMapper a2 = lhs.getLinearMapper(0, col0 + 2);
+        typename LhsMapper::LinearMapper a3 = lhs.getLinearMapper(0, col0 + 3);
+        Index i = 0;
+        for (; i + PS <= otherSize; i += PS) {
+          Packet pr = r.template loadPacket<Packet>(i);
+          pr = pmadd(a0.template loadPacket<Packet>(i), neg_pb0, pr);
+          pr = pmadd(a1.template loadPacket<Packet>(i), neg_pb1, pr);
+          pr = pmadd(a2.template loadPacket<Packet>(i), neg_pb2, pr);
+          pr = pmadd(a3.template loadPacket<Packet>(i), neg_pb3, pr);
+          r.template storePacket<Packet>(i, pr);
+        }
+        for (; i < otherSize; ++i) {
+          r(i) -= a0(i) * b0 + a1(i) * b1 + a2(i) * b2 + a3(i) * b3;
+        }
+      }
+      // Handle remaining k3 iterations with vectorized inner loop.
+      for (; k3 < k; ++k3) {
+        Scalar b = conj(rhs(IsLower ? j + 1 + k3 : k3, j));
+        typename LhsMapper::LinearMapper a = lhs.getLinearMapper(0, IsLower ? j + 1 + k3 : k3);
+        Packet neg_pb = pset1<Packet>(-b);
+        Index i = 0;
+        for (; i + PS <= otherSize; i += PS) {
+          Packet pr = r.template loadPacket<Packet>(i);
+          pr = pmadd(a.template loadPacket<Packet>(i), neg_pb, pr);
+          r.template storePacket<Packet>(i, pr);
+        }
+        for (; i < otherSize; ++i) r(i) -= a(i) * b;
+      }
+      // Vectorized diagonal scaling.
+      EIGEN_IF_CONSTEXPR((Mode & UnitDiag) == 0) {
+        Scalar inv_rjj = RealScalar(1) / conj(rhs(j, j));
+        Packet pinv = pset1<Packet>(inv_rjj);
+        Index i = 0;
+        for (; i + PS <= otherSize; i += PS) {
+          r.template storePacket<Packet>(i, pmul(r.template loadPacket<Packet>(i), pinv));
+        }
+        for (; i < otherSize; ++i) r(i) *= inv_rjj;
+      }
     }
-    if ((Mode & UnitDiag) == 0) {
-      Scalar inv_rjj = RealScalar(1) / conj(rhs(j, j));
-      for (Index i = 0; i < otherSize; ++i) r(i) *= inv_rjj;
+    else {
+      for (Index k3 = 0; k3 < k; ++k3) {
+        Scalar b = conj(rhs(IsLower ? j + 1 + k3 : k3, j));
+        typename LhsMapper::LinearMapper a = lhs.getLinearMapper(0, IsLower ? j + 1 + k3 : k3);
+        for (Index i = 0; i < otherSize; ++i) r(i) -= a(i) * b;
+      }
+      EIGEN_IF_CONSTEXPR((Mode & UnitDiag) == 0) {
+        Scalar inv_rjj = RealScalar(1) / conj(rhs(j, j));
+        for (Index i = 0; i < otherSize; ++i) r(i) *= inv_rjj;
+      }
     }
   }
 }
@@ -141,7 +199,8 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar, Index, OnTheLeft, Mode, C
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
 
-#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_L_KERNELS && EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS
+#if defined(EIGEN_VECTORIZE_AVX512) && defined(EIGEN_USE_AVX512_TRSM_L_KERNELS) && EIGEN_USE_AVX512_TRSM_L_KERNELS && \
+    EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS
   EIGEN_IF_CONSTEXPR(
       (OtherInnerStride == 1 && (std::is_same<Scalar, float>::value || std::is_same<Scalar, double>::value))) {
     // Very rough cutoffs to determine when to call trsm w/o packing
@@ -209,7 +268,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar, Index, OnTheLeft, Mode, C
         // tr solve
         {
           Index i = IsLower ? k2 + k1 : k2 - k1;
-#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_L_KERNELS
+#if defined(EIGEN_VECTORIZE_AVX512) && defined(EIGEN_USE_AVX512_TRSM_L_KERNELS) && EIGEN_USE_AVX512_TRSM_L_KERNELS
           EIGEN_IF_CONSTEXPR(
               (OtherInnerStride == 1 && (std::is_same<Scalar, float>::value || std::is_same<Scalar, double>::value))) {
             i = IsLower ? k2 + k1 : k2 - k1 - actualPanelWidth;
@@ -273,7 +332,8 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar, Index, OnTheRight, Mode,
                                                                       level3_blocking<Scalar, Scalar>& blocking) {
   Index rows = otherSize;
 
-#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_R_KERNELS && EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS
+#if defined(EIGEN_VECTORIZE_AVX512) && defined(EIGEN_USE_AVX512_TRSM_R_KERNELS) && EIGEN_USE_AVX512_TRSM_R_KERNELS && \
+    EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS
   EIGEN_IF_CONSTEXPR(
       (OtherInnerStride == 1 && (std::is_same<Scalar, float>::value || std::is_same<Scalar, double>::value))) {
     // TODO: Investigate better heuristics for cutoffs.
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
index 9cc15fbd3b5..20c8f207ddf 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -159,6 +159,8 @@ EIGEN_BLAS_TRSM_R(float, float, strsm_)
 EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
 #endif
 
+#undef EIGEN_BLAS_TRSM_R
+#undef EIGEN_BLAS_TRSM_L
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/util/AOCL_Support.h b/Eigen/src/Core/util/AOCL_Support.h
new file mode 100644
index 00000000000..c628268d7a0
--- /dev/null
+++ b/Eigen/src/Core/util/AOCL_Support.h
@@ -0,0 +1,174 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * AOCL_Support.h - AMD Optimizing CPU Libraries Integration Header for Eigen
+ *
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Description:
+ * ------------
+ * This header file serves as the central configuration and integration point
+ * for AMD Optimizing CPU Libraries (AOCL) with the Eigen C++ template library.
+ * It orchestrates the integration of multiple AOCL components to provide
+ * optimal mathematical computing performance on AMD Zen family processors.
+ *
+ * AOCL Component Integration:
+ * ---------------------------
+ * 1. AOCL Vector Math Library (VML):
+ *    - Provides VRDA (Vector Rapid Double-precision Arithmetic) functions
+ *    - Optimized transcendental functions: exp, sin, cos, sqrt, log, pow, etc.
+ *    - SIMD vectorization for AMD architectures (AVX2, AVX-512)
+ *    - Headers: amdlibm.h, amdlibm_vec.h
+ *
+ * 2. AOCL BLAS (BLIS - BLAS-like Library Instantiation Software):
+ *    - High-performance Basic Linear Algebra Subprograms
+ *    - Supports single-threaded (libblis) and multithreaded (libblis-mt)
+ *      variants
+ *    - Optimized matrix operations: GEMM, GEMV, TRSM, etc.
+ *    - Headers: cblas.h, blis.h
+ *
+ * 3. AOCL LAPACK (libFLAME - Formal Linear Algebra Methods Environment):
+ *    - Dense linear algebra operations: factorizations, eigenvalue solvers
+ *    - Matrix decompositions: LU, Cholesky, QR, SVD
+ *    - Eigenvalue/eigenvector computations optimized for AMD hardware
+ *    - Headers: LAPACKE interface
+ *
+ * Configuration Macros:
+ * ---------------------
+ * EIGEN_AOCL_VML_THRESHOLD (default: 128):
+ *   - Minimum vector size for AOCL VML dispatch
+ *   - Smaller vectors use standard Eigen to avoid function call overhead
+ *   - Optimal values: 64-512 depending on operation and data characteristics
+ *
+ *
+ * Architecture Support:
+ * ---------------------
+ * Optimized for AMD processor families:
+ * - Zen Architecture (Naples, Rome): AVX2 optimization
+ * - Zen 2 Architecture (Rome, Matisse): Enhanced AVX2
+ * - Zen 3 Architecture (Milan, Vermeer): Improved IPC and cache
+ * - Zen 4 Architecture (Genoa, Raphael): AVX-512 support
+ * - Zen 5 Architecture (Turin, Granite Ridge): Enhanced AVX-512
+ *
+ *
+ * Dependencies:
+ * -------------
+ * Required AOCL components:
+ * - libamdlibm: Core math library with VRDA functions
+ * - libblis or libblis-mt: BLAS implementation
+ * - libflame: LAPACK implementation
+ *
+ * System requirements:
+ * - AMD x86_64 processor (optimal performance)
+ * - Linux, Windows, or compatible POSIX system
+ * - C++14 or later standard
+ * - CMake 3.5+ for build system integration
+ *
+ * Developer:
+ * ----------
+ * Name: Sharad Saurabh Bhaskar
+ * Email: shbhaska@amd.com
+ * Organization: Advanced Micro Devices, Inc.
+ */
+
+#ifndef EIGEN_AOCL_SUPPORT_H
+#define EIGEN_AOCL_SUPPORT_H
+
+#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT)
+
+#include <complex>
+
+// Define AOCL component flags based on main flags
+#ifdef EIGEN_USE_AOCL_ALL
+#define EIGEN_USE_AOCL_VML   // Enable AOCL Vector Math Library
+#define EIGEN_USE_AOCL_BLAS  // Enable AOCL BLAS (BLIS)
+
+// Enable Eigen BLAS backend only if BLIS provides compatible interface
+#if defined(EIGEN_AOCL_BLIS_COMPATIBLE)
+#define EIGEN_USE_BLAS  // Enable Eigen BLAS backend
+#endif
+
+#define EIGEN_USE_LAPACKE  // Enable LAPACK backend (FLAME)
+#endif
+
+#ifdef EIGEN_USE_AOCL_MT
+#define EIGEN_USE_AOCL_VML   // Enable AOCL Vector Math Library
+#define EIGEN_USE_AOCL_BLAS  // Enable AOCL BLAS (BLIS)
+
+// For multithreaded: disable EIGEN_USE_BLAS to avoid signature conflicts
+// Use direct BLIS calls instead through EIGEN_USE_AOCL_BLAS
+// #define EIGEN_USE_BLAS       // Commented out - causes conflicts with the
+//                              // BLIS interface
+
+// NOTE(review): LAPACKE is ENABLED in MT mode by the define below; the original note claimed it was disabled. Confirm.
+#define EIGEN_USE_LAPACKE         // Enable LAPACK backend (FLAME); see review note above
+#define EIGEN_AOCL_USE_BLIS_MT 1  // Enable multithreaded BLIS
+#endif
+
+// Handle standalone EIGEN_USE_AOCL_VML flag
+#ifndef EIGEN_USE_AOCL_VML
+#ifdef EIGEN_USE_AOCL_ALL
+#define EIGEN_USE_AOCL_VML
+#endif
+#ifdef EIGEN_USE_AOCL_MT
+#define EIGEN_USE_AOCL_VML
+#endif
+#endif
+
+// Configuration constants - define these for any AOCL usage
+#ifndef EIGEN_AOCL_VML_THRESHOLD
+#define EIGEN_AOCL_VML_THRESHOLD 128  // Threshold for VML dispatch
+#endif
+
+#ifndef AOCL_SIMD_WIDTH
+#define AOCL_SIMD_WIDTH 8  // AVX-512: 512 bits / 64 bits per double
+#endif
+
+// Include AOCL Math Library headers for VML
+#if defined(EIGEN_USE_AOCL_VML) || defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT)
+#if defined(__has_include)
+#if __has_include("amdlibm.h")
+#include "amdlibm.h"
+#ifndef AMD_LIBM_VEC_EXPERIMENTAL
+#define AMD_LIBM_VEC_EXPERIMENTAL
+#endif
+#if __has_include("amdlibm_vec.h")
+#include "amdlibm_vec.h"
+#endif
+#endif
+#else
+// Fallback for compilers without __has_include
+#include "amdlibm.h"
+#ifndef AMD_LIBM_VEC_EXPERIMENTAL
+#define AMD_LIBM_VEC_EXPERIMENTAL
+#endif
+#include "amdlibm_vec.h"
+#endif
+#endif
+
+// Include CBLAS headers when BLAS is enabled
+#ifdef EIGEN_USE_AOCL_BLAS
+#if defined(__has_include)
+#if __has_include("cblas.h")
+#include "cblas.h"
+#elif __has_include("blis.h")
+#include "blis.h"
+#endif
+#else
+// Fallback
+#include "cblas.h"
+#endif
+#endif
+
+namespace Eigen {
+// AOCL-specific type definitions
+typedef std::complex<double> dcomplex;
+typedef std::complex<float> scomplex;
+typedef int BlasIndex;  // Standard BLAS index type
+}  // namespace Eigen
+
+#endif  // EIGEN_USE_AOCL_ALL || EIGEN_USE_AOCL_MT
+
+#endif  // EIGEN_AOCL_SUPPORT_H
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 19d9917d788..2e1ca91b2d7 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -43,12 +43,12 @@ struct general_matrix_vector_product;
 
 template <typename From, typename To>
 struct get_factor {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };
 
 template <typename Scalar>
 struct get_factor<Scalar, typename NumTraits<Scalar>::Real> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) {
     return numext::real(x);
   }
 };
@@ -56,9 +56,9 @@ struct get_factor<Scalar, typename NumTraits<Scalar>::Real> {
 template <typename Scalar, typename Index>
 class BlasVectorMapper {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar* data) : m_data(data) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar* data) : m_data(data) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { return m_data[i]; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { return m_data[i]; }
   template <typename Packet, int AlignmentType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const {
     return ploadt<Packet, AlignmentType>(m_data + i);
@@ -79,14 +79,14 @@ class BlasLinearMapper;
 template <typename Scalar, typename Index, int AlignmentType>
 class BlasLinearMapper<Scalar, Index, AlignmentType> {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar* data, Index incr = 1) : m_data(data) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar* data, Index incr = 1) : m_data(data) {
     EIGEN_ONLY_USED_FOR_DEBUG(incr);
     eigen_assert(incr == 1);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i) const { internal::prefetch(&operator()(i)); }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i]; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i]; }
 
   template <typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
@@ -178,27 +178,27 @@ class blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, 1> {
   typedef blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> SubMapper;
   typedef BlasVectorMapper<Scalar, Index> VectorMapper;
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr = 1)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr = 1)
       : m_data(data), m_stride(stride) {
     EIGEN_ONLY_USED_FOR_DEBUG(incr);
     eigen_assert(incr == 1);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
     return SubMapper(&operator()(i, j), m_stride);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
     return LinearMapper(&operator()(i, j));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
     return VectorMapper(&operator()(i, j));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i, Index j) const { internal::prefetch(&operator()(i, j)); }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
     return m_data[StorageOrder == RowMajor ? j + i * m_stride : i + j * m_stride];
   }
 
@@ -239,8 +239,8 @@ class blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, 1> {
     return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
   }
 
-  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
-  EIGEN_DEVICE_FUNC const Index incr() const { return 1; }
+  EIGEN_DEVICE_FUNC constexpr const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC constexpr const Index incr() const { return 1; }
   EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_data; }
 
   EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
@@ -268,11 +268,14 @@ class blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, 1> {
 template <typename Scalar, typename Index, int AlignmentType, int Incr>
 class BlasLinearMapper {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar* data, Index incr) : m_data(data), m_incr(incr) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar* data, Index incr)
+      : m_data(data), m_incr(incr) {}
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i * m_incr.value()]; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+    return m_data[i * m_incr.value()];
+  }
 
   template <typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
@@ -306,20 +309,20 @@ class blas_data_mapper {
   typedef BlasLinearMapper<Scalar, Index, AlignmentType, Incr> LinearMapper;
   typedef blas_data_mapper SubMapper;
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr)
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr)
       : m_data(data), m_stride(stride), m_incr(incr) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
     return SubMapper(&operator()(i, j), m_stride, m_incr.value());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
     return LinearMapper(&operator()(i, j), m_incr.value());
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i, Index j) const { internal::prefetch(&operator()(i, j)); }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
     return m_data[StorageOrder == RowMajor ? j * m_incr.value() + i * m_stride : i * m_incr.value() + j * m_stride];
   }
 
@@ -428,8 +431,8 @@ class blas_data_mapper {
     spb.store(this, i, j, block);
   }
 
-  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
-  EIGEN_DEVICE_FUNC const Index incr() const { return m_incr.value(); }
+  EIGEN_DEVICE_FUNC constexpr const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC constexpr const Index incr() const { return m_incr.value(); }
   EIGEN_DEVICE_FUNC constexpr Scalar* data() const { return m_data; }
 
  protected:
@@ -567,18 +570,18 @@ struct blas_traits<const T> : blas_traits<T> {};
 
 template <typename T, bool HasUsableDirectAccess = blas_traits<T>::HasUsableDirectAccess>
 struct extract_data_selector {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) {
     return blas_traits<T>::extract(m).data();
   }
 };
 
 template <typename T>
 struct extract_data_selector<T, false> {
-  EIGEN_DEVICE_FUNC static typename T::Scalar* run(const T&) { return 0; }
+  EIGEN_DEVICE_FUNC constexpr static typename T::Scalar* run(const T&) { return 0; }
 };
 
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) {
   return extract_data_selector<T>::run(m);
 }
 
@@ -588,30 +591,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(con
  */
 template <typename ResScalar, typename Lhs, typename Rhs>
 struct combine_scalar_factors_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs) {
     return blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
   }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs,
+                                                                       const Rhs& rhs) {
     return alpha * blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
   }
 };
 template <typename Lhs, typename Rhs>
 struct combine_scalar_factors_impl<bool, Lhs, Rhs> {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs) {
     return blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
   }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs) {
     return alpha && blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
   }
 };
 
 template <typename ResScalar, typename Lhs, typename Rhs>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs,
-                                                                       const Rhs& rhs) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs,
+                                                                                 const Rhs& rhs) {
   return combine_scalar_factors_impl<ResScalar, Lhs, Rhs>::run(alpha, lhs, rhs);
 }
 template <typename ResScalar, typename Lhs, typename Rhs>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs) {
   return combine_scalar_factors_impl<ResScalar, Lhs, Rhs>::run(lhs, rhs);
 }
 
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 49f307c734e..4f31ae544f0 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -11,6 +11,18 @@
 #ifndef EIGEN_CONFIGURE_VECTORIZATION_H
 #define EIGEN_CONFIGURE_VECTORIZATION_H
 
+// Prepare for using the generic clang backend if requested.
+#if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE) && !defined(EIGEN_DONT_ALIGN)
+#if !EIGEN_ARCH_VECTOR_EXTENSIONS
+#error "The compiler does not support clang vector extensions."
+#endif
+#define EIGEN_VECTORIZE
+#ifndef EIGEN_GENERIC_VECTOR_SIZE_BYTES
+#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 64
+#endif
+#define EIGEN_MAX_ALIGN_BYTES EIGEN_GENERIC_VECTOR_SIZE_BYTES
+#endif
+
 //------------------------------------------------------------------------------------------
 // Static and dynamic alignment control
 //
@@ -60,6 +72,9 @@
 #else
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
 #endif
+#elif defined(EIGEN_VECTORIZE_GENERIC)
+// Generic clang backend overrides native SIMD; align to the generic vector size.
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES EIGEN_GENERIC_VECTOR_SIZE_BYTES
 #elif defined(__AVX512F__)
 // 64 bytes static alignment is preferred only if really required
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
@@ -68,6 +83,8 @@
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
 #elif defined __HVX__ && (__HVX_LENGTH__ == 128)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 128
+#elif defined(EIGEN_RISCV64_USE_RVV10)
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
 #else
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
 #endif
@@ -104,7 +121,7 @@
 // Only static alignment is really problematic (relies on nonstandard compiler extensions),
 // try to keep heap alignment even when we have to disable static alignment.
 #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \
-                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64)
+                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64 || EIGEN_ARCH_RISCV)
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #else
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
@@ -200,7 +217,7 @@
 #endif
 #endif
 
-#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
+#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC) || defined(EIGEN_VECTORIZE_GENERIC))
 
 #if defined(EIGEN_SSE2_ON_NON_MSVC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
 
@@ -228,7 +245,7 @@
 #define EIGEN_VECTORIZE_SSE4_2
 #endif
 #ifdef __AVX__
-#ifndef EIGEN_USE_SYCL
+#if !defined(EIGEN_USE_SYCL) && !EIGEN_COMP_EMSCRIPTEN
 #define EIGEN_VECTORIZE_AVX
 #endif
 #define EIGEN_VECTORIZE_SSE3
@@ -343,7 +360,7 @@
 // notice that since these are C headers, the extern "C" is theoretically needed anyways.
 extern "C" {
 // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
-// Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
+// Doing so triggers some issues with ICC. However old gcc versions may not have this file, thus:
 #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
 #include <immintrin.h>
 #else
@@ -374,7 +391,7 @@ extern "C" {
 #define EIGEN_VECTORIZE_VSX 1
 #define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
-// We need to #undef all these ugly tokens defined in <altivec.h>
+// We need to #undef macros defined by <altivec.h> that conflict with standard C++ names.
 // => use __vector instead of vector
 #undef bool
 #undef vector
@@ -386,7 +403,7 @@ extern "C" {
 #define EIGEN_VECTORIZE_ALTIVEC
 #define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
-// We need to #undef all these ugly tokens defined in <altivec.h>
+// We need to #undef macros defined by <altivec.h> that conflict with standard C++ names.
 // => use __vector instead of vector
 #undef bool
 #undef vector
@@ -406,14 +423,59 @@ extern "C" {
 #define EIGEN_VECTORIZE_SVE
 #include <arm_sve.h>
 
-// Since we depend on knowing SVE vector lengths at compile-time, we need
-// to ensure a fixed lengths is set
+// Since we depend on knowing SVE vector length at compile-time, we need
+// to ensure a fixed length is set
 #if defined __ARM_FEATURE_SVE_BITS
 #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
 #else
 #error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
 #endif
 
+#elif EIGEN_ARCH_RISCV
+
+#if defined(__riscv_zfh)
+#define EIGEN_HAS_BUILTIN_FLOAT16
+#endif
+
+// We currently require RVV to be enabled explicitly via EIGEN_RISCV64_USE_RVV10 and
+// will not select the backend automatically.
+#if (defined EIGEN_RISCV64_USE_RVV10)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_RVV10
+#include <riscv_vector.h>
+
+// Since we depend on knowing RVV vector length at compile-time, we need
+// to ensure a fixed length is set
+#if defined(__riscv_v_fixed_vlen)
+#define EIGEN_RISCV64_RVV_VL __riscv_v_fixed_vlen
+#if __riscv_v_fixed_vlen >= 256
+#undef EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#endif
+#else
+#error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set."
+#endif
+
+#undef EIGEN_STACK_ALLOCATION_LIMIT
+#define EIGEN_STACK_ALLOCATION_LIMIT 196608
+
+#if defined(__riscv_zvfh) && defined(__riscv_zfh)
+#define EIGEN_VECTORIZE_RVV10FP16
+#elif defined(__riscv_zvfh)
+#if defined(__GNUC__) || defined(__clang__)
+#warning "The Eigen::Half vectorization requires Zfh and Zvfh extensions."
+#elif defined(_MSC_VER)
+#pragma message("The Eigen::Half vectorization requires Zfh and Zvfh extensions.")
+#endif
+#endif
+
+#if defined(__riscv_zvfbfwma)
+#define EIGEN_VECTORIZE_RVV10BF16
+#endif
+
+#endif  // defined(EIGEN_RISCV64_USE_RVV10)
+
 #elif (defined __s390x__ && defined __VEC__)
 
 #define EIGEN_VECTORIZE
@@ -479,12 +541,6 @@ extern "C" {
 #if defined EIGEN_CUDACC
 #define EIGEN_VECTORIZE_GPU
 #include <vector_types.h>
-#if EIGEN_CUDA_SDK_VER >= 70500
-#define EIGEN_HAS_CUDA_FP16
-#endif
-#endif
-
-#if defined(EIGEN_HAS_CUDA_FP16)
 #include <cuda_runtime_api.h>
 #include <cuda_fp16.h>
 #endif
@@ -492,19 +548,25 @@ extern "C" {
 #if defined(EIGEN_HIPCC)
 #define EIGEN_VECTORIZE_GPU
 #include <hip/hip_vector_types.h>
-#define EIGEN_HAS_HIP_FP16
 #include <hip/hip_fp16.h>
 #define EIGEN_HAS_HIP_BF16
 #include <hip/hip_bfloat16.h>
 #endif
 
+#if defined(__riscv)
+// Defines the default LMUL for RISC-V
+#ifndef EIGEN_RISCV64_DEFAULT_LMUL
+#define EIGEN_RISCV64_DEFAULT_LMUL 1
+#endif
+#endif
+
 /** \brief Namespace containing all symbols from the %Eigen library. */
 // IWYU pragma: private
 #include "../InternalHeaderCheck.h"
 
 namespace Eigen {
 
-inline static const char *SimdInstructionSetsInUse(void) {
+inline static const char* SimdInstructionSetsInUse(void) {
 #if defined(EIGEN_VECTORIZE_AVX512)
   return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_AVX)
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index fcc2db82266..3a8c40f45ed 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -22,21 +22,21 @@ namespace Eigen {
  *
  * Changing the value of Dynamic breaks the ABI, as Dynamic is often used as a template parameter for Matrix.
  */
-const int Dynamic = -1;
+constexpr int Dynamic = -1;
 
 /** This value means that a signed quantity (e.g., a signed index) is not known at compile-time, and that instead its
  * value has to be specified at runtime.
  */
-const int DynamicIndex = 0xffffff;
+constexpr int DynamicIndex = 0xffffff;
 
 /** This value means that the requested value is not defined.
  */
-const int Undefined = 0xfffffe;
+constexpr int Undefined = 0xfffffe;
 
 /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
  * The value Infinity there means the L-infinity norm.
  */
-const int Infinity = -1;
+constexpr int Infinity = -1;
 
 /** This value means that the cost to evaluate an expression coefficient is either very expensive or
  * cannot be known at compile time.
@@ -45,7 +45,7 @@ const int Infinity = -1;
  * and very very expensive expressions. It thus must also be large enough to make sure unrolling won't happen and that
  * sub expressions will be evaluated, but not too large to avoid overflow.
  */
-const int HugeCost = 10000;
+constexpr int HugeCost = 10000;
 
 /** \defgroup flags Flags
  * \ingroup Core_Module
@@ -67,16 +67,16 @@ const int HugeCost = 10000;
  * For an expression, this determines the storage order of
  * the matrix created by evaluation of that expression.
  * \sa \blank  \ref TopicStorageOrders */
-const unsigned int RowMajorBit = 0x1;
+constexpr unsigned int RowMajorBit = 0x1;
 
 /** \ingroup flags
  * means the expression should be evaluated by the calling expression */
-const unsigned int EvalBeforeNestingBit = 0x2;
+constexpr unsigned int EvalBeforeNestingBit = 0x2;
 
 /** \ingroup flags
  * \deprecated
  * means the expression should be evaluated before any assignment */
-EIGEN_DEPRECATED const unsigned int EvalBeforeAssigningBit = 0x4;  // FIXME deprecated
+EIGEN_DEPRECATED constexpr unsigned int EvalBeforeAssigningBit = 0x4;
 
 /** \ingroup flags
  *
@@ -94,7 +94,7 @@ EIGEN_DEPRECATED const unsigned int EvalBeforeAssigningBit = 0x4;  // FIXME depr
  * \note This bit can be set regardless of whether vectorization is actually enabled.
  *       To check for actual vectorizability, see \a ActualPacketAccessBit.
  */
-const unsigned int PacketAccessBit = 0x8;
+constexpr unsigned int PacketAccessBit = 0x8;
 
 #ifdef EIGEN_VECTORIZE
 /** \ingroup flags
@@ -105,9 +105,9 @@ const unsigned int PacketAccessBit = 0x8;
  * If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
  * is set to the value 0.
  */
-const unsigned int ActualPacketAccessBit = PacketAccessBit;
+constexpr unsigned int ActualPacketAccessBit = PacketAccessBit;
 #else
-const unsigned int ActualPacketAccessBit = 0x0;
+constexpr unsigned int ActualPacketAccessBit = 0x0;
 #endif
 
 /** \ingroup flags
@@ -130,7 +130,7 @@ const unsigned int ActualPacketAccessBit = 0x0;
  * Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
  * not index-based packet access, so they don't have the LinearAccessBit.
  */
-const unsigned int LinearAccessBit = 0x10;
+constexpr unsigned int LinearAccessBit = 0x10;
 
 /** \ingroup flags
  *
@@ -145,7 +145,7 @@ const unsigned int LinearAccessBit = 0x10;
  * Expressions having LvalueBit also have their coeff() method returning a const reference instead of returning a new
  * value.
  */
-const unsigned int LvalueBit = 0x20;
+constexpr unsigned int LvalueBit = 0x20;
 
 /** \ingroup flags
  *
@@ -156,7 +156,7 @@ const unsigned int LvalueBit = 0x20;
  *
  * See the comment on LvalueBit for an explanation of how LvalueBit and DirectAccessBit are mutually orthogonal.
  */
-const unsigned int DirectAccessBit = 0x40;
+constexpr unsigned int DirectAccessBit = 0x40;
 
 /** \deprecated \ingroup flags
  *
@@ -168,9 +168,9 @@ const unsigned int DirectAccessBit = 0x40;
  * expression.packet<Aligned>(0);
  * \endcode
  */
-EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80;
+EIGEN_DEPRECATED constexpr unsigned int AlignedBit = 0x80;
 
-const unsigned int NestByRefBit = 0x100;
+constexpr unsigned int NestByRefBit = 0x100;
 
 /** \ingroup flags
  *
@@ -179,7 +179,7 @@ const unsigned int NestByRefBit = 0x100;
  * The precise choice will be decided at evaluation time or when
  * combined with other expressions.
  * \sa \blank  \ref RowMajorBit, \ref TopicStorageOrders */
-const unsigned int NoPreferredStorageOrderBit = 0x200;
+constexpr unsigned int NoPreferredStorageOrderBit = 0x200;
 
 /** \ingroup flags
   *
@@ -192,10 +192,10 @@ const unsigned int NoPreferredStorageOrderBit = 0x200;
     inline const Index* innerNonZeroPtr() const;
     \endcode
   */
-const unsigned int CompressedAccessBit = 0x400;
+constexpr unsigned int CompressedAccessBit = 0x400;
 
 // list of flags that are inherited by default
-const unsigned int HereditaryBits = RowMajorBit | EvalBeforeNestingBit;
+constexpr unsigned int HereditaryBits = RowMajorBit | EvalBeforeNestingBit;
 
 /** \defgroup enums Enumerations
  * \ingroup Core_Module
@@ -429,6 +429,15 @@ enum QRPreconditioners {
   DisableQRDecomposition = NoQRPreconditioner
 };
 
+// JacobiSVD and BDCSVD combine QR preconditioner flags with decomposition flags in a single template bitmask.
+constexpr int operator|(QRPreconditioners qr_preconditioner, DecompositionOptions decomposition_option) {
+  return static_cast<int>(qr_preconditioner) | static_cast<int>(decomposition_option);
+}
+
+constexpr int operator|(DecompositionOptions decomposition_option, QRPreconditioners qr_preconditioner) {
+  return static_cast<int>(decomposition_option) | static_cast<int>(qr_preconditioner);
+}
+
 #ifdef Success
 #error The preprocessor symbol 'Success' is defined, possibly by the X11 header file X.h
 #endif
@@ -475,6 +484,7 @@ enum Type {
   SVE = 0x6,
   HVX = 0x7,
   LSX = 0x8,
+  RVV10 = 0x9,
 #if defined EIGEN_VECTORIZE_SSE
   Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -491,6 +501,8 @@ enum Type {
   Target = HVX
 #elif defined EIGEN_VECTORIZE_LSX
   Target = LSX
+#elif defined EIGEN_VECTORIZE_RVV10
+  Target = RVV10
 #else
   Target = Generic
 #endif
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index ab0c542d0e2..b2bc18a3de7 100644
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -84,8 +84,7 @@
 #endif
 
 #if defined __NVCC__ && defined __CUDACC__
-// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
-// we instead use Microsoft's __pragma extension.
+// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
 #if defined _MSC_VER
 #define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
 #else
diff --git a/Eigen/src/Core/util/EmulateArray.h b/Eigen/src/Core/util/EmulateArray.h
index 6c4c22d4131..dd9de183c1c 100644
--- a/Eigen/src/Core/util/EmulateArray.h
+++ b/Eigen/src/Core/util/EmulateArray.h
@@ -222,7 +222,7 @@ struct array_size<const array<T, N>&> {
 
 #else
 
-// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
+// Not targeting cuda: use std::array as Eigen::array.
 #include <array>
 
 namespace Eigen {
@@ -231,37 +231,20 @@ template <typename T, std::size_t N>
 using array = std::array<T, N>;
 
 namespace internal {
-/* std::get is only constexpr in C++14, not yet in C++11
- *     - libstdc++ from version 4.7 onwards has it nevertheless,
- *                                          so use that
- *     - libstdc++ older versions: use _M_instance directly
- *     - libc++ all versions so far: use __elems_ directly
- *     - all other libs: use std::get to be portable, but
- *                       this may not be constexpr
- */
-#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
-#define STD_GET_ARR_HACK a._M_instance[I_]
-#elif defined(_LIBCPP_VERSION)
-#define STD_GET_ARR_HACK a.__elems_[I_]
-#else
-#define STD_GET_ARR_HACK std::template get<I_, T, N>(a)
-#endif
 
 template <std::size_t I_, class T, std::size_t N>
 constexpr T& array_get(std::array<T, N>& a) {
-  return (T&)STD_GET_ARR_HACK;
+  return std::get<I_>(a);
 }
 template <std::size_t I_, class T, std::size_t N>
 constexpr T&& array_get(std::array<T, N>&& a) {
-  return (T&&)STD_GET_ARR_HACK;
+  return std::get<I_>(std::move(a));
 }
 template <std::size_t I_, class T, std::size_t N>
 constexpr T const& array_get(std::array<T, N> const& a) {
-  return (T const&)STD_GET_ARR_HACK;
+  return std::get<I_>(a);
 }
 
-#undef STD_GET_ARR_HACK
-
 }  // end namespace internal
 }  // end namespace Eigen
 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 3c0bc461e24..70591398626 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -160,7 +160,7 @@ template <typename Derived>
 class RefBase;
 template <typename PlainObjectType, int Options = 0,
           typename StrideType =
-              typename std::conditional_t<PlainObjectType::IsVectorAtCompileTime, InnerStride<1>, OuterStride<>>>
+              std::conditional_t<PlainObjectType::IsVectorAtCompileTime, InnerStride<1>, OuterStride<>>>
 class Ref;
 template <typename ViewOp, typename MatrixType, typename StrideType = Stride<0, 0>>
 class CwiseUnaryView;
@@ -171,6 +171,8 @@ template <typename MatrixType, unsigned int Mode>
 class TriangularView;
 template <typename MatrixType, unsigned int Mode>
 class SelfAdjointView;
+template <typename Derived>
+class RealView;
 template <typename MatrixType>
 class SparseView;
 template <typename ExpressionType>
@@ -397,14 +399,14 @@ template <typename Scalar_, int Rows_, int Cols_,
                                                                    : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION),
           int MaxRows_ = Rows_, int MaxCols_ = Cols_>
 class Array;
-template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select;
 template <typename MatrixType, typename BinaryOp, int Direction>
 class PartialReduxExpr;
 template <typename ExpressionType, int Direction>
 class VectorwiseOp;
 template <typename MatrixType, int RowFactor, int ColFactor>
 class Replicate;
+template <int Direction, typename LhsType, typename RhsType>
+class Concat;
 template <typename MatrixType, int Direction = BothDirections>
 class Reverse;
 
@@ -517,6 +519,9 @@ struct eigen_zero_impl;
 
 template <typename Packet>
 struct has_packet_segment : std::false_type {};
+
+template <typename T>
+struct complex_array_access;
 }  // namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/util/GpuHipCudaDefines.inc b/Eigen/src/Core/util/GpuHipCudaDefines.inc
index 4e1050053ad..f3d4023c15d 100644
--- a/Eigen/src/Core/util/GpuHipCudaDefines.inc
+++ b/Eigen/src/Core/util/GpuHipCudaDefines.inc
@@ -15,11 +15,11 @@
 // There is code in the Tensorflow codebase that will define EIGEN_USE_GPU,  but
 // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
 // When compiling such files, gcc will end up trying to pick up the CUDA headers by
-// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
+// default (see the code within "unsupported/Eigen/Tensor" that is guarded by EIGEN_USE_GPU)
 // This will obviously not work when trying to compile tensorflow on a system with no CUDA
 // To work around this issue for HIP systems (and leave the default behaviour intact), the
 // HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
-// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
+// "unsupported/Eigen/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
 // defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
 
 #if defined(EIGEN_USE_HIP)
diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
index abf4b19502c..1c9e35d676b 100644
--- a/Eigen/src/Core/util/IndexedViewHelper.h
+++ b/Eigen/src/Core/util/IndexedViewHelper.h
@@ -125,6 +125,15 @@ struct SymbolicExpressionEvaluator<FixedInt<N>, SizeAtCompileTime, void> {
 // Handling of generic indices (e.g. array)
 //--------------------------------------------------------------------------------
 
+// Detect Eigen expression types that are not plain objects (Matrix/Array).
+// These types may hold internal references to temporaries and must be evaluated before storing.
+template <typename T, typename = void>
+struct is_eigen_index_expression : std::false_type {};
+
+template <typename T>
+struct is_eigen_index_expression<T, std::enable_if_t<!std::is_same<T, typename T::PlainObject>::value>>
+    : std::true_type {};
+
 // Potentially wrap indices in a type that is better-suited for IndexedView evaluation.
 template <typename Indices, int NestedSizeAtCompileTime, typename EnableIf = void>
 struct IndexedViewHelperIndicesWrapper {
@@ -132,6 +141,15 @@ struct IndexedViewHelperIndicesWrapper {
   static const type& CreateIndexSequence(const Indices& indices, Index /*nested_size*/) { return indices; }
 };
 
+// Specialization for Eigen expression types (Reshaped, Block, CwiseOp, etc.) used as indices.
+// These may hold dangling references to temporaries if not evaluated.
+template <typename Indices, int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<Indices, NestedSizeAtCompileTime,
+                                       std::enable_if_t<is_eigen_index_expression<Indices>::value>> {
+  using type = typename Indices::PlainObject;
+  static type CreateIndexSequence(const Indices& indices, Index /*nested_size*/) { return indices.eval(); }
+};
+
 // Extract compile-time and runtime first, size, increments.
 template <typename Indices, typename EnableIf = void>
 struct IndexedViewHelper {
@@ -155,16 +173,16 @@ class ArithmeticSequenceRange {
   static constexpr Index SizeAtCompileTime = SizeAtCompileTime_;
   static constexpr Index IncrAtCompileTime = IncrAtCompileTime_;
 
-  constexpr ArithmeticSequenceRange(Index first, Index size, Index incr) : first_{first}, size_{size}, incr_{incr} {}
+  constexpr ArithmeticSequenceRange(Index first, Index size, Index incr) : m_first{first}, m_size{size}, m_incr{incr} {}
   constexpr Index operator[](Index i) const { return first() + i * incr(); }
-  constexpr Index first() const noexcept { return first_.value(); }
-  constexpr Index size() const noexcept { return size_.value(); }
-  constexpr Index incr() const noexcept { return incr_.value(); }
+  constexpr Index first() const noexcept { return m_first.value(); }
+  constexpr Index size() const noexcept { return m_size.value(); }
+  constexpr Index incr() const noexcept { return m_incr.value(); }
 
  private:
-  variable_if_dynamicindex<Index, int(FirstAtCompileTime)> first_;
-  variable_if_dynamic<Index, int(SizeAtCompileTime)> size_;
-  variable_if_dynamicindex<Index, int(IncrAtCompileTime)> incr_;
+  variable_if_dynamicindex<Index, int(FirstAtCompileTime)> m_first;
+  variable_if_dynamic<Index, int(SizeAtCompileTime)> m_size;
+  variable_if_dynamicindex<Index, int(IncrAtCompileTime)> m_incr;
 };
 
 template <typename FirstType, typename SizeType, typename IncrType, int NestedSizeAtCompileTime>
@@ -221,14 +239,14 @@ class SingleRange {
   static constexpr Index SizeAtCompileTime = Index(1);
   static constexpr Index IncrAtCompileTime = Index(1);  // Needs to be 1 to be treated as block-like.
 
-  constexpr SingleRange(Index v) noexcept : value_(v) {}
+  constexpr SingleRange(Index v) noexcept : m_value(v) {}
   constexpr Index operator[](Index) const noexcept { return first(); }
-  constexpr Index first() const noexcept { return value_.value(); }
+  constexpr Index first() const noexcept { return m_value.value(); }
   constexpr Index size() const noexcept { return SizeAtCompileTime; }
   constexpr Index incr() const noexcept { return IncrAtCompileTime; }
 
  private:
-  variable_if_dynamicindex<Index, int(ValueAtCompileTime)> value_;
+  variable_if_dynamicindex<Index, int(ValueAtCompileTime)> m_value;
 };
 
 template <typename T>
@@ -280,14 +298,14 @@ class AllRange {
   static constexpr Index FirstAtCompileTime = Index(0);
   static constexpr Index SizeAtCompileTime = SizeAtCompileTime_;
   static constexpr Index IncrAtCompileTime = Index(1);
-  constexpr AllRange(Index size) : size_(size) {}
+  constexpr AllRange(Index size) : m_size(size) {}
   constexpr Index operator[](Index i) const noexcept { return i; }
   constexpr Index first() const noexcept { return FirstAtCompileTime; }
-  constexpr Index size() const noexcept { return size_.value(); }
+  constexpr Index size() const noexcept { return m_size.value(); }
   constexpr Index incr() const noexcept { return IncrAtCompileTime; }
 
  private:
-  variable_if_dynamic<Index, int(SizeAtCompileTime)> size_;
+  variable_if_dynamic<Index, int(SizeAtCompileTime)> m_size;
 };
 
 template <int NestedSizeAtCompileTime>
@@ -416,9 +434,8 @@ struct VectorIndexedViewSelector<
   using ColMajorReturnType = IndexedView<Derived, IvcType<Indices, Derived::SizeAtCompileTime>, ZeroIndex>;
   using ConstColMajorReturnType = IndexedView<const Derived, IvcType<Indices, Derived::SizeAtCompileTime>, ZeroIndex>;
 
-  using ReturnType = typename internal::conditional<IsRowMajor, RowMajorReturnType, ColMajorReturnType>::type;
-  using ConstReturnType =
-      typename internal::conditional<IsRowMajor, ConstRowMajorReturnType, ConstColMajorReturnType>::type;
+  using ReturnType = std::conditional_t<IsRowMajor, RowMajorReturnType, ColMajorReturnType>;
+  using ConstReturnType = std::conditional_t<IsRowMajor, ConstRowMajorReturnType, ConstColMajorReturnType>;
 
   template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
   static inline RowMajorReturnType run(Derived& derived, const Indices& indices) {
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index 53fabd59511..a298f814f11 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -27,11 +27,11 @@ class VariableAndFixedInt;
  *
  * This class embeds a compile-time integer \c N.
  *
- * It is similar to c++11 std::integral_constant<int,N> but with some additional features
+ * It is similar to std::integral_constant<int,N> but with some additional features
  * such as:
  *  - implicit conversion to int
  *  - arithmetic and some bitwise operators: -, +, *, /, %, &, |
- *  - c++98/14 compatibility with fix<N> and fix<N>() syntax to define integral constants.
+ *  - fix<N> and fix<N>() syntax to define integral constants.
  *
  * It is strongly discouraged to directly deal with this class FixedInt. Instances are expected to
  * be created by the user using Eigen::fix<N> or Eigen::fix<N>().
@@ -144,8 +144,8 @@ template <int N>
 class VariableAndFixedInt {
  public:
   static const int value = N;
-  operator int() const { return m_value; }
-  VariableAndFixedInt(int val) { m_value = val; }
+  constexpr operator int() const { return m_value; }
+  constexpr VariableAndFixedInt(int val) : m_value(val) {}
 
  protected:
   int m_value;
@@ -172,7 +172,7 @@ struct get_fixed_value<variable_if_dynamic<T, N>, Default> {
 };
 
 template <typename T>
-EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) {
+EIGEN_DEVICE_FUNC constexpr Index get_runtime_value(const T &x) {
   return x;
 }
 
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 00d55577d9f..c08489a7dbc 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -17,13 +17,9 @@
 // Eigen version and basic defaults
 //------------------------------------------------------------------------------------------
 
-#define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 4
-#define EIGEN_MINOR_VERSION 90
-
 #define EIGEN_VERSION_AT_LEAST(x, y, z) \
-  (EIGEN_WORLD_VERSION > x ||           \
-   (EIGEN_WORLD_VERSION >= x && (EIGEN_MAJOR_VERSION > y || (EIGEN_MAJOR_VERSION >= y && EIGEN_MINOR_VERSION >= z))))
+  (EIGEN_MAJOR_VERSION > x ||           \
+   (EIGEN_MAJOR_VERSION >= x && (EIGEN_MINOR_VERSION > y || (EIGEN_MINOR_VERSION >= y && EIGEN_PATCH_VERSION >= z))))
 
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
@@ -56,6 +52,26 @@
 #define EIGEN_STACK_ALLOCATION_LIMIT 131072
 #endif
 
+/* Specify whether to use std::fma for scalar multiply-add instructions.
+ *
+ * On machines that have FMA as a single instruction, this will generally
+ * improve precision without significant performance implications.
+ *
+ * Without a single instruction, performance has been found to be reduced 2-3x
+ * on Intel CPUs, and up to 30x for WASM.
+ *
+ * If unspecified, defaults to using FMA if hardware support is available.
+ * The default should be used in most cases to ensure consistency between
+ * vectorized and non-vectorized paths.
+ */
+#ifndef EIGEN_SCALAR_MADD_USE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
+#define EIGEN_SCALAR_MADD_USE_FMA 1
+#else
+#define EIGEN_SCALAR_MADD_USE_FMA 0
+#endif
+#endif
+
 //------------------------------------------------------------------------------------------
 // Compiler identification, EIGEN_COMP_*
 //------------------------------------------------------------------------------------------
@@ -132,13 +148,8 @@
 #endif
 
 #if defined(__NVCC__)
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
 #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_COMP_NVCC __CUDACC_VER__
-#else
-#error "NVCC did not define compiler version."
-#endif
 #else
 #define EIGEN_COMP_NVCC 0
 #endif
@@ -262,10 +273,10 @@
 
 /// \internal EIGEN_COMP_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC,
 /// clang, mingw, etc.)
-#if EIGEN_COMP_GNUC &&                                                                                      \
-    !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_CLANGICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI ||    \
-      EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMP_CLANGFCC || \
-      EIGEN_COMP_CPE || EIGEN_COMP_CLANGCPE || EIGEN_COMP_LCC)
+#if EIGEN_COMP_GNUC &&                                                                                   \
+    !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_CLANGICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || \
+      EIGEN_COMP_NVHPC || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || \
+      EIGEN_COMP_CLANGFCC || EIGEN_COMP_CPE || EIGEN_COMP_CLANGCPE || EIGEN_COMP_LCC)
 #define EIGEN_COMP_GNUC_STRICT 1
 #else
 #define EIGEN_COMP_GNUC_STRICT 0
@@ -404,6 +415,13 @@
 #define EIGEN_ARCH_PPC 0
 #endif
 
+/// \internal EIGEN_ARCH_RISCV set to 1 if the architecture is RISC-V.
+#if defined(__riscv)
+#define EIGEN_ARCH_RISCV 1
+#else
+#define EIGEN_ARCH_RISCV 0
+#endif
+
 //------------------------------------------------------------------------------------------
 // Operating system identification, EIGEN_OS_*
 //------------------------------------------------------------------------------------------
@@ -552,6 +570,10 @@
 #define EIGEN_CUDA_SDK_VER 0
 #endif
 
+#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
+#error "Eigen requires CUDA 11.4 or later."
+#endif
+
 #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
 // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
 #define EIGEN_HIPCC __HIPCC__
@@ -561,29 +583,34 @@
 // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
 #include <hip/hip_runtime.h>
 
+// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
+// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards —
+// all __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
+#error "Eigen requires ROCm/HIP >= 5.6."
+#endif
+
 #if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
 // analogous to EIGEN_CUDA_ARCH, but for HIP
 #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
 #endif
 
-// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
-// value to 1024. The compiler assigns a default value of 256 when the attribute is not
-// specified. This results in failures on the HIP platform, for cases when a GPU kernel
-// without an explicit launch_bounds attribute is called with a threads_per_block value
-// greater than 256.
-//
-// This is a regression in functioanlity and is expected to be fixed within the next
-// couple of ROCm releases (compiler will go back to using 1024 value as the default)
-//
-// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
-// attribute.
+// HIP compilers default to launch_bounds(256), which causes failures when kernels
+// are called with more than 256 threads per block. On CUDA, without explicit
+// launch_bounds the compiler may over-allocate registers per thread, causing
+// cudaErrorLaunchOutOfResources for kernels launched with 1024 threads (e.g. 3D
+// convolution). Set to 1024 for all GPU compilers.
 
 #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
 
 #endif
 
 #if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#if defined(EIGEN_CUDACC)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+#else
 #define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif
 #endif  // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
 
 // Unify CUDA/HIPCC
@@ -684,6 +711,13 @@
 #define EIGEN_HAS_BUILTIN(x) 0
 #endif
 
+// Cross compiler wrapper around LLVM's __has_attribute
+#ifdef __has_attribute
+#define EIGEN_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define EIGEN_HAS_ATTRIBUTE(x) 0
+#endif
+
 // A Clang feature extension to determine compiler features.
 // We use it to determine 'cxx_rvalue_references'
 #ifndef __has_feature
@@ -695,12 +729,12 @@
 //
 // With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will
 // report 199711L regardless of the language standard specified via /std.
-// We need to rely on _MSVC_LANG instead, which is only available after
-// VS2015.3.
+// We need to rely on _MSVC_LANG instead where available. Older MSVC versions
+// supported by Eigen do not define _MSVC_LANG, so use Eigen's minimum standard.
 #if EIGEN_COMP_MSVC_LANG > 0
 #define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG
 #elif EIGEN_COMP_MSVC >= 1900
-#define EIGEN_CPLUSPLUS 201103L
+#define EIGEN_CPLUSPLUS 201402L
 #elif defined(__cplusplus)
 #define EIGEN_CPLUSPLUS __cplusplus
 #else
@@ -716,10 +750,8 @@
 #define EIGEN_COMP_CXXVER 17
 #elif EIGEN_CPLUSPLUS >= 201402L
 #define EIGEN_COMP_CXXVER 14
-#elif EIGEN_CPLUSPLUS >= 201103L
-#define EIGEN_COMP_CXXVER 11
 #else
-#define EIGEN_COMP_CXXVER 03
+#define EIGEN_COMP_CXXVER 0
 #endif
 
 // The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features
@@ -748,8 +780,8 @@
 
 // Does the compiler support std::hash?
 #ifndef EIGEN_HAS_STD_HASH
-// The std::hash struct is defined in C++11 but is not labelled as a __device__
-// function and is not constexpr, so cannot be used on device.
+// The std::hash struct is not labelled as a __device__ function and is not
+// constexpr, so cannot be used on device.
 #if !defined(EIGEN_GPU_COMPILE_PHASE)
 #define EIGEN_HAS_STD_HASH 1
 #else
@@ -774,6 +806,15 @@
 // NOTE: Intel C++ Compiler Classic (icc) Version 19.0 and later supports dynamic allocation
 //       for over-aligned data, but not in a manner that is compatible with Eigen.
 //       See https://gitlab.com/libeigen/eigen/-/issues/2575
+// Does the compiler support C++17 if constexpr?
+#ifndef EIGEN_HAS_CXX17_IFCONSTEXPR
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17 &&                                                            \
+    ((EIGEN_COMP_MSVC >= 1911) || (EIGEN_GNUC_STRICT_AT_LEAST(7, 0, 0)) || (EIGEN_CLANG_STRICT_AT_LEAST(3, 9, 0)) || \
+     (EIGEN_COMP_CLANGAPPLE && EIGEN_COMP_CLANGAPPLE >= 10000000))
+#define EIGEN_HAS_CXX17_IFCONSTEXPR 1
+#endif
+#endif
+
 #ifndef EIGEN_HAS_CXX17_OVERALIGN
 #if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17 &&                                                            \
     ((EIGEN_COMP_MSVC >= 1912) || (EIGEN_GNUC_STRICT_AT_LEAST(7, 0, 0)) || (EIGEN_CLANG_STRICT_AT_LEAST(5, 0, 0)) || \
@@ -786,7 +827,7 @@
 #endif
 
 #if defined(EIGEN_CUDACC)
-// While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+// Enable device-side constexpr when the toolchain supports relaxed constexpr rules.
 #if defined(__NVCC__)
 // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
 #ifdef __CUDACC_RELAXED_CONSTEXPR__
@@ -814,6 +855,18 @@
 #endif
 #endif
 
+// Does the compiler support Clang-style vector extensions (ext_vector_type and __builtin_vectorelements)?
+#if EIGEN_HAS_ATTRIBUTE(ext_vector_type) && EIGEN_HAS_BUILTIN(__builtin_vectorelements)
+#define EIGEN_ARCH_VECTOR_EXTENSIONS 1
+#else
+#define EIGEN_ARCH_VECTOR_EXTENSIONS 0
+#endif
+
+// Multidimensional subscript operator feature test
+#if defined(__cpp_multidimensional_subscript) && __cpp_multidimensional_subscript >= 202110L
+#define EIGEN_MULTIDIMENSIONAL_SUBSCRIPT
+#endif
+
 //------------------------------------------------------------------------------------------
 // Preprocessor programming helpers
 //------------------------------------------------------------------------------------------
@@ -855,6 +908,17 @@
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
 #endif
 
+// EIGEN_LAMBDA_ALWAYS_INLINE forces inlining of lambda functions.
+// On GCC/Clang, __attribute__((always_inline)) works on lambdas.
+// On MSVC, [[msvc::forceinline]] cannot be applied to generic lambdas
+// (those with auto parameters), so we leave it empty and rely on the
+// optimizer to inline small lambda bodies at /O2.
+#if EIGEN_COMP_GNUC && !defined(SYCL_DEVICE_ONLY)
+#define EIGEN_LAMBDA_ALWAYS_INLINE __attribute__((always_inline))
+#else
+#define EIGEN_LAMBDA_ALWAYS_INLINE
+#endif
+
 #if EIGEN_COMP_GNUC
 #define EIGEN_DONT_INLINE __attribute__((noinline))
 #elif EIGEN_COMP_MSVC
@@ -944,6 +1008,22 @@
 #define EIGEN_DEPRECATED
 #endif
 
+// EIGEN_DEPRECATED_WITH_REASON(message): marks a declaration as deprecated and
+// attaches `message` to the compiler diagnostic. It expands to nothing when
+// EIGEN_NO_DEPRECATED_WARNING is defined, or when neither EIGEN_COMP_GNUC nor
+// EIGEN_COMP_MSVC is set.
+#if !defined(EIGEN_NO_DEPRECATED_WARNING) && EIGEN_COMP_GNUC
+#define EIGEN_DEPRECATED_WITH_REASON(message) __attribute__((deprecated(message)))
+#elif !defined(EIGEN_NO_DEPRECATED_WARNING) && EIGEN_COMP_MSVC
+#define EIGEN_DEPRECATED_WITH_REASON(message) __declspec(deprecated(message))
+#else
+#define EIGEN_DEPRECATED_WITH_REASON(message)
+#endif
+
+// Deprecated no-op macro. Was a workaround for GCC 4.3 empty struct issues, removed in Eigen 5.0.
+// Defined here for backward compatibility with downstream code that still references it.
+#define EIGEN_EMPTY_STRUCT_CTOR(X)
+
 #if EIGEN_COMP_GNUC
 #define EIGEN_UNUSED __attribute__((unused))
 #else
@@ -970,13 +1050,13 @@
 namespace Eigen {
 namespace internal {
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(const T&) {}
+EIGEN_DEVICE_FUNC constexpr void ignore_unused_variable(const T&) {}
 }  // namespace internal
 }  // namespace Eigen
-#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
+#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var)
 
 #if !defined(EIGEN_ASM_COMMENT)
-#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
+#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_RISCV)
 #define EIGEN_ASM_COMMENT(X) __asm__("#" X)
 #else
 #define EIGEN_ASM_COMMENT(X)
@@ -993,8 +1073,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 #endif
 
 #if !defined(EIGEN_OPTIMIZATION_BARRIER)
-#if EIGEN_COMP_GNUC
-   // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
+// Implement the barrier on GNUC compilers or clang-cl.
+#if EIGEN_COMP_GNUC || (defined(__clang__) && defined(_MSC_VER))
+// According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
 //   X: Any operand whatsoever.
 //   r: A register operand is allowed provided that it is in a general
 //      register.
@@ -1027,44 +1108,44 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
 // you will need to apply to the underlying POD type.
 #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
-   // This seems to be broken on clang. Packet4f is loaded into a single
+// This seems to be broken on clang. Packet4f is loaded into a single
 //   register rather than a vector, zeroing out some entries. Integer
 //   types also generate a compile error.
 #if EIGEN_OS_MAC
-   // General, Altivec for Apple (VSX were added in ISA v2.06):
+// General, Altivec for Apple (VSX were added in ISA v2.06):
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v"(X));
 #else
-   // General, Altivec, VSX otherwise:
+// General, Altivec, VSX otherwise:
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v,wa"(X));
 #endif
 #elif EIGEN_ARCH_ARM_OR_ARM64
 #ifdef __ARM_FP
-   // General, VFP or NEON.
+// General, VFP or NEON.
 // Clang doesn't like "r",
 //    error: non-trivial scalar-to-vector conversion, possible invalid
 //           constraint for vector typ
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,w"(X));
 #else
-   // Arm without VFP or NEON.
+// Arm without VFP or NEON.
 // "w" constraint will not compile.
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g"(X));
 #endif
 #elif EIGEN_ARCH_i386_OR_x86_64
-   // General, SSE.
+// General, SSE.
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,x"(X));
 #else
-   // Not implemented for other architectures.
+// Not implemented for other architectures.
 #define EIGEN_OPTIMIZATION_BARRIER(X)
 #endif
 #else
-   // Not implemented for other compilers.
+// Not implemented for other compilers.
 #define EIGEN_OPTIMIZATION_BARRIER(X)
 #endif
 #endif
 
 #if EIGEN_COMP_MSVC
 // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
-// This workaround is ugly, but it does the job.
+// This workaround suppresses MSVC C4127 warnings for compile-time conditionals.
 #define EIGEN_CONST_CONDITIONAL(cond) (void)0, cond
 #else
 #define EIGEN_CONST_CONDITIONAL(cond) cond
@@ -1131,7 +1212,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 /** \internal
  * \brief Macro to manually inherit assignment operators.
  * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is
- * defined. With C++11 or later this also default-implements the copy-constructor
+ * defined. This also default-implements the copy-constructor.
  */
 #define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
   EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)  \
@@ -1141,8 +1222,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
  * \brief Macro to manually define default constructors and destructors.
  * This is necessary when the copy constructor is re-defined.
  * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
- *
- * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision
  */
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
   EIGEN_DEVICE_FUNC Derived() = default;                        \
@@ -1201,7 +1280,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 
 #define EIGEN_MAKE_CWISE_BINARY_OP(METHOD, OPNAME)                                                                \
   template <typename OtherDerived>                                                                                \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(                                     \
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(                           \
       Derived, OtherDerived, OPNAME)(METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const { \
     return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, OPNAME)(derived(), other.derived());             \
   }
@@ -1222,7 +1301,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD, OPNAME)                                                       \
   template <typename T>                                                                                              \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(                                \
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(                      \
       Derived,                                                                                                       \
       typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(          \
           OPNAME, Scalar, T)>::type,                                                                                 \
@@ -1236,7 +1315,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD, OPNAME)                                                        \
   template <typename T>                                                                                              \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(                         \
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE friend const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(               \
       typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(          \
           OPNAME, T, Scalar)>::type,                                                                                 \
       Derived, OPNAME)(METHOD)(const T& scalar, const StorageBaseType& matrix) {                                     \
@@ -1281,10 +1360,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 namespace Eigen {
 namespace internal {
 
-EIGEN_DEVICE_FUNC inline bool all() { return true; }
+EIGEN_DEVICE_FUNC constexpr bool all() { return true; }
 
 template <typename T, typename... Ts>
-EIGEN_DEVICE_FUNC bool all(T t, Ts... ts) {
+EIGEN_DEVICE_FUNC constexpr bool all(T t, Ts... ts) {
   return t && all(ts...);
 }
 
diff --git a/Eigen/src/Core/util/MaxSizeVector.h b/Eigen/src/Core/util/MaxSizeVector.h
index db5bb8950ed..3f13b2d468f 100644
--- a/Eigen/src/Core/util/MaxSizeVector.h
+++ b/Eigen/src/Core/util/MaxSizeVector.h
@@ -34,104 +34,103 @@ class MaxSizeVector {
  public:
   // Construct a new MaxSizeVector, reserve n elements.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit MaxSizeVector(size_t n)
-      : reserve_(n), size_(0), data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {}
+      : m_reserve(n), m_size(0), m_data(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {}
 
   // Construct a new MaxSizeVector, reserve and resize to n.
   // Copy the init value to all elements.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxSizeVector(size_t n, const T& init)
-      : reserve_(n), size_(n), data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+      : m_reserve(n), m_size(n), m_data(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
     size_t i = 0;
     EIGEN_TRY {
-      for (; i < size_; ++i) {
-        new (&data_[i]) T(init);
+      for (; i < m_size; ++i) {
+        new (&m_data[i]) T(init);
       }
     }
     EIGEN_CATCH(...) {
       // Construction failed, destruct in reverse order:
       for (; (i + 1) > 0; --i) {
-        data_[i - 1].~T();
+        m_data[i - 1].~T();
       }
-      internal::handmade_aligned_free(data_);
+      internal::handmade_aligned_free(m_data);
       EIGEN_THROW;
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~MaxSizeVector() {
-    for (size_t i = size_; i > 0; --i) {
-      data_[i - 1].~T();
+    for (size_t i = m_size; i > 0; --i) {
+      m_data[i - 1].~T();
     }
-    internal::handmade_aligned_free(data_);
+    internal::handmade_aligned_free(m_data);
   }
 
   void resize(size_t n) {
-    eigen_assert(n <= reserve_);
-    for (; size_ < n; ++size_) {
-      new (&data_[size_]) T;
+    eigen_assert(n <= m_reserve);
+    for (; m_size < n; ++m_size) {
+      new (&m_data[m_size]) T;
     }
-    for (; size_ > n; --size_) {
-      data_[size_ - 1].~T();
+    for (; m_size > n; --m_size) {
+      m_data[m_size - 1].~T();
     }
-    eigen_assert(size_ == n);
+    eigen_assert(m_size == n);
   }
 
   // Append new elements (up to reserved size).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void push_back(const T& t) {
-    eigen_assert(size_ < reserve_);
-    new (&data_[size_++]) T(t);
+    eigen_assert(m_size < m_reserve);
+    new (&m_data[m_size++]) T(t);
   }
 
-  // For C++03 compatibility this only takes one argument
   template <class X>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void emplace_back(const X& x) {
-    eigen_assert(size_ < reserve_);
-    new (&data_[size_++]) T(x);
+    eigen_assert(m_size < m_reserve);
+    new (&m_data[m_size++]) T(x);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[](size_t i) const {
-    eigen_assert(i < size_);
-    return data_[i];
+    eigen_assert(i < m_size);
+    return m_data[i];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[](size_t i) {
-    eigen_assert(i < size_);
-    return data_[i];
+    eigen_assert(i < m_size);
+    return m_data[i];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& back() {
-    eigen_assert(size_ > 0);
-    return data_[size_ - 1];
+    eigen_assert(m_size > 0);
+    return m_data[m_size - 1];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& back() const {
-    eigen_assert(size_ > 0);
-    return data_[size_ - 1];
+    eigen_assert(m_size > 0);
+    return m_data[m_size - 1];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pop_back() {
-    eigen_assert(size_ > 0);
-    data_[--size_].~T();
+    eigen_assert(m_size > 0);
+    m_data[--m_size].~T();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t size() const { return size_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t size() const { return m_size; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool empty() const { return size_ == 0; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool empty() const { return m_size == 0; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return data_; }
+  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return data_; }
+  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* begin() { return data_; }
+  EIGEN_DEVICE_FUNC constexpr T* begin() { return m_data; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* end() { return data_ + size_; }
+  EIGEN_DEVICE_FUNC constexpr T* end() { return m_data + m_size; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* begin() const { return data_; }
+  EIGEN_DEVICE_FUNC constexpr const T* begin() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* end() const { return data_ + size_; }
+  EIGEN_DEVICE_FUNC constexpr const T* end() const { return m_data + m_size; }
 
  private:
-  size_t reserve_;
-  size_t size_;
-  T* data_;
+  size_t m_reserve;
+  size_t m_size;
+  T* m_data;
 };
 
 }  // namespace Eigen
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 44056b33401..c52137d5fae 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -91,6 +91,9 @@ namespace internal {
 EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {
   eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
 }
+EIGEN_DEVICE_FUNC inline void check_that_free_is_allowed() {
+  eigen_assert(false && "heap deallocation is forbidden (EIGEN_NO_MALLOC is defined)");
+}
 #elif defined EIGEN_RUNTIME_NO_MALLOC
 EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) {
   EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
@@ -101,10 +104,22 @@ EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_imp
 EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
 EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {
   eigen_assert(is_malloc_allowed() &&
-               "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
+               "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and set_is_malloc_allowed is false)");
+}
+EIGEN_DEVICE_FUNC inline bool is_free_allowed_impl(bool update, bool new_value = false) {
+  EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
+  if (update == 1) value = new_value;
+  return value;
+}
+EIGEN_DEVICE_FUNC inline bool is_free_allowed() { return is_free_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_free_allowed(bool new_value) { return is_free_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_free_is_allowed() {
+  eigen_assert(is_free_allowed() &&
+               "heap deallocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and set_is_free_allowed is false)");
 }
 #else
 EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {}
+EIGEN_DEVICE_FUNC inline void check_that_free_is_allowed() {}
 #endif
 
 EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc() {
@@ -161,7 +176,7 @@ EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void* ptr) {
     std::size_t offset = static_cast<std::size_t>(*(static_cast<uint8_t*>(ptr) - 1)) + 1;
     void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset);
 
-    check_that_malloc_is_allowed();
+    check_that_free_is_allowed();
     EIGEN_USING_STD(free)
     free(original);
   }
@@ -227,7 +242,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void* ptr) {
 #if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED
 
   if (ptr != nullptr) {
-    check_that_malloc_is_allowed();
+    check_that_free_is_allowed();
     EIGEN_USING_STD(free)
     free(ptr);
   }
@@ -252,7 +267,7 @@ EIGEN_DEVICE_FUNC inline void* aligned_realloc(void* ptr, std::size_t new_size,
 
   void* result;
 #if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED
-  EIGEN_UNUSED_VARIABLE(old_size)
+  EIGEN_UNUSED_VARIABLE(old_size);
 
   check_that_malloc_is_allowed();
   EIGEN_USING_STD(realloc)
@@ -299,7 +314,7 @@ EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void* ptr) {
 template <>
 EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void* ptr) {
   if (ptr != nullptr) {
-    check_that_malloc_is_allowed();
+    check_that_free_is_allowed();
     EIGEN_USING_STD(free)
     free(ptr);
   }
@@ -405,7 +420,7 @@ template <typename T>
 EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) {
   check_size_for_overflow<T>(size);
   T* result = static_cast<T*>(aligned_malloc(sizeof(T) * size));
-  EIGEN_TRY { return default_construct_elements_of_array(result, size); }
+  EIGEN_TRY { default_construct_elements_of_array(result, size); }
   EIGEN_CATCH(...) {
     aligned_free(result);
     EIGEN_THROW;
@@ -417,7 +432,7 @@ template <typename T, bool Align>
 EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) {
   check_size_for_overflow<T>(size);
   T* result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T) * size));
-  EIGEN_TRY { return default_construct_elements_of_array(result, size); }
+  EIGEN_TRY { default_construct_elements_of_array(result, size); }
   EIGEN_CATCH(...) {
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
@@ -619,11 +634,6 @@ struct smart_memmove_helper<T, false> {
   }
 };
 
-template <typename T>
-EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) {
-  return std::move(start, end, target);
-}
-
 /*****************************************************************************
 *** Implementation of runtime stack allocation (falling back to malloc)    ***
 *****************************************************************************/
@@ -650,8 +660,11 @@ EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) {
 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
 template <typename T>
-class aligned_stack_memory_handler : noncopyable {
+class aligned_stack_memory_handler {
  public:
+  aligned_stack_memory_handler(const aligned_stack_memory_handler&) = delete;
+  aligned_stack_memory_handler& operator=(const aligned_stack_memory_handler&) = delete;
+
   /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
    * Note that \a ptr can be 0 regardless of the other parameters.
    * This constructor takes care of constructing/initializing the elements of the buffer if required by the scalar type
@@ -718,25 +731,6 @@ struct local_nested_eval_wrapper<Xpr, NbEvaluations, true> {
 
 #endif  // EIGEN_ALLOCA
 
-template <typename T>
-class scoped_array : noncopyable {
-  T* m_ptr;
-
- public:
-  explicit scoped_array(std::ptrdiff_t size) { m_ptr = new T[size]; }
-  ~scoped_array() { delete[] m_ptr; }
-  T& operator[](std::ptrdiff_t i) { return m_ptr[i]; }
-  const T& operator[](std::ptrdiff_t i) const { return m_ptr[i]; }
-  T*& ptr() { return m_ptr; }
-  const T* ptr() const { return m_ptr; }
-  operator const T*() const { return m_ptr; }
-};
-
-template <typename T>
-void swap(scoped_array<T>& a, scoped_array<T>& b) {
-  std::swap(a.ptr(), b.ptr());
-}
-
 }  // end namespace internal
 
 /** \internal
@@ -878,12 +872,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* eigen_aligned_alloca_helper(void* pt
 #endif
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Size)                                 \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(                                                                            \
-      bool(((Size) != Eigen::Dynamic) &&                                                                         \
-           (((EIGEN_MAX_ALIGN_BYTES >= 16) && ((sizeof(Scalar) * (Size)) % (EIGEN_MAX_ALIGN_BYTES) == 0)) ||     \
-            ((EIGEN_MAX_ALIGN_BYTES >= 32) && ((sizeof(Scalar) * (Size)) % (EIGEN_MAX_ALIGN_BYTES / 2) == 0)) || \
-            ((EIGEN_MAX_ALIGN_BYTES >= 64) && ((sizeof(Scalar) * (Size)) % (EIGEN_MAX_ALIGN_BYTES / 4) == 0)))))
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Size)                                       \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(                                                                                  \
+      bool(((Size) != Eigen::Dynamic) &&                                                                               \
+           (((EIGEN_MAX_ALIGN_BYTES >= 16) && ((sizeof(Scalar) * size_t(Size)) % (EIGEN_MAX_ALIGN_BYTES) == 0)) ||     \
+            ((EIGEN_MAX_ALIGN_BYTES >= 32) && ((sizeof(Scalar) * size_t(Size)) % (EIGEN_MAX_ALIGN_BYTES / 2) == 0)) || \
+            ((EIGEN_MAX_ALIGN_BYTES >= 64) && ((sizeof(Scalar) * size_t(Size)) % (EIGEN_MAX_ALIGN_BYTES / 4) == 0)))))
 
 #endif
 
@@ -998,7 +992,7 @@ inline bool cpuid_is_vendor(int abcd[4], const int vendor[3]) {
   return abcd[1] == vendor[0] && abcd[3] == vendor[1] && abcd[2] == vendor[2];
 }
 
-inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) {
+inline void queryCacheSizes_intel_direct(std::ptrdiff_t& l1, std::ptrdiff_t& l2, std::ptrdiff_t& l3) {
   int abcd[4];
   l1 = l2 = l3 = 0;
   int cache_id = 0;
@@ -1015,7 +1009,8 @@ inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) {
       int line_size = (abcd[1] & 0x00000FFF) >> 0;    // B[11:0]
       int sets = (abcd[2]);                           // C[31:0]
 
-      int cache_size = (ways + 1) * (partitions + 1) * (line_size + 1) * (sets + 1);
+      std::ptrdiff_t cache_size =
+          static_cast<std::ptrdiff_t>(ways + 1) * (partitions + 1) * (line_size + 1) * (sets + 1);
 
       switch (cache_level) {
         case 1:
@@ -1035,7 +1030,7 @@ inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) {
   } while (cache_type > 0 && cache_id < 16);
 }
 
-inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3) {
+inline void queryCacheSizes_intel_codes(std::ptrdiff_t& l1, std::ptrdiff_t& l2, std::ptrdiff_t& l3) {
   int abcd[4];
   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
   l1 = l2 = l3 = 0;
@@ -1231,7 +1226,7 @@ inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3) {
   l3 *= 1024;
 }
 
-inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) {
+inline void queryCacheSizes_intel(std::ptrdiff_t& l1, std::ptrdiff_t& l2, std::ptrdiff_t& l3, int max_std_funcs) {
   if (max_std_funcs >= 4)
     queryCacheSizes_intel_direct(l1, l2, l3);
   else if (max_std_funcs >= 2)
@@ -1240,7 +1235,7 @@ inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs)
     l1 = l2 = l3 = 0;
 }
 
-inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) {
+inline void queryCacheSizes_amd(std::ptrdiff_t& l1, std::ptrdiff_t& l2, std::ptrdiff_t& l3) {
   int abcd[4];
   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
 
@@ -1251,8 +1246,8 @@ inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) {
     l1 = (abcd[2] >> 24) * 1024;  // C[31:24] = L1 size in KB
     abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
     EIGEN_CPUID(abcd, 0x80000006, 0);
-    l2 = (abcd[2] >> 16) * 1024;                      // C[31;16] = l2 cache size in KB
-    l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024;  // D[31;18] = l3 cache size in 512KB
+    l2 = (abcd[2] >> 16) * 1024;                                                  // C[31:16] = L2 cache size in KB
+    l3 = static_cast<std::ptrdiff_t>((abcd[3] & 0xFFFC0000) >> 18) * 512 * 1024;  // D[31:18] = L3 cache size in 512KB
   } else {
     l1 = l2 = l3 = 0;
   }
@@ -1261,7 +1256,7 @@ inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) {
 
 /** \internal
  * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively */
-inline void queryCacheSizes(int& l1, int& l2, int& l3) {
+inline void queryCacheSizes(std::ptrdiff_t& l1, std::ptrdiff_t& l2, std::ptrdiff_t& l3) {
 #ifdef EIGEN_CPUID
   int abcd[4];
   const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
@@ -1290,6 +1285,37 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) {
     //   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
     //   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
     //   ||cpuid_is_vendor(abcd,"NexGenDriven")
+#elif EIGEN_OS_MAC
+  // On macOS (including Apple Silicon), use sysctlbyname to query cache sizes.
+  // The sysctl values are 64-bit, so read into int64_t and convert.
+  // For L1, prefer P-core (perflevel0) size since compute-heavy work like GEMM
+  // is typically scheduled on performance cores. L1 is per-core so always safe.
+  // For L2, use the generic hw.l2cachesize which is more conservative (reports
+  // the smaller E-core cluster L2 on heterogeneous chips). The P-core L2 is
+  // shared among all P-cores and would overestimate per-core capacity.
+  // Every size is reported as -1 (unknown) when its query fails or yields a
+  // non-positive value, the same sentinel the generic fallback branch uses.
+  {
+    // Reads a single cache-size sysctl. The buffer is zeroed for every query so
+    // a short write can never expose stale bytes left by a previous query.
+    const auto query_cache_size = [](const char* sysctl_name) -> std::ptrdiff_t {
+      int64_t val = 0;
+      std::size_t val_size = sizeof(val);
+      if (sysctlbyname(sysctl_name, &val, &val_size, nullptr, 0) != 0) return -1;
+      if (val <= 0) return -1;
+      return static_cast<std::ptrdiff_t>(val);
+    };
+    l1 = query_cache_size("hw.perflevel0.l1dcachesize");
+    // Fall back to the generic key when the per-perflevel key is unavailable.
+    if (l1 < 0) l1 = query_cache_size("hw.l1dcachesize");
+    l2 = query_cache_size("hw.l2cachesize");
+    l3 = query_cache_size("hw.l3cachesize");
+  }
+#elif EIGEN_OS_UNIX && defined(_SC_LEVEL1_DCACHE_SIZE)
+  // On Linux and other POSIX systems, use sysconf to query cache sizes.
+  l1 = sysconf(_SC_LEVEL1_DCACHE_SIZE);
+  l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
+  l3 = sysconf(_SC_LEVEL3_CACHE_SIZE);
 #else
   l1 = l2 = l3 = -1;
 #endif
@@ -1297,16 +1323,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) {
 
 /** \internal
  * \returns the size in Bytes of the L1 data cache */
-inline int queryL1CacheSize() {
-  int l1(-1), l2, l3;
+inline std::ptrdiff_t queryL1CacheSize() {
+  std::ptrdiff_t l1(-1), l2, l3;
   queryCacheSizes(l1, l2, l3);
   return l1;
 }
 
 /** \internal
  * \returns the size in Bytes of the L2 or L3 cache if this later is present */
-inline int queryTopLevelCacheSize() {
-  int l1, l2(-1), l3(-1);
+inline std::ptrdiff_t queryTopLevelCacheSize() {
+  std::ptrdiff_t l1, l2(-1), l3(-1);
   queryCacheSizes(l1, l2, l3);
   return (std::max)(l2, l3);
 }
@@ -1339,6 +1365,28 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) {
 }
 #endif
 
+// FIXME(rmlarsen): Work around missing linker symbol with msan on ARM.
+#if !defined(EIGEN_DONT_ASSUME_ALIGNED) && __has_feature(memory_sanitizer) && (EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64)
+#define EIGEN_DONT_ASSUME_ALIGNED
+#endif
+
+#if !defined(EIGEN_DONT_ASSUME_ALIGNED) && defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L)
+template <std::size_t N, typename T>
+EIGEN_DEVICE_FUNC constexpr T* assume_aligned(T* ptr) {
+  return std::assume_aligned<N, T>(ptr);
+}
+#elif !defined(EIGEN_DONT_ASSUME_ALIGNED) && EIGEN_HAS_BUILTIN(__builtin_assume_aligned)
+template <std::size_t N, typename T>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC T* assume_aligned(T* ptr) {
+  return static_cast<T*>(__builtin_assume_aligned(ptr, N));
+}
+#else
+template <std::size_t N, typename T>
+EIGEN_DEVICE_FUNC constexpr T* assume_aligned(T* ptr) {
+  return ptr;
+}
+#endif
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index ddbc898e1b2..d78df80c105 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -94,13 +94,7 @@ using std::false_type;
 using std::true_type;
 
 template <bool Condition>
-struct bool_constant;
-
-template <>
-struct bool_constant<true> : true_type {};
-
-template <>
-struct bool_constant<false> : false_type {};
+using bool_constant = std::integral_constant<bool, Condition>;
 
 // Third-party libraries rely on these.
 using std::conditional;
@@ -136,77 +130,24 @@ struct remove_all<T*> {
 template <typename T>
 using remove_all_t = typename remove_all<T>::type;
 
+// Eigen's is_arithmetic is similar to std::is_arithmetic but can be specialized
+// for SIMD packet types and other Eigen-specific types. The primary template
+// delegates to std::is_arithmetic for fundamental types.
 template <typename T>
 struct is_arithmetic {
-  enum { value = false };
-};
-template <>
-struct is_arithmetic<float> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<double> {
-  enum { value = true };
+  enum { value = std::is_arithmetic<T>::value };
 };
 // GPU devices treat `long double` as `double`.
-#ifndef EIGEN_GPU_COMPILE_PHASE
+#ifdef EIGEN_GPU_COMPILE_PHASE
 template <>
 struct is_arithmetic<long double> {
-  enum { value = true };
+  enum { value = false };
 };
 #endif
-template <>
-struct is_arithmetic<bool> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<char> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<signed char> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<unsigned char> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<signed short> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<unsigned short> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<signed int> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<unsigned int> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<signed long> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<unsigned long> {
-  enum { value = true };
-};
 
-template <typename T, typename U>
-struct is_same {
-  enum { value = 0 };
-};
-template <typename T>
-struct is_same<T, T> {
-  enum { value = 1 };
-};
+using std::is_same;
 
-template <class T>
-struct is_void : is_same<void, std::remove_const_t<T>> {};
+using std::is_void;
 
 /** \internal
  * Implementation of std::void_t for SFINAE.
@@ -223,26 +164,11 @@ template <typename...>
 using void_t = void;
 #endif
 
-template <>
-struct is_arithmetic<signed long long> {
-  enum { value = true };
-};
-template <>
-struct is_arithmetic<unsigned long long> {
-  enum { value = true };
-};
 using std::is_integral;
 
 using std::make_unsigned;
 
-template <typename T>
-struct is_const {
-  enum { value = 0 };
-};
-template <typename T>
-struct is_const<T const> {
-  enum { value = 1 };
-};
+using std::is_const;
 
 template <typename T>
 struct add_const_on_value_type {
@@ -270,18 +196,6 @@ using add_const_on_value_type_t = typename add_const_on_value_type<T>::type;
 
 using std::is_convertible;
 
-/** \internal
- * A base class do disable default copy ctor and copy assignment operator.
- */
-class noncopyable {
-  EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
-  EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
-
- protected:
-  EIGEN_DEVICE_FUNC noncopyable() {}
-  EIGEN_DEVICE_FUNC ~noncopyable() {}
-};
-
 /** \internal
  * Provides access to the number of elements in the object of as a compile-time constant expression.
  * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
@@ -291,7 +205,7 @@ class noncopyable {
  * It currently supports:
  *  - any types T defining T::SizeAtCompileTime
  *  - plain C arrays as T[N]
- *  - std::array (c++11)
+ *  - std::array
  *  - some internal types such as SingleRange and AllRange
  *
  * The second template parameter eases SFINAE-based specializations.
@@ -375,24 +289,12 @@ struct result_of<F(ArgTypes...)> {
   typedef typename std::invoke_result<F, ArgTypes...>::type type1;
   typedef remove_all_t<type1> type;
 };
-
-template <typename F, typename... ArgTypes>
-struct invoke_result {
-  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
-  typedef remove_all_t<type1> type;
-};
 #else
 template <typename T>
 struct result_of {
   typedef typename std::result_of<T>::type type1;
   typedef remove_all_t<type1> type;
 };
-
-template <typename F, typename... ArgTypes>
-struct invoke_result {
-  typedef typename result_of<F(ArgTypes...)>::type type1;
-  typedef remove_all_t<type1> type;
-};
 #endif
 
 // Reduces a sequence of bools to true if all are true, false otherwise.
@@ -400,82 +302,35 @@ template <bool... values>
 using reduce_all =
     std::is_same<std::integer_sequence<bool, values..., true>, std::integer_sequence<bool, true, values...>>;
 
-// Reduces a sequence of bools to true if any are true, false if all false.
-template <bool... values>
-using reduce_any = std::integral_constant<bool, !std::is_same<std::integer_sequence<bool, values..., false>,
-                                                              std::integer_sequence<bool, false, values...>>::value>;
-
-struct meta_yes {
-  char a[1];
-};
-struct meta_no {
-  char a[2];
-};
-
 // Check whether T::ReturnType does exist
-template <typename T>
-struct has_ReturnType {
-  template <typename C>
-  static meta_yes testFunctor(C const*, typename C::ReturnType const* = 0);
-  template <typename C>
-  static meta_no testFunctor(...);
-
-  enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
-};
+template <typename T, typename EnableIf = void>
+struct has_ReturnType : false_type {};
 
 template <typename T>
-const T* return_ptr();
+struct has_ReturnType<T, void_t<typename T::ReturnType>> : true_type {};
 
-template <typename T, typename IndexType = Index>
-struct has_nullary_operator {
-  template <typename C>
-  static meta_yes testFunctor(C const*, std::enable_if_t<(sizeof(return_ptr<C>()->operator()()) > 0)>* = 0);
-  static meta_no testFunctor(...);
+template <typename T, typename IndexType = Index, typename EnableIf = void>
+struct has_nullary_operator : false_type {};
 
-  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
-};
-
-template <typename T, typename IndexType = Index>
-struct has_unary_operator {
-  template <typename C>
-  static meta_yes testFunctor(C const*, std::enable_if_t<(sizeof(return_ptr<C>()->operator()(IndexType(0))) > 0)>* = 0);
-  static meta_no testFunctor(...);
+template <typename T, typename IndexType>
+struct has_nullary_operator<T, IndexType, std::enable_if_t<(sizeof(decltype(std::declval<const T&>()())) > 0)>>
+    : true_type {};
 
-  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
-};
+template <typename T, typename IndexType = Index, typename EnableIf = void>
+struct has_unary_operator : false_type {};
 
-template <typename T, typename IndexType = Index>
-struct has_binary_operator {
-  template <typename C>
-  static meta_yes testFunctor(
-      C const*, std::enable_if_t<(sizeof(return_ptr<C>()->operator()(IndexType(0), IndexType(0))) > 0)>* = 0);
-  static meta_no testFunctor(...);
+template <typename T, typename IndexType>
+struct has_unary_operator<T, IndexType,
+                          std::enable_if_t<(sizeof(decltype(std::declval<const T&>()(IndexType(0)))) > 0)>>
+    : true_type {};
 
-  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
-};
+template <typename T, typename IndexType = Index, typename EnableIf = void>
+struct has_binary_operator : false_type {};
 
-/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
- * Usage example: \code meta_sqrt<1023>::ret \endcode
- */
-template <int Y, int InfX = 0, int SupX = ((Y == 1) ? 1 : Y / 2),
-          bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))>
-class meta_sqrt {
-  enum {
-    MidX = (InfX + SupX) / 2,
-    TakeInf = MidX * MidX > Y ? 1 : 0,
-    NewInf = int(TakeInf) ? InfX : int(MidX),
-    NewSup = int(TakeInf) ? int(MidX) : SupX
-  };
-
- public:
-  enum { ret = meta_sqrt<Y, NewInf, NewSup>::ret };
-};
-
-template <int Y, int InfX, int SupX>
-class meta_sqrt<Y, InfX, SupX, true> {
- public:
-  enum { ret = (SupX * SupX <= Y) ? SupX : InfX };
-};
+template <typename T, typename IndexType>
+struct has_binary_operator<
+    T, IndexType, std::enable_if_t<(sizeof(decltype(std::declval<const T&>()(IndexType(0), IndexType(0)))) > 0)>>
+    : true_type {};
 
 /** \internal Computes the least common multiple of two positive integer A and B
  * at compile-time.
@@ -499,12 +354,6 @@ struct scalar_product_traits {
   enum { Defined = 0 };
 };
 
-// FIXME quick workaround around current limitation of result_of
-// template<typename Scalar, typename ArgType0, typename ArgType1>
-// struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
-// typedef typename scalar_product_traits<remove_all_t<ArgType0>, remove_all_t<ArgType1>>::ReturnType type;
-// };
-
 /** \internal Obtains a POD type suitable to use as storage for an object of a size
  * of at most Len bytes, aligned as specified by \c Align.
  */
@@ -524,14 +373,14 @@ namespace numext {
 
 #if defined(EIGEN_GPU_COMPILE_PHASE)
 template <typename T>
-EIGEN_DEVICE_FUNC void swap(T& a, T& b) {
+EIGEN_DEVICE_FUNC constexpr void swap(T& a, T& b) {
   T tmp = b;
   b = a;
   a = tmp;
 }
 #else
 template <typename T>
-EIGEN_STRONG_INLINE void swap(T& a, T& b) {
+constexpr EIGEN_STRONG_INLINE void swap(T& a, T& b) {
   std::swap(a, b);
 }
 #endif
@@ -542,15 +391,15 @@ using std::numeric_limits;
 template <typename X, typename Y, bool XIsInteger = NumTraits<X>::IsInteger, bool XIsSigned = NumTraits<X>::IsSigned,
           bool YIsInteger = NumTraits<Y>::IsInteger, bool YIsSigned = NumTraits<Y>::IsSigned>
 struct equal_strict_impl {
-  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) { return x == y; }
+  static constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) { return x == y; }
 };
 template <typename X, typename Y>
 struct equal_strict_impl<X, Y, true, false, true, true> {
   // X is an unsigned integer
   // Y is a signed integer
   // if Y is non-negative, it may be represented exactly as its unsigned counterpart.
-  using UnsignedY = typename internal::make_unsigned<Y>::type;
-  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) {
+  using UnsignedY = std::make_unsigned_t<Y>;
+  static constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) {
     return y < Y(0) ? false : (x == static_cast<UnsignedY>(y));
   }
 };
@@ -558,7 +407,7 @@ template <typename X, typename Y>
 struct equal_strict_impl<X, Y, true, true, true, false> {
   // X is a signed integer
   // Y is an unsigned integer
-  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) {
+  static constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) {
     return equal_strict_impl<Y, X>::run(y, x);
   }
 };
@@ -566,18 +415,18 @@ struct equal_strict_impl<X, Y, true, true, true, false> {
 // The aim of the following functions is to bypass -Wfloat-equal warnings
 // when we really want a strict equality comparison on floating points.
 template <typename X, typename Y>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x, const Y& y) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x, const Y& y) {
   return equal_strict_impl<X, Y>::run(x, y);
 }
 
 #if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x, const float& y) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x, const float& y) {
   return std::equal_to<float>()(x, y);
 }
 
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const double& x, const double& y) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const double& x, const double& y) {
   return std::equal_to<double>()(x, y);
 }
 #endif
@@ -587,7 +436,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const double& x, const d
  * Use this to to bypass -Wfloat-equal warnings when exact zero is what needs to be tested.
  */
 template <typename X>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool is_exactly_zero(const X& x) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool is_exactly_zero(const X& x) {
   return equal_strict(x, typename NumTraits<X>::Literal{0});
 }
 
@@ -596,23 +445,23 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool is_exactly_zero(const X& x) {
  * Use this to to bypass -Wfloat-equal warnings when exact one is what needs to be tested.
  */
 template <typename X>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool is_exactly_one(const X& x) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool is_exactly_one(const X& x) {
   return equal_strict(x, typename NumTraits<X>::Literal{1});
 }
 
 template <typename X, typename Y>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x, const Y& y) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x, const Y& y) {
   return !equal_strict_impl<X, Y>::run(x, y);
 }
 
 #if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x, const float& y) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x, const float& y) {
   return std::not_equal_to<float>()(x, y);
 }
 
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const double& x, const double& y) {
+constexpr EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const double& x, const double& y) {
   return std::not_equal_to<double>()(x, y);
 }
 #endif
@@ -623,11 +472,11 @@ namespace internal {
 
 template <typename Scalar>
 struct is_identically_zero_impl {
-  static inline bool run(const Scalar& s) { return numext::is_exactly_zero(s); }
+  static constexpr bool run(const Scalar& s) { return numext::is_exactly_zero(s); }
 };
 
 template <typename Scalar>
-EIGEN_STRONG_INLINE bool is_identically_zero(const Scalar& s) {
+constexpr EIGEN_STRONG_INLINE bool is_identically_zero(const Scalar& s) {
   return is_identically_zero_impl<Scalar>::run(s);
 }
 
@@ -716,20 +565,6 @@ constexpr bool enum_lt_not_dynamic(A a, B b) {
   return (int)a < (int)b;
 }
 
-template <typename A, typename B>
-constexpr bool enum_le_not_dynamic(A a, B b) {
-  plain_enum_asserts(a, b);
-  if ((int)a == Dynamic || (int)b == Dynamic) return false;
-  return (int)a <= (int)b;
-}
-
-template <typename A, typename B>
-constexpr bool enum_gt_not_dynamic(A a, B b) {
-  plain_enum_asserts(a, b);
-  if ((int)a == Dynamic || (int)b == Dynamic) return false;
-  return (int)a > (int)b;
-}
-
 template <typename A, typename B>
 constexpr bool enum_ge_not_dynamic(A a, B b) {
   plain_enum_asserts(a, b);
diff --git a/Eigen/src/Core/util/MoreMeta.h b/Eigen/src/Core/util/MoreMeta.h
index 6823bca9772..4f831670b47 100644
--- a/Eigen/src/Core/util/MoreMeta.h
+++ b/Eigen/src/Core/util/MoreMeta.h
@@ -186,12 +186,11 @@ struct h_skip_helper_type<0> {
 template <int n>
 struct h_skip {
   template <typename T, T... ii>
-  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(
-      numeric_list<T, ii...>) {
+  constexpr static typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) {
     return typename h_skip_helper_numeric<T, n, ii...>::type();
   }
   template <typename... tt>
-  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) {
+  constexpr static typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) {
     return typename h_skip_helper_type<n, tt...>::type();
   }
 };
@@ -330,17 +329,17 @@ struct reduce;
 
 template <typename Reducer>
 struct reduce<Reducer> {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return Reducer::Identity; }
 };
 
 template <typename Reducer, typename A>
 struct reduce<Reducer, A> {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
+  EIGEN_DEVICE_FUNC constexpr static A run(A a) { return a; }
 };
 
 template <typename Reducer, typename A, typename... Ts>
 struct reduce<Reducer, A, Ts...> {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts)
+  EIGEN_DEVICE_FUNC constexpr static auto run(A a, Ts... ts)
       -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
     return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
   }
@@ -350,14 +349,14 @@ struct reduce<Reducer, A, Ts...> {
 
 struct sum_op {
   template <typename A, typename B>
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) {
+  EIGEN_DEVICE_FUNC constexpr static auto run(A a, B b) -> decltype(a + b) {
     return a + b;
   }
   static constexpr int Identity = 0;
 };
 struct product_op {
   template <typename A, typename B>
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) {
+  EIGEN_DEVICE_FUNC constexpr static auto run(A a, B b) -> decltype(a * b) {
     return a * b;
   }
   static constexpr int Identity = 1;
@@ -365,100 +364,49 @@ struct product_op {
 
 struct logical_and_op {
   template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) {
+  constexpr static auto run(A a, B b) -> decltype(a && b) {
     return a && b;
   }
 };
-struct logical_or_op {
-  template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) {
-    return a || b;
-  }
-};
-
-struct equal_op {
-  template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) {
-    return a == b;
-  }
-};
-struct not_equal_op {
-  template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) {
-    return a != b;
-  }
-};
 struct lesser_op {
   template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) {
+  constexpr static auto run(A a, B b) -> decltype(a < b) {
     return a < b;
   }
 };
-struct lesser_equal_op {
-  template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) {
-    return a <= b;
-  }
-};
-struct greater_op {
-  template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) {
-    return a > b;
-  }
-};
-struct greater_equal_op {
-  template <typename A, typename B>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) {
-    return a >= b;
-  }
-};
 
 /* generic unary operations */
 
-struct not_op {
-  template <typename A>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) {
-    return !a;
-  }
-};
-struct negation_op {
-  template <typename A>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) {
-    return -a;
-  }
-};
 struct greater_equal_zero_op {
   template <typename A>
-  constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) {
+  constexpr static auto run(A a) -> decltype(a >= 0) {
     return a >= 0;
   }
 };
 
 /* reductions for lists */
 
-// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
-// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
-// does...
+// Using auto -> return value spec makes ICC 13.0 and 13.1 crash here,
+// so the return type is specified explicitly using decltype.
 template <typename... Ts>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(
-    Ts... ts) {
+EIGEN_DEVICE_FUNC constexpr decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts) {
   return reduce<product_op, Ts...>::run(ts...);
 }
 
 template <typename... Ts>
-constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts) {
+constexpr decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts) {
   return reduce<sum_op, Ts...>::run(ts...);
 }
 
 /* reverse arrays */
 
 template <typename Array, int... n>
-constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>) {
+constexpr Array h_array_reverse(Array arr, numeric_list<int, n...>) {
   return {{array_get<sizeof...(n) - n - 1>(arr)...}};
 }
 
 template <typename T, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr) {
+constexpr array<T, N> array_reverse(array<T, N> arr) {
   return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
 }
 
@@ -471,7 +419,7 @@ constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr) {
 // an infinite loop)
 template <typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
 struct h_array_reduce {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity)
+  EIGEN_DEVICE_FUNC constexpr static auto run(array<T, N> arr, T identity)
       -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr))) {
     return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
   }
@@ -479,16 +427,16 @@ struct h_array_reduce {
 
 template <typename Reducer, typename T, std::size_t N>
 struct h_array_reduce<Reducer, T, N, 0> {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T) { return array_get<0>(arr); }
+  EIGEN_DEVICE_FUNC constexpr static T run(const array<T, N>& arr, T) { return array_get<0>(arr); }
 };
 
 template <typename Reducer, typename T>
 struct h_array_reduce<Reducer, T, 0> {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity) { return identity; }
+  EIGEN_DEVICE_FUNC constexpr static T run(const array<T, 0>&, T identity) { return identity; }
 };
 
 template <typename Reducer, typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity)
+EIGEN_DEVICE_FUNC constexpr auto array_reduce(const array<T, N>& arr, T identity)
     -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity)) {
   return h_array_reduce<Reducer, T, N>::run(arr, identity);
 }
@@ -496,13 +444,13 @@ EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T,
 /* standard array reductions */
 
 template <typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr)
+EIGEN_DEVICE_FUNC constexpr auto array_sum(const array<T, N>& arr)
     -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0))) {
   return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
 }
 
 template <typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr)
+EIGEN_DEVICE_FUNC constexpr auto array_prod(const array<T, N>& arr)
     -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1))) {
   return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
 }
@@ -520,20 +468,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
 /* zip an array */
 
 template <typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())), N> h_array_zip(array<A, N> a, array<B, N> b,
-                                                                                numeric_list<int, n...>) {
+constexpr array<decltype(Op::run(A(), B())), N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>) {
   return array<decltype(Op::run(A(), B())), N>{{Op::run(array_get<n>(a), array_get<n>(b))...}};
 }
 
 template <typename Op, typename A, typename B, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())), N> array_zip(array<A, N> a, array<B, N> b) {
+constexpr array<decltype(Op::run(A(), B())), N> array_zip(array<A, N> a, array<B, N> b) {
   return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
 }
 
 /* zip an array and reduce the result */
 
 template <typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+constexpr auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
     -> decltype(reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A(), B()))>::type...>::run(
         Op::run(array_get<n>(a), array_get<n>(b))...)) {
   return reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A(), B()))>::type...>::run(
@@ -541,7 +488,7 @@ constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B
 }
 
 template <typename Reducer, typename Op, typename A, typename B, std::size_t N>
-constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b)
+constexpr auto array_zip_and_reduce(array<A, N> a, array<B, N> b)
     -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type())) {
   return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -549,19 +496,19 @@ constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B,
 /* apply stuff to an array */
 
 template <typename Op, typename A, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())), N> h_array_apply(array<A, N> a, numeric_list<int, n...>) {
+constexpr array<decltype(Op::run(A())), N> h_array_apply(array<A, N> a, numeric_list<int, n...>) {
   return array<decltype(Op::run(A())), N>{{Op::run(array_get<n>(a))...}};
 }
 
 template <typename Op, typename A, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())), N> array_apply(array<A, N> a) {
+constexpr array<decltype(Op::run(A())), N> array_apply(array<A, N> a) {
   return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
 }
 
 /* apply stuff to an array and reduce */
 
 template <typename Reducer, typename Op, typename A, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>)
+constexpr auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>)
     -> decltype(reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A()))>::type...>::run(
         Op::run(array_get<n>(arr))...)) {
   return reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A()))>::type...>::run(
@@ -569,7 +516,7 @@ constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, num
 }
 
 template <typename Reducer, typename Op, typename A, std::size_t N>
-constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a)
+constexpr auto array_apply_and_reduce(array<A, N> a)
     -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type())) {
   return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -582,7 +529,7 @@ constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a)
 template <int n>
 struct h_repeat {
   template <typename t, int... ii>
-  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>) {
+  constexpr static array<t, n> run(t v, numeric_list<int, ii...>) {
     return {{typename id_numeric<int, ii, t>::type(v)...}};
   }
 };
diff --git a/Eigen/src/Core/util/Serializer.h b/Eigen/src/Core/util/Serializer.h
index 1e12820008a..6de7fe247dd 100644
--- a/Eigen/src/Core/util/Serializer.h
+++ b/Eigen/src/Core/util/Serializer.h
@@ -28,7 +28,8 @@ class Serializer;
 
 // Specialization for POD types.
 template <typename T>
-class Serializer<T, typename std::enable_if_t<std::is_trivial<T>::value && std::is_standard_layout<T>::value>> {
+class Serializer<T,
+                 typename std::enable_if_t<std::is_trivially_copyable<T>::value && std::is_standard_layout<T>::value>> {
  public:
   /**
    * Determines the required size of the serialization buffer for a value.
@@ -45,7 +46,7 @@ class Serializer<T, typename std::enable_if_t<std::is_trivial<T>::value && std::
    * \param value the value to serialize.
    * \return the next memory address past the end of the serialized data.
    */
-  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const T& value) {
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const T& value) const {
     if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
     if (EIGEN_PREDICT_FALSE(dest + sizeof(value) > end)) return nullptr;
     EIGEN_USING_STD(memcpy)
@@ -83,7 +84,7 @@ class Serializer<DenseBase<Derived>, void> {
 
   EIGEN_DEVICE_FUNC size_t size(const Derived& value) const { return sizeof(Header) + sizeof(Scalar) * value.size(); }
 
-  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const Derived& value) {
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const Derived& value) const {
     if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
     if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr;
     const size_t header_bytes = sizeof(Header);
@@ -128,7 +129,7 @@ struct serialize_impl;
 
 template <size_t N, typename T1, typename... Ts>
 struct serialize_impl<N, T1, Ts...> {
-  using Serializer = Eigen::Serializer<typename std::decay<T1>::type>;
+  using Serializer = Eigen::Serializer<std::decay_t<T1>>;
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t serialize_size(const T1& value, const Ts&... args) {
     Serializer serializer;
diff --git a/Eigen/src/Core/util/SymbolicIndex.h b/Eigen/src/Core/util/SymbolicIndex.h
index dc204af4ba2..a90e2bb68b6 100644
--- a/Eigen/src/Core/util/SymbolicIndex.h
+++ b/Eigen/src/Core/util/SymbolicIndex.h
@@ -188,10 +188,10 @@ template <typename IndexType>
 class ValueExpr : BaseExpr<ValueExpr<IndexType>> {
  public:
   constexpr ValueExpr() = default;
-  constexpr ValueExpr(IndexType val) : value_(val) {}
+  constexpr ValueExpr(IndexType val) : m_value(val) {}
   template <typename... Tags, typename... Types>
   constexpr IndexType eval_impl(const SymbolValue<Tags, Types>&...) const {
-    return value_;
+    return m_value;
   }
   template <typename... Tags, typename... Types>
   static constexpr IndexType eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
@@ -199,7 +199,7 @@ class ValueExpr : BaseExpr<ValueExpr<IndexType>> {
   }
 
  protected:
-  IndexType value_;
+  IndexType m_value;
 };
 
 // Specialization for compile-time value,
@@ -232,10 +232,10 @@ class SymbolValue<Tag, Index> : public BaseExpr<SymbolValue<Tag, Index>> {
   constexpr SymbolValue() = default;
 
   /** Default constructor from the value \a val */
-  constexpr SymbolValue(Index val) : value_(val) {}
+  constexpr SymbolValue(Index val) : m_value(val) {}
 
   /** \returns the stored value of the symbol */
-  constexpr Index value() const { return value_; }
+  constexpr Index value() const { return m_value; }
 
   /** \returns the stored value of the symbol at compile time, or Undefined if not known. */
   static constexpr Index value_at_compile_time() { return Index(Undefined); }
@@ -251,7 +251,7 @@ class SymbolValue<Tag, Index> : public BaseExpr<SymbolValue<Tag, Index>> {
   }
 
  protected:
-  Index value_;
+  Index m_value;
 };
 
 template <typename Tag, int N>
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index a0e160eba4f..9ae6d69ce09 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -20,11 +20,10 @@ namespace internal {
 
 // useful for unsigned / signed integer comparisons when idx is intended to be non-negative
 template <typename IndexType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename make_unsigned<IndexType>::type returnUnsignedIndexValue(
-    const IndexType& idx) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::make_unsigned_t<IndexType> returnUnsignedIndexValue(const IndexType& idx) {
   EIGEN_STATIC_ASSERT((NumTraits<IndexType>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
   eigen_internal_assert(idx >= 0 && "Index value is negative and target type is unsigned");
-  using UnsignedType = typename make_unsigned<IndexType>::type;
+  using UnsignedType = std::make_unsigned_t<IndexType>;
   return static_cast<UnsignedType>(idx);
 }
 
@@ -97,7 +96,7 @@ struct promote_scalar_arg;
 
 template <typename S, typename T>
 struct promote_scalar_arg<S, T, true> {
-  typedef T type;
+  using type = T;
 };
 
 // Recursively check safe conversion to PromotedType, and then ExprScalar if they are different.
@@ -113,7 +112,7 @@ struct promote_scalar_arg<S, T, false> : promote_scalar_arg_unsupported<S, T, ty
 // We found a match!
 template <typename S, typename T, typename PromotedType>
 struct promote_scalar_arg_unsupported<S, T, PromotedType, true, true> {
-  typedef PromotedType type;
+  using type = PromotedType;
 };
 
 // No match, but no real-to-integer issues, and ExprScalar and current PromotedType are different,
@@ -132,8 +131,7 @@ struct promote_scalar_arg_unsupported<S, T, S, false, true> {};
 
 // classes inheriting no_assignment_operator don't generate a default operator=.
 class no_assignment_operator {
- private:
-  no_assignment_operator& operator=(const no_assignment_operator&);
+  no_assignment_operator& operator=(const no_assignment_operator&) = delete;
 
  protected:
   EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator)
@@ -143,7 +141,7 @@ class no_assignment_operator {
 /** \internal return the index type with the largest number of bits */
 template <typename I1, typename I2>
 struct promote_index_type {
-  typedef std::conditional_t<(sizeof(I1) < sizeof(I2)), I2, I1> type;
+  using type = std::conditional_t<(sizeof(I1) < sizeof(I2)), I2, I1>;
 };
 
 /** \internal If the template parameter Value is Dynamic, this class is just a wrapper around a T variable that
@@ -153,13 +151,12 @@ struct promote_index_type {
 template <typename T, int Value>
 class variable_if_dynamic {
  public:
-  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) {
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
   }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr operator T() const { return T(Value); }
+  EIGEN_DEVICE_FUNC static constexpr T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC constexpr operator T() const { return T(Value); }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T v) const {
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
@@ -171,9 +168,9 @@ class variable_if_dynamic<T, Dynamic> {
   T m_value;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) noexcept : m_value(value) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) noexcept : m_value(value) {}
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE T value() const { return m_value; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE operator T() const { return m_value; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
@@ -182,12 +179,12 @@ class variable_if_dynamic<T, Dynamic> {
 template <typename T, int Value>
 class variable_if_dynamicindex {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) {
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
   }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
+  EIGEN_DEVICE_FUNC static constexpr T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void setValue(T) {}
 };
 
 template <typename T>
@@ -196,8 +193,8 @@ class variable_if_dynamicindex<T, DynamicIndex> {
   EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); }
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {}
-  EIGEN_DEVICE_FUNC T EIGEN_STRONG_INLINE value() const { return m_value; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {}
+  EIGEN_DEVICE_FUNC constexpr T EIGEN_STRONG_INLINE value() const { return m_value; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
@@ -277,17 +274,17 @@ struct find_best_packet_helper;
 
 template <int Size, typename PacketType>
 struct find_best_packet_helper<Size, PacketType, true> {
-  typedef PacketType type;
+  using type = PacketType;
 };
 
 template <int Size, typename PacketType>
 struct find_best_packet_helper<Size, PacketType, false> {
-  typedef typename find_best_packet_helper<Size, typename unpacket_traits<PacketType>::half>::type type;
+  using type = typename find_best_packet_helper<Size, typename unpacket_traits<PacketType>::half>::type;
 };
 
 template <typename T, int Size>
 struct find_best_packet {
-  typedef typename find_best_packet_helper<Size, typename packet_traits<T>::type>::type type;
+  using type = typename find_best_packet_helper<Size, typename packet_traits<T>::type>::type;
 };
 
 template <int Size, typename PacketType,
@@ -349,17 +346,16 @@ template <typename Scalar_, int Rows_, int Cols_,
                                       : (Cols_ == 1 && Rows_ != 1) ? ColMajor
                                                                    : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION),
           int MaxRows_ = Rows_, int MaxCols_ = Cols_>
-class make_proper_matrix_type {
-  enum {
-    IsColVector = Cols_ == 1 && Rows_ != 1,
-    IsRowVector = Rows_ == 1 && Cols_ != 1,
-    Options = IsColVector   ? (Options_ | ColMajor) & ~RowMajor
-              : IsRowVector ? (Options_ | RowMajor) & ~ColMajor
-                            : Options_
-  };
+struct make_proper_matrix_type {
+ private:
+  static constexpr bool IsColVector = Cols_ == 1 && Rows_ != 1;
+  static constexpr bool IsRowVector = Rows_ == 1 && Cols_ != 1;
+  static constexpr int Options = IsColVector   ? (Options_ | ColMajor) & ~RowMajor
+                                 : IsRowVector ? (Options_ | RowMajor) & ~ColMajor
+                                               : Options_;
 
  public:
-  typedef Matrix<Scalar_, Rows_, Cols_, Options, MaxRows_, MaxCols_> type;
+  using type = Matrix<Scalar_, Rows_, Cols_, Options, MaxRows_, MaxCols_>;
 };
 
 constexpr unsigned compute_matrix_flags(int Options) {
@@ -391,32 +387,30 @@ template <typename T, typename BaseClassType, int Flags>
 struct plain_matrix_type_dense;
 template <typename T>
 struct plain_matrix_type<T, Dense> {
-  typedef typename plain_matrix_type_dense<T, typename traits<T>::XprKind, traits<T>::Flags>::type type;
+  using type = typename plain_matrix_type_dense<T, typename traits<T>::XprKind, traits<T>::Flags>::type;
 };
 template <typename T>
 struct plain_matrix_type<T, DiagonalShape> {
-  typedef typename T::PlainObject type;
+  using type = typename T::PlainObject;
 };
 
 template <typename T>
 struct plain_matrix_type<T, SkewSymmetricShape> {
-  typedef typename T::PlainObject type;
+  using type = typename T::PlainObject;
 };
 
 template <typename T, int Flags>
 struct plain_matrix_type_dense<T, MatrixXpr, Flags> {
-  typedef Matrix<typename traits<T>::Scalar, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime,
-                 AutoAlign | (Flags & RowMajorBit ? RowMajor : ColMajor), traits<T>::MaxRowsAtCompileTime,
-                 traits<T>::MaxColsAtCompileTime>
-      type;
+  using type = Matrix<typename traits<T>::Scalar, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime,
+                      AutoAlign | (Flags & RowMajorBit ? RowMajor : ColMajor), traits<T>::MaxRowsAtCompileTime,
+                      traits<T>::MaxColsAtCompileTime>;
 };
 
 template <typename T, int Flags>
 struct plain_matrix_type_dense<T, ArrayXpr, Flags> {
-  typedef Array<typename traits<T>::Scalar, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime,
-                AutoAlign | (Flags & RowMajorBit ? RowMajor : ColMajor), traits<T>::MaxRowsAtCompileTime,
-                traits<T>::MaxColsAtCompileTime>
-      type;
+  using type = Array<typename traits<T>::Scalar, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime,
+                     AutoAlign | (Flags & RowMajorBit ? RowMajor : ColMajor), traits<T>::MaxRowsAtCompileTime,
+                     traits<T>::MaxColsAtCompileTime>;
 };
 
 /* eval : the return type of eval(). For matrices, this is just a const reference
@@ -428,36 +422,28 @@ struct eval;
 
 template <typename T>
 struct eval<T, Dense> {
-  typedef typename plain_matrix_type<T>::type type;
-  //   typedef typename T::PlainObject type;
-  //   typedef T::Matrix<typename traits<T>::Scalar,
-  //                 traits<T>::RowsAtCompileTime,
-  //                 traits<T>::ColsAtCompileTime,
-  //                 AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
-  //                 traits<T>::MaxRowsAtCompileTime,
-  //                 traits<T>::MaxColsAtCompileTime
-  //           > type;
+  using type = typename plain_matrix_type<T>::type;
 };
 
 template <typename T>
 struct eval<T, DiagonalShape> {
-  typedef typename plain_matrix_type<T>::type type;
+  using type = typename plain_matrix_type<T>::type;
 };
 
 template <typename T>
 struct eval<T, SkewSymmetricShape> {
-  typedef typename plain_matrix_type<T>::type type;
+  using type = typename plain_matrix_type<T>::type;
 };
 
 // for matrices, no need to evaluate, just use a const reference to avoid a useless copy
 template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
 struct eval<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>, Dense> {
-  typedef const Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& type;
+  using type = const Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>&;
 };
 
 template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
 struct eval<Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>, Dense> {
-  typedef const Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& type;
+  using type = const Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>&;
 };
 
 /* similar to plain_matrix_type, but using the evaluator's Flags */
@@ -466,37 +452,31 @@ struct plain_object_eval;
 
 template <typename T>
 struct plain_object_eval<T, Dense> {
-  typedef typename plain_matrix_type_dense<T, typename traits<T>::XprKind, evaluator<T>::Flags>::type type;
+  using type = typename plain_matrix_type_dense<T, typename traits<T>::XprKind, evaluator<T>::Flags>::type;
 };
 
 /* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major
  */
 template <typename T>
 struct plain_matrix_type_column_major {
-  enum {
-    Rows = traits<T>::RowsAtCompileTime,
-    Cols = traits<T>::ColsAtCompileTime,
-    MaxRows = traits<T>::MaxRowsAtCompileTime,
-    MaxCols = traits<T>::MaxColsAtCompileTime
-  };
-  typedef Matrix<typename traits<T>::Scalar, Rows, Cols, (MaxRows == 1 && MaxCols != 1) ? RowMajor : ColMajor, MaxRows,
-                 MaxCols>
-      type;
+  static constexpr int Rows = traits<T>::RowsAtCompileTime;
+  static constexpr int Cols = traits<T>::ColsAtCompileTime;
+  static constexpr int MaxRows = traits<T>::MaxRowsAtCompileTime;
+  static constexpr int MaxCols = traits<T>::MaxColsAtCompileTime;
+  using type = Matrix<typename traits<T>::Scalar, Rows, Cols, (MaxRows == 1 && MaxCols != 1) ? RowMajor : ColMajor,
+                      MaxRows, MaxCols>;
 };
 
 /* plain_matrix_type_row_major : same as plain_matrix_type but guaranteed to be row-major
  */
 template <typename T>
 struct plain_matrix_type_row_major {
-  enum {
-    Rows = traits<T>::RowsAtCompileTime,
-    Cols = traits<T>::ColsAtCompileTime,
-    MaxRows = traits<T>::MaxRowsAtCompileTime,
-    MaxCols = traits<T>::MaxColsAtCompileTime
-  };
-  typedef Matrix<typename traits<T>::Scalar, Rows, Cols, (MaxCols == 1 && MaxRows != 1) ? ColMajor : RowMajor, MaxRows,
-                 MaxCols>
-      type;
+  static constexpr int Rows = traits<T>::RowsAtCompileTime;
+  static constexpr int Cols = traits<T>::ColsAtCompileTime;
+  static constexpr int MaxRows = traits<T>::MaxRowsAtCompileTime;
+  static constexpr int MaxCols = traits<T>::MaxColsAtCompileTime;
+  using type = Matrix<typename traits<T>::Scalar, Rows, Cols, (MaxCols == 1 && MaxRows != 1) ? ColMajor : RowMajor,
+                      MaxRows, MaxCols>;
 };
 
 /** \internal The reference selector for template expressions. The idea is that we don't
@@ -504,15 +484,9 @@ struct plain_matrix_type_row_major {
  * objects which should generate no copying overhead. */
 template <typename T>
 struct ref_selector {
-  typedef std::conditional_t<bool(traits<T>::Flags& NestByRefBit), T const&, const T> type;
+  using type = std::conditional_t<bool(traits<T>::Flags& NestByRefBit), T const&, const T>;
 
-  typedef std::conditional_t<bool(traits<T>::Flags& NestByRefBit), T&, T> non_const_type;
-};
-
-/** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
-template <typename T1, typename T2>
-struct transfer_constness {
-  typedef std::conditional_t<bool(internal::is_const<T1>::value), add_const_on_value_type_t<T2>, T2> type;
+  using non_const_type = std::conditional_t<bool(traits<T>::Flags& NestByRefBit), T&, T>;
 };
 
 // However, we still need a mechanism to detect whether an expression which is evaluated multiple time
@@ -546,14 +520,9 @@ struct nested_eval {
     Evaluate = (int(evaluator<T>::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval))
   };
 
-  typedef std::conditional_t<Evaluate, PlainObject, typename ref_selector<T>::type> type;
+  using type = std::conditional_t<Evaluate, PlainObject, typename ref_selector<T>::type>;
 };
 
-template <typename T>
-EIGEN_DEVICE_FUNC inline T* const_cast_ptr(const T* ptr) {
-  return const_cast<T*>(ptr);
-}
-
 template <typename Derived, typename XprKind = typename traits<Derived>::XprKind>
 struct dense_xpr_base {
   /* dense_xpr_base should only ever be used on dense expressions, thus falling either into the MatrixXpr or into the
@@ -562,12 +531,12 @@ struct dense_xpr_base {
 
 template <typename Derived>
 struct dense_xpr_base<Derived, MatrixXpr> {
-  typedef MatrixBase<Derived> type;
+  using type = MatrixBase<Derived>;
 };
 
 template <typename Derived>
 struct dense_xpr_base<Derived, ArrayXpr> {
-  typedef ArrayBase<Derived> type;
+  using type = ArrayBase<Derived>;
 };
 
 template <typename Derived, typename XprKind = typename traits<Derived>::XprKind,
@@ -576,15 +545,15 @@ struct generic_xpr_base;
 
 template <typename Derived, typename XprKind>
 struct generic_xpr_base<Derived, XprKind, Dense> {
-  typedef typename dense_xpr_base<Derived, XprKind>::type type;
+  using type = typename dense_xpr_base<Derived, XprKind>::type;
 };
 
 template <typename XprType, typename CastType>
 struct cast_return_type {
-  typedef typename XprType::Scalar CurrentScalarType;
-  typedef remove_all_t<CastType> CastType_;
-  typedef typename CastType_::Scalar NewScalarType;
-  typedef std::conditional_t<is_same<CurrentScalarType, NewScalarType>::value, const XprType&, CastType> type;
+  using CurrentScalarType = typename XprType::Scalar;
+  using CastType_ = remove_all_t<CastType>;
+  using NewScalarType = typename CastType_::Scalar;
+  using type = std::conditional_t<is_same<CurrentScalarType, NewScalarType>::value, const XprType&, CastType>;
 };
 
 template <typename A, typename B>
@@ -592,15 +561,15 @@ struct promote_storage_type;
 
 template <typename A>
 struct promote_storage_type<A, A> {
-  typedef A ret;
+  using ret = A;
 };
 template <typename A>
 struct promote_storage_type<A, const A> {
-  typedef A ret;
+  using ret = A;
 };
 template <typename A>
 struct promote_storage_type<const A, A> {
-  typedef A ret;
+  using ret = A;
 };
 
 /** \internal Specify the "storage kind" of applying a coefficient-wise
@@ -621,27 +590,27 @@ struct cwise_promote_storage_type;
 
 template <typename A, typename Functor>
 struct cwise_promote_storage_type<A, A, Functor> {
-  typedef A ret;
+  using ret = A;
 };
 template <typename Functor>
 struct cwise_promote_storage_type<Dense, Dense, Functor> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <typename A, typename Functor>
 struct cwise_promote_storage_type<A, Dense, Functor> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <typename B, typename Functor>
 struct cwise_promote_storage_type<Dense, B, Functor> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <typename Functor>
 struct cwise_promote_storage_type<Sparse, Dense, Functor> {
-  typedef Sparse ret;
+  using ret = Sparse;
 };
 template <typename Functor>
 struct cwise_promote_storage_type<Dense, Sparse, Functor> {
-  typedef Sparse ret;
+  using ret = Sparse;
 };
 
 template <typename LhsKind, typename RhsKind, int LhsOrder, int RhsOrder>
@@ -681,74 +650,74 @@ struct product_promote_storage_type;
 
 template <typename A, int ProductTag>
 struct product_promote_storage_type<A, A, ProductTag> {
-  typedef A ret;
+  using ret = A;
 };
 template <int ProductTag>
 struct product_promote_storage_type<Dense, Dense, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <typename A, int ProductTag>
 struct product_promote_storage_type<A, Dense, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <typename B, int ProductTag>
 struct product_promote_storage_type<Dense, B, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 
 template <typename A, int ProductTag>
 struct product_promote_storage_type<A, DiagonalShape, ProductTag> {
-  typedef A ret;
+  using ret = A;
 };
 template <typename B, int ProductTag>
 struct product_promote_storage_type<DiagonalShape, B, ProductTag> {
-  typedef B ret;
+  using ret = B;
 };
 template <int ProductTag>
 struct product_promote_storage_type<Dense, DiagonalShape, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <int ProductTag>
 struct product_promote_storage_type<DiagonalShape, Dense, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 
 template <typename A, int ProductTag>
 struct product_promote_storage_type<A, SkewSymmetricShape, ProductTag> {
-  typedef A ret;
+  using ret = A;
 };
 template <typename B, int ProductTag>
 struct product_promote_storage_type<SkewSymmetricShape, B, ProductTag> {
-  typedef B ret;
+  using ret = B;
 };
 template <int ProductTag>
 struct product_promote_storage_type<Dense, SkewSymmetricShape, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <int ProductTag>
 struct product_promote_storage_type<SkewSymmetricShape, Dense, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <int ProductTag>
 struct product_promote_storage_type<SkewSymmetricShape, SkewSymmetricShape, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 
 template <typename A, int ProductTag>
 struct product_promote_storage_type<A, PermutationStorage, ProductTag> {
-  typedef A ret;
+  using ret = A;
 };
 template <typename B, int ProductTag>
 struct product_promote_storage_type<PermutationStorage, B, ProductTag> {
-  typedef B ret;
+  using ret = B;
 };
 template <int ProductTag>
 struct product_promote_storage_type<Dense, PermutationStorage, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 template <int ProductTag>
 struct product_promote_storage_type<PermutationStorage, Dense, ProductTag> {
-  typedef Dense ret;
+  using ret = Dense;
 };
 
 /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
@@ -756,63 +725,56 @@ struct product_promote_storage_type<PermutationStorage, Dense, ProductTag> {
  */
 template <typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
 struct plain_row_type {
-  typedef Matrix<Scalar, 1, ExpressionType::ColsAtCompileTime,
-                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime>
-      MatrixRowType;
-  typedef Array<Scalar, 1, ExpressionType::ColsAtCompileTime, int(ExpressionType::PlainObject::Options) | int(RowMajor),
-                1, ExpressionType::MaxColsAtCompileTime>
-      ArrayRowType;
+  using MatrixRowType =
+      Matrix<Scalar, 1, ExpressionType::ColsAtCompileTime, int(ExpressionType::PlainObject::Options) | int(RowMajor), 1,
+             ExpressionType::MaxColsAtCompileTime>;
+  using ArrayRowType =
+      Array<Scalar, 1, ExpressionType::ColsAtCompileTime, int(ExpressionType::PlainObject::Options) | int(RowMajor), 1,
+            ExpressionType::MaxColsAtCompileTime>;
 
-  typedef std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixRowType,
-                             ArrayRowType>
-      type;
+  using type = std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixRowType,
+                                  ArrayRowType>;
 };
 
 template <typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
 struct plain_col_type {
-  typedef Matrix<Scalar, ExpressionType::RowsAtCompileTime, 1, ExpressionType::PlainObject::Options & ~RowMajor,
-                 ExpressionType::MaxRowsAtCompileTime, 1>
-      MatrixColType;
-  typedef Array<Scalar, ExpressionType::RowsAtCompileTime, 1, ExpressionType::PlainObject::Options & ~RowMajor,
-                ExpressionType::MaxRowsAtCompileTime, 1>
-      ArrayColType;
+  using MatrixColType =
+      Matrix<Scalar, ExpressionType::RowsAtCompileTime, 1, ExpressionType::PlainObject::Options & ~RowMajor,
+             ExpressionType::MaxRowsAtCompileTime, 1>;
+  using ArrayColType = Array<Scalar, ExpressionType::RowsAtCompileTime, 1,
+                             ExpressionType::PlainObject::Options & ~RowMajor, ExpressionType::MaxRowsAtCompileTime, 1>;
 
-  typedef std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixColType,
-                             ArrayColType>
-      type;
+  using type = std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixColType,
+                                  ArrayColType>;
 };
 
 template <typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
 struct plain_diag_type {
-  enum {
-    diag_size = internal::min_size_prefer_dynamic(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime),
-    max_diag_size = min_size_prefer_fixed(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime)
-  };
-  typedef Matrix<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1>
-      MatrixDiagType;
-  typedef Array<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1> ArrayDiagType;
+  static constexpr int diag_size =
+      internal::min_size_prefer_dynamic(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime);
+  static constexpr int max_diag_size =
+      min_size_prefer_fixed(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime);
+  using MatrixDiagType =
+      Matrix<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1>;
+  using ArrayDiagType = Array<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1>;
 
-  typedef std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixDiagType,
-                             ArrayDiagType>
-      type;
+  using type = std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixDiagType,
+                                  ArrayDiagType>;
 };
 
 template <typename Expr, typename Scalar = typename Expr::Scalar>
 struct plain_constant_type {
-  enum { Options = (traits<Expr>::Flags & RowMajorBit) ? RowMajor : 0 };
+  static constexpr int Options = (traits<Expr>::Flags & RowMajorBit) ? RowMajor : 0;
 
-  typedef Array<Scalar, traits<Expr>::RowsAtCompileTime, traits<Expr>::ColsAtCompileTime, Options,
-                traits<Expr>::MaxRowsAtCompileTime, traits<Expr>::MaxColsAtCompileTime>
-      array_type;
+  using array_type = Array<Scalar, traits<Expr>::RowsAtCompileTime, traits<Expr>::ColsAtCompileTime, Options,
+                           traits<Expr>::MaxRowsAtCompileTime, traits<Expr>::MaxColsAtCompileTime>;
 
-  typedef Matrix<Scalar, traits<Expr>::RowsAtCompileTime, traits<Expr>::ColsAtCompileTime, Options,
-                 traits<Expr>::MaxRowsAtCompileTime, traits<Expr>::MaxColsAtCompileTime>
-      matrix_type;
+  using matrix_type = Matrix<Scalar, traits<Expr>::RowsAtCompileTime, traits<Expr>::ColsAtCompileTime, Options,
+                             traits<Expr>::MaxRowsAtCompileTime, traits<Expr>::MaxColsAtCompileTime>;
 
-  typedef CwiseNullaryOp<
+  using type = CwiseNullaryOp<
       scalar_constant_op<Scalar>,
-      const std::conditional_t<is_same<typename traits<Expr>::XprKind, MatrixXpr>::value, matrix_type, array_type>>
-      type;
+      const std::conditional_t<is_same<typename traits<Expr>::XprKind, MatrixXpr>::value, matrix_type, array_type>>;
 };
 
 template <typename ExpressionType>
@@ -854,7 +816,7 @@ template <typename S1, typename S2>
 struct glue_shapes;
 template <>
 struct glue_shapes<DenseShape, TriangularShape> {
-  typedef TriangularShape type;
+  using type = TriangularShape;
 };
 
 template <typename T1, typename T2>
@@ -1049,34 +1011,34 @@ struct ScalarBinaryOpTraits
 
 template <typename T, typename BinaryOp>
 struct ScalarBinaryOpTraits<T, T, BinaryOp> {
-  typedef T ReturnType;
+  using ReturnType = T;
 };
 
 template <typename T, typename BinaryOp>
 struct ScalarBinaryOpTraits<T, typename NumTraits<std::enable_if_t<NumTraits<T>::IsComplex, T>>::Real, BinaryOp> {
-  typedef T ReturnType;
+  using ReturnType = T;
 };
 template <typename T, typename BinaryOp>
 struct ScalarBinaryOpTraits<typename NumTraits<std::enable_if_t<NumTraits<T>::IsComplex, T>>::Real, T, BinaryOp> {
-  typedef T ReturnType;
+  using ReturnType = T;
 };
 
 // For Matrix * Permutation
 template <typename T, typename BinaryOp>
 struct ScalarBinaryOpTraits<T, void, BinaryOp> {
-  typedef T ReturnType;
+  using ReturnType = T;
 };
 
 // For Permutation * Matrix
 template <typename T, typename BinaryOp>
 struct ScalarBinaryOpTraits<void, T, BinaryOp> {
-  typedef T ReturnType;
+  using ReturnType = T;
 };
 
 // for Permutation*Permutation
 template <typename BinaryOp>
 struct ScalarBinaryOpTraits<void, void, BinaryOp> {
-  typedef void ReturnType;
+  using ReturnType = void;
 };
 
 // We require Lhs and Rhs to have "compatible" scalar types.
diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index 50fa3b8095d..c8489555429 100644
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -222,7 +222,7 @@ class ComplexEigenSolver {
   }
 
   /** \brief Returns the maximum number of iterations. */
-  Index getMaxIterations() { return m_schur.getMaxIterations(); }
+  Index getMaxIterations() const { return m_schur.getMaxIterations(); }
 
  protected:
   EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
@@ -265,8 +265,6 @@ template <typename MatrixType>
 void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm) {
   const Index n = m_eivalues.size();
 
-  matrixnorm = numext::maxi(matrixnorm, (std::numeric_limits<RealScalar>::min)());
-
   // Compute X such that T = X D X^(-1), where D is the diagonal of T.
   // The matrix X is unit triangular.
   m_matX = EigenvectorType::Zero(n, n);
@@ -282,7 +280,8 @@ void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm
       if (z == ComplexScalar(0)) {
         // If the i-th and k-th eigenvalue are equal, then z equals 0.
         // Use a small value instead, to prevent division by zero.
-        numext::real_ref(z) = NumTraits<RealScalar>::epsilon() * matrixnorm;
+        numext::real_ref(z) = numext::maxi(std::numeric_limits<RealScalar>::epsilon() * matrixnorm,
+                                           (std::numeric_limits<RealScalar>::min)());
       }
       m_matX.coeffRef(i, k) = m_matX.coeff(i, k) / z;
     }
diff --git a/Eigen/src/Eigenvalues/ComplexQZ.h b/Eigen/src/Eigenvalues/ComplexQZ.h
new file mode 100644
index 00000000000..1d5f8d0dae9
--- /dev/null
+++ b/Eigen/src/Eigenvalues/ComplexQZ.h
@@ -0,0 +1,651 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Alexey Korepanov
+// Copyright (C) 2025 Ludwig Striet <ludwig.striet@mathematik.uni-freiburg.de>
+//
+// This Source Code Form is subject to the terms of the
+// Mozilla Public License v. 2.0. If a copy of the MPL
+// was not distributed with this file, You can obtain one at
+// https://mozilla.org/MPL/2.0/.
+//
+// Derived from: Eigen/src/Eigenvalues/RealQZ.h
+
+#ifndef EIGEN_COMPLEX_QZ_H_
+#define EIGEN_COMPLEX_QZ_H_
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+ *
+ *
+ * \class ComplexQZ
+ *
+ * \brief Performs a QZ decomposition of a pair of matrices A, B
+ *
+ * \tparam MatrixType_ the type of the input matrices.
+ *
+ * Given two complex square matrices A and B, this class computes the QZ decomposition
+ * \f$ A = Q S Z \f$, \f$ B = Q T Z\f$ where Q and Z are unitary matrices and
+ * S and T are upper-triangular matrices. More precisely, Q and Z fulfill
+ * \f$ Q Q* = Id\f$ and \f$ Z Z* = Id\f$. The generalized Eigenvalues are then
+ * obtained as ratios of corresponding diagonal entries, lambda(i) = S(i,i) / T(i, i).
+ *
+ * The QZ algorithm was introduced in the seminal work "An Algorithm for
+ * Generalized Matrix Eigenvalue Problems" by Moler & Stewart in 1973. The matrix
+ * pair S = A, T = B is first transformed to Hessenberg-Triangular form where S is an
+ * upper Hessenberg matrix and T is an upper Triangular matrix.
+ *
+ * This pair is subsequently reduced to the desired form using implicit QZ shifts as
+ * described in the original paper. The algorithms to find small entries on the
+ * diagonals and subdiagonals are based on the variants in the implementation
+ * for Real matrices in the RealQZ class.
+ *
+ * \sa class RealQZ
+ */
+
+namespace Eigen {
+
+template <typename MatrixType_>
+class ComplexQZ {
+ public:
+  using MatrixType = MatrixType_;
+  using Scalar = typename MatrixType_::Scalar;
+  using RealScalar = typename MatrixType_::RealScalar;
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using Vec2 = Matrix<Scalar, 2, 1>;
+  using Vec3 = Matrix<Scalar, 3, 1>;
+  using Row2 = Matrix<Scalar, 1, 2>;
+  using Mat2 = Matrix<Scalar, 2, 2>;
+
+  /** \brief Returns matrix Q in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix Q.
+   */
+  const MatrixType& matrixQ() const {
+    eigen_assert(m_isInitialized && "ComplexQZ is not initialized.");
+    eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
+    return m_Q;
+  }
+
+  /** \brief Returns matrix Z in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix Z.
+   */
+  const MatrixType& matrixZ() const {
+    eigen_assert(m_isInitialized && "ComplexQZ is not initialized.");
+    eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
+    return m_Z;
+  }
+
+  /** \brief Returns matrix S in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix S.
+   */
+  const MatrixType& matrixS() const {
+    eigen_assert(m_isInitialized && "ComplexQZ is not initialized.");
+    return m_S;
+  }
+
+  /** \brief Returns matrix T in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix T.
+   */
+  const MatrixType& matrixT() const {
+    eigen_assert(m_isInitialized && "ComplexQZ is not initialized.");
+    return m_T;
+  }
+
+  /** \brief Constructor that only allocates storage; call compute() or computeSparse() afterwards.
+   *
+   * \param[in] n         size of the matrices whose QZ decomposition we compute
+   * \param[in] computeQZ initial setting; replaced by the flag passed to compute()/computeSparse()
+   * \param[in] maxIters  iteration limit checked by the QZ reduction loop
+   *
+   * Typically used when the decomposition of two sparse matrices is wanted.
+   */
+  ComplexQZ(Index n, bool computeQZ = true, unsigned int maxIters = 400)
+      : m_n(n),
+        m_maxIters(maxIters),
+        m_computeQZ(computeQZ),
+        m_S(n, n),
+        m_T(n, n),
+        m_Q(computeQZ ? n : (MatrixType::RowsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::RowsAtCompileTime),
+            computeQZ ? n : (MatrixType::ColsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::ColsAtCompileTime)),
+        m_Z(computeQZ ? n : (MatrixType::RowsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::RowsAtCompileTime),
+            computeQZ ? n : (MatrixType::ColsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::ColsAtCompileTime)),
+        m_ws(2 * n) {}
+
+  /** \brief Constructor. computes the QZ decomposition of given matrices
+   * upon creation
+   *
+   * \param[in] A         input matrix A
+   * \param[in] B         input matrix B
+   * \param[in] computeQZ If false, the matrices Q and Z are not computed
+   * \param[in] maxIters  iteration limit checked by the QZ reduction loop
+   *
+   * This constructor calls the compute() method to compute the QZ decomposition.
+   * For sparse inputs, use the size-only constructor followed by computeSparse(...).
+   */
+  ComplexQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true, unsigned int maxIters = 400)
+      : m_n(A.rows()),
+        m_maxIters(maxIters),
+        m_computeQZ(computeQZ),
+        m_S(A.rows(), A.cols()),
+        m_T(A.rows(), A.cols()),
+        m_Q(computeQZ ? m_n : (MatrixType::RowsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::RowsAtCompileTime),
+            computeQZ ? m_n : (MatrixType::ColsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::ColsAtCompileTime)),
+        m_Z(computeQZ ? m_n : (MatrixType::RowsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::RowsAtCompileTime),
+            computeQZ ? m_n : (MatrixType::ColsAtCompileTime == Eigen::Dynamic ? 0 : MatrixType::ColsAtCompileTime)),
+        m_ws(2 * m_n) {
+    compute(A, B, computeQZ);
+  }
+
+  /** \brief Compute the QZ decomposition of complex input matrices
+   *
+   * \param[in] A         Matrix A.
+   * \param[in] B         Matrix B.
+   * \param[in] computeQZ If false, the matrices Q and Z are not computed.
+   */
+  void compute(const MatrixType& A, const MatrixType& B, bool computeQZ = true);
+
+  /** \brief Compute the decomposition of sparse complex input matrices.
+   * Main difference to the compute(...) method is that it computes a
+   * SparseQR decomposition of B
+   *
+   * \param[in] A         Matrix A.
+   * \param[in] B         Matrix B.
+   * \param[in] computeQZ If false, the matrices Q and Z are not computed.
+   */
+  template <typename SparseMatrixType_>
+  void computeSparse(const SparseMatrixType_& A, const SparseMatrixType_& B, bool computeQZ = true);
+
+  /** \brief Reports whether the last computation was successful.
+   *
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   */
+  ComputationInfo info() const { return m_info; }
+
+  /** \brief number of performed QZ steps
+   */
+  unsigned int iterations() const {
+    eigen_assert(m_isInitialized && "ComplexQZ is not initialized.");
+    return m_global_iter;
+  }
+
+ private:
+  Index m_n;                              // size of the (square) problem
+  const unsigned int m_maxIters;          // limit for the local iteration counter of the reduction loop
+  unsigned int m_global_iter = 0;         // total number of QZ steps of the last decomposition
+  bool m_isInitialized = false;           // set by compute()/computeSparse(); guards the accessors
+  bool m_computeQZ;                       // whether Q and Z are accumulated
+  ComputationInfo m_info = InvalidInput;  // defined value until a decomposition has been run
+  MatrixType m_S, m_T, m_Q, m_Z;
+  RealScalar m_normOfT, m_normOfS;  // filled by computeNorms()
+  Vec m_ws;                         // scratch buffer passed to the Householder applications
+
+  // Test if a Scalar is 0 up to a certain tolerance
+  static bool is_negligible(const Scalar x, const RealScalar tol = NumTraits<RealScalar>::epsilon()) {
+    return numext::abs(x) <= tol;
+  }
+
+  void do_QZ_step(Index p, Index q);
+
+  inline Mat2 computeZk2(const Row2& b);
+
+  // This is basically taken from Eigen3::RealQZ
+  void hessenbergTriangular(const MatrixType& A, const MatrixType& B);
+
+  // This function can be called when m_Q and m_Z are initialized and m_S, m_T
+  // are in hessenberg-triangular form
+  void reduceHessenbergTriangular();
+
+  // Sparse variant of hessenbergTriangular(...).
+  template <typename SparseMatrixType_>
+  void hessenbergTriangularSparse(const SparseMatrixType_& A, const SparseMatrixType_& B);
+
+  void computeNorms();
+
+  Index findSmallSubdiagEntry(Index l);
+  Index findSmallDiagEntry(Index f, Index l);
+
+  void push_down_zero_ST(Index k, Index l);
+
+  void reduceDiagonal2x2block(Index i);
+};
+
+template <typename MatrixType_>
+void ComplexQZ<MatrixType_>::compute(const MatrixType& A, const MatrixType& B, bool computeQZ) {
+  m_computeQZ = computeQZ;
+  m_n = A.rows();
+
+  eigen_assert(m_n == A.cols() && "A is not a square matrix");
+  eigen_assert(m_n == B.rows() && m_n == B.cols() && "B is not a square matrix or B is not of the same size as A");
+  // The Householder workspace was sized from the constructor argument; match it to the actual input size.
+  m_ws.resize(2 * m_n);
+  m_isInitialized = true;
+  m_global_iter = 0;
+  // This will initialize m_Q and m_Z (if requested) and bring m_S, m_T to Hessenberg-triangular form
+  hessenbergTriangular(A, B);
+
+  // S is now upper-Hessenberg and T upper-triangular, which is what the
+  // iterative reduction below expects as its starting point.
+  reduceHessenbergTriangular();
+}
+
+// Brings (S, T) = (A, B) to Hessenberg-triangular form. This is basically taken from Eigen3::RealQZ
+template <typename MatrixType_>
+void ComplexQZ<MatrixType_>::hessenbergTriangular(const MatrixType& A, const MatrixType& B) {
+  // Copy A and B, these will be the matrices on which we operate later
+  m_S = A;
+  m_T = B;
+
+  // Perform QR decomposition of the matrix T (the copy of B); T is replaced by its R factor
+  HouseholderQR<MatrixType> qr(m_T);
+  m_T = qr.matrixQR();
+  m_T.template triangularView<StrictlyLower>().setZero();
+
+  if (m_computeQZ) m_Q = qr.householderQ();
+
+  // overwrite S with Q* x S
+  m_S.applyOnTheLeft(qr.householderQ().adjoint());
+
+  if (m_computeQZ) m_Z = MatrixType::Identity(m_n, m_n);
+
+  // reduce S to upper Hessenberg with Givens rotations
+  for (Index j = 0; j <= m_n - 3; j++) {
+    for (Index i = m_n - 1; i >= j + 2; i--) {
+      JacobiRotation<Scalar> G;
+      // delete S(i,j)
+      if (!numext::is_exactly_zero(m_S.coeff(i, j))) {
+        G.makeGivens(m_S.coeff(i - 1, j), m_S.coeff(i, j), &m_S.coeffRef(i - 1, j));
+        m_S.coeffRef(i, j) = Scalar(0);
+        m_T.rightCols(m_n - i + 1).applyOnTheLeft(i - 1, i, G.adjoint());
+        m_S.rightCols(m_n - j - 1).applyOnTheLeft(i - 1, i, G.adjoint());
+        // S(i,j) is expected to be (numerically) zero at this point; flag it otherwise
+        if (!is_negligible(m_S(i, j)))
+          m_info = ComputationInfo::NumericalIssue;  // NOTE(review): reduceHessenbergTriangular() reassigns m_info
+        else
+          m_S(i, j) = Scalar(0);
+        // update Q
+        if (m_computeQZ) m_Q.applyOnTheRight(i - 1, i, G);
+      }
+
+      if (!numext::is_exactly_zero(m_T.coeff(i, i - 1))) {
+        // Compute rotation and update matrix T
+        G.makeGivens(m_T.coeff(i, i), m_T.coeff(i, i - 1), &m_T.coeffRef(i, i));
+        m_T.topRows(i).applyOnTheRight(i - 1, i, G.adjoint());
+        m_T.coeffRef(i, i - 1) = Scalar(0);
+        // Update matrix S
+        m_S.applyOnTheRight(i - 1, i, G.adjoint());
+        // update Z
+        if (m_computeQZ) m_Z.applyOnTheLeft(i - 1, i, G);
+      }
+    }
+  }
+}
+
+template <typename MatrixType>
+template <typename SparseMatrixType_>
+void ComplexQZ<MatrixType>::hessenbergTriangularSparse(const SparseMatrixType_& A, const SparseMatrixType_& B) {
+  m_S = A.toDense();
+
+  SparseQR<SparseMatrix<Scalar, ColMajor>, NaturalOrdering<Index>> sparseQR;
+
+  eigen_assert(B.isCompressed() &&
+               "SparseQR requires a sparse matrix in compressed mode. "
+               "Call .makeCompressed() before passing it to SparseQR");
+
+  // Computing QR decomposition of T...
+  sparseQR.setPivotThreshold(RealScalar(0));  // This prevents the algorithm from doing pivoting
+  sparseQR.compute(B);
+  // T becomes the R factor of the sparse QR decomposition of B; entries below
+  // the diagonal are cleared so that T is exactly upper-triangular.
+  m_T = sparseQR.matrixR();
+  m_T.template triangularView<StrictlyLower>().setZero();
+
+  if (m_computeQZ) m_Q = sparseQR.matrixQ();
+
+  // overwrite S with Q* S
+  m_S = sparseQR.matrixQ().adjoint() * m_S;
+
+  if (m_computeQZ) m_Z = MatrixType::Identity(m_n, m_n);
+
+  // reduce S to upper Hessenberg with Givens rotations
+  for (Index j = 0; j <= m_n - 3; j++) {
+    for (Index i = m_n - 1; i >= j + 2; i--) {
+      JacobiRotation<Scalar> G;
+      // kill S(i,j)
+      // (same exact-zero test as in the dense hessenbergTriangular(...))
+      if (!numext::is_exactly_zero(m_S.coeff(i, j))) {
+        // Rotate rows i-1 and i so that S(i,j) is annihilated
+        G.makeGivens(m_S.coeff(i - 1, j), m_S.coeff(i, j), &m_S.coeffRef(i - 1, j));
+        m_S.coeffRef(i, j) = Scalar(0);
+        m_T.rightCols(m_n - i + 1).applyOnTheLeft(i - 1, i, G.adjoint());
+        m_S.rightCols(m_n - j - 1).applyOnTheLeft(i - 1, i, G.adjoint());
+        // S(i,j) is expected to be (numerically) zero at this point; flag it otherwise
+        if (!is_negligible(m_S(i, j))) {
+          m_info = ComputationInfo::NumericalIssue;
+        }
+        m_S(i, j) = Scalar(0);
+        // update Q
+        if (m_computeQZ) m_Q.applyOnTheRight(i - 1, i, G);
+      }
+
+      if (!numext::is_exactly_zero(m_T.coeff(i, i - 1))) {
+        // Compute rotation and update matrix T
+        G.makeGivens(m_T.coeff(i, i), m_T.coeff(i, i - 1), &m_T.coeffRef(i, i));
+        m_T.topRows(i).applyOnTheRight(i - 1, i, G.adjoint());
+        m_T.coeffRef(i, i - 1) = Scalar(0);
+        // Update matrix S
+        m_S.applyOnTheRight(i - 1, i, G.adjoint());
+        // update Z
+        if (m_computeQZ) m_Z.applyOnTheLeft(i - 1, i, G);
+      }
+    }
+  }
+}
+
+template <typename MatrixType>
+template <typename SparseMatrixType_>
+void ComplexQZ<MatrixType>::computeSparse(const SparseMatrixType_& A, const SparseMatrixType_& B, bool computeQZ) {
+  m_computeQZ = computeQZ;
+  m_n = A.rows();
+  eigen_assert(m_n == A.cols() && "A is not a square matrix");
+  eigen_assert(m_n == B.rows() && m_n == B.cols() && "B is not a square matrix or B is not of the same size as A");
+  m_isInitialized = true;
+  m_global_iter = 0;
+  hessenbergTriangularSparse(A, B);
+  m_ws.resize(2 * m_n);  // workspace was sized from the constructor argument; match the actual input size
+  // S is now upper-Hessenberg and T upper-triangular (done by
+  // hessenbergTriangularSparse(...)), as the iterative reduction below expects.
+  reduceHessenbergTriangular();
+}
+
+template <typename MatrixType_>
+void ComplexQZ<MatrixType_>::reduceHessenbergTriangular() {
+  Index l = m_n - 1, f;
+  unsigned int local_iter = 0;
+  computeNorms();
+  // Deflate from the bottom: l is the last row/column of the block still being reduced.
+  while (l > 0 && local_iter < m_maxIters) {
+    f = findSmallSubdiagEntry(l);
+
+    // Subdiag entry is small -> can be safely set to 0
+    if (f > 0) {
+      m_S.coeffRef(f, f - 1) = Scalar(0);
+    }
+    if (f == l) {  // One root found
+      l--;
+      local_iter = 0;
+    } else if (f == l - 1) {  // Two roots found
+      // We found an undesired non-zero at (f+1,f) in S and eliminate it immediately
+      reduceDiagonal2x2block(f);
+      l -= 2;
+      local_iter = 0;
+    } else {
+      Index z = findSmallDiagEntry(f, l);
+      if (z >= f) {
+        push_down_zero_ST(z, l);
+      } else {
+        do_QZ_step(f, m_n - l - 1);
+        m_global_iter++;  // counts QZ steps only
+      }
+      local_iter++;  // push-downs count toward m_maxIters too, so a push-down that fails to deflate cannot loop forever
+    }
+  }
+
+  m_info = (local_iter < m_maxIters) ? Success : NoConvergence;
+}
+
+template <typename MatrixType_>
+inline typename ComplexQZ<MatrixType_>::Mat2 ComplexQZ<MatrixType_>::computeZk2(const Row2& b) {
+  Mat2 S;
+  S << Scalar(0), Scalar(1), Scalar(1), Scalar(0);  // 2x2 exchange matrix: swaps the two coordinates
+  Vec2 bprime = S * b.adjoint();  // conjugated entries of b, in reversed order
+  JacobiRotation<Scalar> J;
+  J.makeGivens(bprime(0), bprime(1));
+  Mat2 Z = S;
+  Z.applyOnTheLeft(0, 1, J);
+  Z = S * Z;  // per the call-site comments in do_QZ_step, intended so that b * Z = (0, *)
+  return Z;
+}
+
+template <typename MatrixType_>
+void ComplexQZ<MatrixType_>::do_QZ_step(Index p, Index q) {
+  // This is certainly not the most efficient way of doing this,
+  // but a readable one. a(i,j) / b(i,j) read S / T with 1-based indices relative to row/column p.
+  const auto a = [p, this](Index i, Index j) { return m_S(p + i - 1, p + j - 1); };
+  const auto b = [p, this](Index i, Index j) { return m_T(p + i - 1, p + j - 1); };
+  const Index m = m_n - p - q;  // Size of the inner block
+  Scalar x, y, z;
+  // We could introduce doing exceptional shifts from time to time.
+  Scalar W1 = a(m - 1, m - 1) / b(m - 1, m - 1) - a(1, 1) / b(1, 1), W2 = a(m, m) / b(m, m) - a(1, 1) / b(1, 1),
+         W3 = a(m, m - 1) / b(m - 1, m - 1);
+
+  x = (W1 * W2 - a(m - 1, m) / b(m, m) * W3 + W3 * b(m - 1, m) / b(m, m) * a(1, 1) / b(1, 1)) * b(1, 1) / a(2, 1) +
+      a(1, 2) / b(2, 2) - a(1, 1) / b(1, 1) * b(1, 2) / b(2, 2);
+  y = (a(2, 2) / b(2, 2) - a(1, 1) / b(1, 1)) - a(2, 1) / b(1, 1) * b(1, 2) / b(2, 2) - W1 - W2 +
+      W3 * (b(m - 1, m) / b(m, m));
+  z = a(3, 2) / b(2, 2);
+  Vec3 X;
+  const PermutationMatrix<3, 3, int> S3(Vector3i(2, 0, 1));
+  for (Index k = p; k < p + m - 2; k++) {
+    X << x, y, z;
+    Vec2 ess;
+    Scalar tau;
+    RealScalar beta;
+    X.makeHouseholder(ess, tau, beta);
+    // The permutations (S3, used further below) are needed because the makeHouseholder method computes
+    // the householder transformation in a way that the vector is reflected to
+    // (1 0 ... 0) instead of (0 ... 0 1)
+    m_S.template middleRows<3>(k)
+        .rightCols((std::min)(m_n, m_n - k + 1))
+        .applyHouseholderOnTheLeft(ess, tau, m_ws.data());
+    m_T.template middleRows<3>(k).rightCols(m_n - k).applyHouseholderOnTheLeft(ess, tau, m_ws.data());
+    if (m_computeQZ) m_Q.template middleCols<3>(k).applyHouseholderOnTheRight(ess, std::conj(tau), m_ws.data());
+
+    // Compute Matrix Zk1 s.t. (b(k+2,k) ... b(k+2, k+2)) Zk1 = (0,0,*)
+    Vec3 bprime = (m_T.template block<1, 3>(k + 2, k) * S3).adjoint();
+    bprime.makeHouseholder(ess, tau, beta);
+    m_S.template middleCols<3>(k).topRows((std::min)(k + 4, m_n)).applyOnTheRight(S3);
+    m_S.template middleCols<3>(k)
+        .topRows((std::min)(k + 4, m_n))
+        .applyHouseholderOnTheRight(ess, std::conj(tau), m_ws.data());
+    m_S.template middleCols<3>(k).topRows((std::min)(k + 4, m_n)).applyOnTheRight(S3.transpose());
+    m_T.template middleCols<3>(k).topRows((std::min)(k + 3, m_n)).applyOnTheRight(S3);
+    m_T.template middleCols<3>(k)
+        .topRows((std::min)(k + 3, m_n))
+        .applyHouseholderOnTheRight(ess, std::conj(tau), m_ws.data());
+    m_T.template middleCols<3>(k).topRows((std::min)(k + 3, m_n)).applyOnTheRight(S3.transpose());
+    if (m_computeQZ) {
+      m_Z.template middleRows<3>(k).applyOnTheLeft(S3.transpose());
+      m_Z.template middleRows<3>(k).applyHouseholderOnTheLeft(ess, tau, m_ws.data());
+      m_Z.template middleRows<3>(k).applyOnTheLeft(S3);
+    }
+    Mat2 Zk2 = computeZk2(m_T.template block<1, 2>(k + 1, k));
+    m_S.template middleCols<2>(k).topRows((std::min)(k + 4, m_n)).applyOnTheRight(Zk2);
+    m_T.template middleCols<2>(k).topRows((std::min)(k + 3, m_n)).applyOnTheRight(Zk2);
+
+    if (m_computeQZ) m_Z.template middleRows<2>(k).applyOnTheLeft(Zk2.adjoint());
+
+    x = m_S(k + 1, k);
+    y = m_S(k + 2, k);
+    if (k < p + m - 3) {
+      z = m_S(k + 3, k);
+    }
+  };
+
+  // Find a Givens rotation Qn1 s.t. Qn1 (x y)^T = (* 0)
+  JacobiRotation<Scalar> J;
+  J.makeGivens(x, y);
+  m_S.template middleRows<2>(p + m - 2).applyOnTheLeft(0, 1, J.adjoint());
+  m_T.template middleRows<2>(p + m - 2).applyOnTheLeft(0, 1, J.adjoint());
+
+  if (m_computeQZ) m_Q.template middleCols<2>(p + m - 2).applyOnTheRight(0, 1, J);
+
+  // Find a 2x2 matrix Zn1 (built by computeZk2 from a Givens rotation) s.t. (b(n,n-1) b(n,n)) * Zn1 = (0 *)
+  Mat2 Zn1 = computeZk2(m_T.template block<1, 2>(p + m - 1, p + m - 2));
+  m_S.template middleCols<2>(p + m - 2).applyOnTheRight(Zn1);
+  m_T.template middleCols<2>(p + m - 2).applyOnTheRight(Zn1);
+
+  if (m_computeQZ) m_Z.template middleRows<2>(p + m - 2).applyOnTheLeft(Zn1.adjoint());
+}
+
+/** \internal we found an undesired non-zero at (i+1,i) on the subdiagonal of S and reduce the block */
+template <typename MatrixType_>
+void ComplexQZ<MatrixType_>::reduceDiagonal2x2block(Index i) {
+  // Work on copies of the 2x2 diagonal blocks of S and T starting at (i,i); the branch depends on T's diagonal
+  Mat2 Si = m_S.template block<2, 2>(i, i), Ti = m_T.template block<2, 2>(i, i);
+  if (is_negligible(Ti(0, 0)) && !is_negligible(Ti(1, 1))) {  // only T(i,i) is negligible: rotate rows
+    Eigen::JacobiRotation<Scalar> G;
+    G.makeGivens(m_S(i, i), m_S(i + 1, i));
+    m_S.applyOnTheLeft(i, i + 1, G.adjoint());
+    m_T.applyOnTheLeft(i, i + 1, G.adjoint());
+
+    if (m_computeQZ) m_Q.applyOnTheRight(i, i + 1, G);
+
+  } else if (!is_negligible(Ti(0, 0)) && is_negligible(Ti(1, 1))) {  // only T(i+1,i+1) is negligible: rotate columns
+    Eigen::JacobiRotation<Scalar> G;
+    G.makeGivens(m_S(i + 1, i + 1), m_S(i + 1, i));
+    m_S.applyOnTheRight(i, i + 1, G.adjoint());
+    m_T.applyOnTheRight(i, i + 1, G.adjoint());
+    if (m_computeQZ) m_Z.applyOnTheLeft(i, i + 1, G);
+  } else if (!is_negligible(Ti(0, 0)) && !is_negligible((Ti(1, 1)))) {  // neither diagonal entry of T is negligible
+    Scalar mu = Si(0, 0) / Ti(0, 0);
+    Scalar a12_bar = Si(0, 1) - mu * Ti(0, 1);
+    Scalar a22_bar = Si(1, 1) - mu * Ti(1, 1);
+    Scalar p = Scalar(0.5) * (a22_bar / Ti(1, 1) - Ti(0, 1) * Si(1, 0) / (Ti(0, 0) * Ti(1, 1)));
+    RealScalar sgn_p = p.real() >= RealScalar(0) ? RealScalar(1) : RealScalar(-1);
+    Scalar q = Si(1, 0) * a12_bar / (Ti(0, 0) * Ti(1, 1));
+    Scalar r = p * p + q;
+    Scalar lambda = mu + p + sgn_p * numext::sqrt(r);
+    Mat2 E = Si - lambda * Ti;
+    Index l;
+    E.rowwise().norm().maxCoeff(&l);  // l = index of the row of E with the largest norm
+    JacobiRotation<Scalar> G;
+    G.makeGivens(E(l, 1), E(l, 0));
+    m_S.applyOnTheRight(i, i + 1, G.adjoint());
+    m_T.applyOnTheRight(i, i + 1, G.adjoint());
+
+    if (m_computeQZ) m_Z.applyOnTheLeft(i, i + 1, G);
+
+    Mat2 tildeSi = m_S.template block<2, 2>(i, i), tildeTi = m_T.template block<2, 2>(i, i);
+    Mat2 C = tildeSi.norm() < (lambda * tildeTi).norm() ? tildeSi : lambda * tildeTi;
+    G.makeGivens(C(0, 0), C(1, 0));
+    m_S.applyOnTheLeft(i, i + 1, G.adjoint());
+    m_T.applyOnTheLeft(i, i + 1, G.adjoint());
+
+    if (m_computeQZ) m_Q.applyOnTheRight(i, i + 1, G);
+  }  // NOTE(review): no branch handles both T(i,i) and T(i+1,i+1) being negligible
+
+  if (!is_negligible(m_S(i + 1, i), m_normOfS * NumTraits<RealScalar>::epsilon())) {
+    m_info = ComputationInfo::NumericalIssue;
+  } else {
+    m_S(i + 1, i) = Scalar(0);
+  }
+}
+
+/** \internal We found a zero at T(k,k) and want to "push it down" to T(l,l) */
+template <typename MatrixType_>
+void ComplexQZ<MatrixType_>::push_down_zero_ST(Index k, Index l) {
+  // Preconditions (not checked here): T(k,k) is negligible and k <= l;
+  // the caller obtains k from findSmallDiagEntry(f, l).
+  JacobiRotation<Scalar> J;
+  for (Index j = k + 1; j <= l; j++) {
+    // Create a 0 at m_T(j, j)
+    J.makeGivens(m_T(j - 1, j), m_T(j, j), &m_T.coeffRef(j - 1, j));
+    if (m_n - j - 1 > 0) {
+      m_T.rightCols(m_n - j - 1).applyOnTheLeft(j - 1, j, J.adjoint());
+    }
+    m_T.coeffRef(j, j) = Scalar(0);
+
+    m_S.applyOnTheLeft(j - 1, j, J.adjoint());
+
+    if (m_computeQZ) m_Q.applyOnTheRight(j - 1, j, J);
+
+    // Delete the non-desired non-zero at m_S(j, j-2)
+    if (j > 1) {
+      J.makeGivens(std::conj(m_S(j, j - 1)), std::conj(m_S(j, j - 2)));
+      m_S.applyOnTheRight(j - 1, j - 2, J);
+      m_S(j, j - 2) = Scalar(0);
+      m_T.applyOnTheRight(j - 1, j - 2, J);
+      if (m_computeQZ) m_Z.applyOnTheLeft(j - 1, j - 2, J.adjoint());
+    }
+  }
+
+  // Assume we have the desired structure now, up to the non-zero entry at
+  // m_S(l, l-1) which we will delete through a last right-jacobi-rotation
+  J.makeGivens(std::conj(m_S(l, l)), std::conj(m_S(l, l - 1)));
+  m_S.topRows(l + 1).applyOnTheRight(l, l - 1, J);
+
+  if (!is_negligible(m_S(l, l - 1), m_normOfS * NumTraits<RealScalar>::epsilon())) {
+    m_info = ComputationInfo::NumericalIssue;
+  } else {
+    m_S(l, l - 1) = Scalar(0);
+  }
+  m_T.topRows(l + 1).applyOnTheRight(l, l - 1, J);
+
+  if (m_computeQZ) m_Z.applyOnTheLeft(l, l - 1, J.adjoint());
+
+  // Ensure postconditions
+  if (!is_negligible(m_T(l, l)) || !is_negligible(m_S(l, l - 1))) {
+    m_info = ComputationInfo::NumericalIssue;
+  } else {
+    m_T(l, l) = Scalar(0);
+    m_S(l, l - 1) = Scalar(0);
+  }
+}
+
+/** \internal Sums the absolute values of the Hessenberg part of S and of the upper-triangular part of T */
+template <typename MatrixType_>
+void ComplexQZ<MatrixType_>::computeNorms() {
+  const Index n = m_S.cols();
+  RealScalar sumS(0), sumT(0);
+  // Column c of S contributes rows 0..c+1 (clamped to n); row r of T contributes columns r..n-1.
+  for (Index c = 0; c < n; ++c) sumS += m_S.col(c).head((std::min)(n, c + 2)).cwiseAbs().sum();
+  for (Index r = 0; r < n; ++r) sumT += m_T.row(r).tail(n - r).cwiseAbs().sum();
+  m_normOfS = sumS;
+  m_normOfT = sumT;
+}
+
+/** \internal Look for single small sub-diagonal element S(res, res-1) and return res (or 0). Copied from Eigen3 RealQZ
+ * implementation */
+template <typename MatrixType_>
+inline Index ComplexQZ<MatrixType_>::findSmallSubdiagEntry(Index iu) {
+  Index res = iu;
+  while (res > 0) {
+    RealScalar s = numext::abs(m_S.coeff(res - 1, res - 1)) + numext::abs(m_S.coeff(res, res));
+    if (numext::is_exactly_zero(s)) s = m_normOfS;  // s is real: fall back to the norm of S as the reference scale
+    if (numext::abs(m_S.coeff(res, res - 1)) < NumTraits<RealScalar>::epsilon() * s) break;
+    res--;
+  }
+  return res;
+}
+
+// Scans the diagonal of T from index l down to f.
+/** \internal Returns the largest res in [f, l] whose diagonal entry T(res, res) is at most epsilon times
+ * the norm of T, or f-1 when there is none. Copied from Eigen3 RealQZ implementation. */
+template <typename MatrixType_>
+inline Index ComplexQZ<MatrixType_>::findSmallDiagEntry(Index f, Index l) {
+  const RealScalar threshold = NumTraits<RealScalar>::epsilon() * m_normOfT;
+  Index idx = l;
+  for (; idx >= f; --idx) {
+    if (numext::abs(m_T.coeff(idx, idx)) <= threshold) break;
+  }
+  return idx;
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_COMPLEX_QZ_H_
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index 22433f2bde4..86d6349c15a 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -233,7 +233,7 @@ class ComplexSchur {
   }
 
   /** \brief Returns the maximum number of iterations. */
-  Index getMaxIterations() { return m_maxIters; }
+  Index getMaxIterations() const { return m_maxIters; }
 
   /** \brief Maximum number of iterations per row.
    *
@@ -277,7 +277,8 @@ typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::compu
   using std::abs;
   if ((iter == 10 || iter == 20) && iu > 1) {
     // exceptional shift, taken from http://www.netlib.org/eispack/comqr.f
-    return abs(numext::real(m_matT.coeff(iu, iu - 1))) + abs(numext::real(m_matT.coeff(iu - 1, iu - 2)));
+    return ComplexSchur<MatrixType>::ComplexScalar(abs(numext::real(m_matT.coeff(iu, iu - 1))) +
+                                                   abs(numext::real(m_matT.coeff(iu - 1, iu - 2))));
   }
 
   // compute the shift as one of the eigenvalues of t, the 2x2
@@ -362,7 +363,7 @@ struct complex_schur_reduce_to_hessenberg<MatrixType, false> {
     _this.m_hess.compute(matrix);
     _this.m_matT = _this.m_hess.matrixH().template cast<ComplexScalar>();
     if (computeU) {
-      // This may cause an allocation which seems to be avoidable
+      // TODO: this temporary allocation could potentially be avoided.
       MatrixType Q = _this.m_hess.matrixQ();
       _this.m_matU = Q.template cast<ComplexScalar>();
     }
diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index 9dba7bd1861..f439912070c 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -292,7 +292,7 @@ class EigenSolver {
   }
 
   /** \brief Returns the maximum number of iterations. */
-  Index getMaxIterations() { return m_realSchur.getMaxIterations(); }
+  Index getMaxIterations() const { return m_realSchur.getMaxIterations(); }
 
  private:
   void doComputeEigenvectors();
diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index f79ee331ac5..42fe3fd6eb7 100644
--- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -317,7 +317,7 @@ namespace internal {
  * HessenbergDecomposition class until the it is assigned or evaluated for
  * some other reason (the reference should remain valid during the life time
  * of this object). This class is the return type of
- * HessenbergDecomposition::matrixH(); there is probably no other use for this
+ * HessenbergDecomposition::matrixH(); there is no other intended use for this
  * class.
  */
 template <typename MatrixType>
diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index 62227bdc166..38f46f81564 100644
--- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -111,8 +111,7 @@ template <typename Derived>
 inline typename MatrixBase<Derived>::RealScalar MatrixBase<Derived>::operatorNorm() const {
   using std::sqrt;
   typename Derived::PlainObject m_eval(derived());
-  // FIXME if it is really guaranteed that the eigenvalues are already sorted,
-  // then we don't need to compute a maxCoeff() here, comparing the 1st and last ones is enough.
+  // FIXME: if eigenvalues are guaranteed to be sorted, comparing the first and last is sufficient.
   return sqrt((m_eval * m_eval.adjoint()).eval().template selfadjointView<Lower>().eigenvalues().maxCoeff());
 }
 
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index a54d82d4e3d..b091d90b0f2 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -544,9 +544,11 @@ RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const Ma
         // and T.block(f,f, l-f+1,l-f+1) is invertible uper-triangular, which allows to
         // apply a QR-like iteration to rows and columns f..l.
         step(f, l, local_iter);
-        local_iter++;
+        // count QR-like steps
         m_global_iter++;
       }
+      // count iterations toward m_maxIters
+      local_iter++;
     }
   }
   // check if we converged before reaching iterations limit
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index 54a74e2f59f..b9dbc6ab035 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -207,7 +207,7 @@ class RealSchur {
   }
 
   /** \brief Returns the maximum number of iterations. */
-  Index getMaxIterations() { return m_maxIters; }
+  Index getMaxIterations() const { return m_maxIters; }
 
   /** \brief Maximum number of iterations per row.
    *
@@ -343,9 +343,9 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
 template <typename MatrixType>
 inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT() {
   const Index size = m_matT.cols();
-  // FIXME to be efficient the following would requires a triangular reduxion code
-  // Scalar norm = m_matT.upper().cwiseAbs().sum()
-  //               + m_matT.bottomLeftCorner(size-1,size-1).diagonal().cwiseAbs().sum();
+  // m_matT is upper-Hessenberg, so per column only rows [0, j+1] are nonzero.
+  // The column-wise loop touches ~n^2/2 entries; scanning the full matrix
+  // would double that, and TriangularView has no direct cwiseAbs().sum().
   Scalar norm(0);
   for (Index j = 0; j < size; ++j) norm += m_matT.col(j).segment(0, (std::min)(size, j + 2)).cwiseAbs().sum();
   return norm;
@@ -409,7 +409,7 @@ inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& ex
   shiftInfo.coeffRef(2) = m_matT.coeff(iu, iu - 1) * m_matT.coeff(iu - 1, iu);
 
   // Alternate exceptional shifting strategy every 16 iterations.
-  if (iter % 16 == 0) {
+  if (iter > 0 && iter % 16 == 0) {
     // Wilkinson's original ad hoc shift
     if (iter % 32 != 0) {
       exshift += shiftInfo.coeff(0);
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index f84da913996..65533d88071 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -25,7 +25,7 @@ namespace internal {
 template <typename SolverType, int Size, bool IsComplex>
 struct direct_selfadjoint_eigenvalues;
 
-template <typename MatrixType, typename DiagType, typename SubDiagType>
+template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
 EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
                                                               const Index maxIterations, bool computeEigenvectors,
                                                               MatrixType& eivec);
@@ -325,6 +325,22 @@ class SelfAdjointEigenSolver {
     return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint();
   }
 
+  /** \brief Computes the matrix exponential of the matrix.
+   *
+   * \returns the matrix exponential of the matrix.
+   *
+   * \pre The eigenvalues and eigenvectors of a selfadjoint matrix
+   * (not necessarily positive-definite) have been computed before.
+   *
+   * \sa operatorInverseSqrt(), operatorSqrt(),
+   * <a href="unsupported/group__MatrixFunctions__Module.html">MatrixFunctions Module</a>
+   */
+  EIGEN_DEVICE_FUNC MatrixType operatorExp() const {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec * m_eivalues.array().exp().matrix().asDiagonal() * m_eivec.adjoint();
+  }
+
   /** \brief Computes the inverse square root of the matrix.
    *
    * \returns the inverse positive-definite square root of the matrix
@@ -422,7 +438,7 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
     m_eivec = matrix;
     m_eivalues.coeffRef(0, 0) = numext::real(m_eivec.coeff(0, 0));
     if (computeEigenvectors) m_eivec.setOnes(n, n);
-    m_info = Success;
+    m_info = (numext::isfinite)(m_eivalues.coeffRef(0, 0)) ? Success : NoConvergence;
     m_isInitialized = true;
     m_eigenvectorsOk = computeEigenvectors;
     return *this;
@@ -432,18 +448,29 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
   RealVectorType& diag = m_eivalues;
   EigenvectorsType& mat = m_eivec;
 
-  // map the matrix coefficients to [-1:1] to avoid over- and underflow.
+  // Scale the matrix to [-1:1] to avoid overflow/underflow during tridiagonalization
+  // and subsequent QR iteration. This uniform scaling ensures the tridiagonal output is
+  // well-conditioned. Note: for block-diagonal matrices with widely separated scales, this
+  // can underflow small blocks. Users with such matrices should tridiagonalize separately
+  // and call computeFromTridiagonal(), which uses per-block scaling.
   mat = matrix.template triangularView<Lower>();
   RealScalar scale = mat.cwiseAbs().maxCoeff();
+  if (!(numext::isfinite)(scale)) {
+    // Input contains Inf or NaN.
+    m_info = NoConvergence;
+    m_isInitialized = true;
+    m_eigenvectorsOk = false;
+    return *this;
+  }
   if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
   mat.template triangularView<Lower>() /= scale;
   m_subdiag.resize(n - 1);
   m_hcoeffs.resize(n - 1);
   internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, m_workspace, computeEigenvectors);
 
-  m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+  m_info = internal::computeFromTridiagonal_impl<false>(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
 
-  // scale back the eigen values
+  // Scale back the eigenvalues.
   m_eivalues *= scale;
 
   m_isInitialized = true;
@@ -454,15 +481,31 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<Mat
 template <typename MatrixType>
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::computeFromTridiagonal(
     const RealVectorType& diag, const SubDiagonalType& subdiag, int options) {
-  // TODO : Add an option to scale the values beforehand
   bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;
 
   m_eivalues = diag;
   m_subdiag = subdiag;
+
+  // Check for Inf/NaN in the input.
+  {
+    RealScalar scale = RealScalar(0);
+    if (m_eivalues.size() > 0) scale = m_eivalues.cwiseAbs().maxCoeff();
+    if (m_subdiag.size() > 0) scale = numext::maxi(scale, m_subdiag.cwiseAbs().maxCoeff());
+    if (!(numext::isfinite)(scale)) {
+      m_info = NoConvergence;
+      m_isInitialized = true;
+      m_eigenvectorsOk = false;
+      return *this;
+    }
+  }
+
   if (computeEigenvectors) {
     m_eivec.setIdentity(diag.size(), diag.size());
   }
-  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+  // Use per-deflation-block scaling (like LAPACK's DSTERF) to avoid losing
+  // precision when the tridiagonal entries span a wide range of magnitudes.
+  m_info =
+      internal::computeFromTridiagonal_impl<true>(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
 
   m_isInitialized = true;
   m_eigenvectorsOk = computeEigenvectors;
@@ -474,6 +517,10 @@ namespace internal {
  * \internal
  * \brief Compute the eigendecomposition from a tridiagonal matrix
  *
+ * \tparam PerBlockScaling If true, each deflation block is independently scaled to [-1,1] before
+ *         QR iteration, following LAPACK's DSTERF approach. This prevents precision loss when entries
+ *         span a wide range of magnitudes. When false, the caller is responsible for ensuring the
+ *         entries are in a safe range (e.g. by pre-scaling the dense matrix before tridiagonalization).
  * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
  * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
  * \param[in] maxIterations : the maximum number of iterations
@@ -481,7 +528,7 @@ namespace internal {
  * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
  * \returns \c Success or \c NoConvergence
  */
-template <typename MatrixType, typename DiagType, typename SubDiagType>
+template <bool PerBlockScaling, typename MatrixType, typename DiagType, typename SubDiagType>
 EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
                                                               const Index maxIterations, bool computeEigenvectors,
                                                               MatrixType& eivec) {
@@ -496,21 +543,32 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
   typedef typename DiagType::RealScalar RealScalar;
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
   const RealScalar precision_inv = RealScalar(1) / NumTraits<RealScalar>::epsilon();
-  while (end > 0) {
-    for (Index i = start; i < end; ++i) {
+
+  // Helper lambda for the deflation test.
+  auto deflate = [&](Index lo, Index hi) {
+    for (Index i = lo; i < hi; ++i) {
       if (numext::abs(subdiag[i]) < considerAsZero) {
         subdiag[i] = RealScalar(0);
       } else {
-        // abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
-        // Scaled to prevent underflows.
         const RealScalar scaled_subdiag = precision_inv * subdiag[i];
         if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i]) + numext::abs(diag[i + 1]))) {
           subdiag[i] = RealScalar(0);
         }
       }
     }
+  };
+
+  // For per-block scaling, track the currently scaled block and its scale factor.
+  // When the outer loop identifies a block outside the scaled region, unscale the old
+  // block and scale the new one. This keeps the same outer loop structure (one QR step
+  // per iteration) while ensuring each block is processed in scaled coordinates.
+  Index scaled_start = -1, scaled_end = -1;
+  RealScalar block_scale = RealScalar(1);
+
+  while (end > 0) {
+    deflate(start, end);
 
-    // find the largest unreduced block at the end of the matrix.
+    // Find the largest unreduced block at the end of the matrix.
     while (end > 0 && numext::is_exactly_zero(subdiag[end - 1])) {
       end--;
     }
@@ -523,17 +581,49 @@ EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, Su
     start = end - 1;
     while (start > 0 && !numext::is_exactly_zero(subdiag[start - 1])) start--;
 
+    if (PerBlockScaling) {
+      // Check if we've moved to a different block than the one currently scaled.
+      if (start != scaled_start || end != scaled_end) {
+        // Unscale the previous block if it was scaled.
+        if (block_scale != RealScalar(1)) {
+          for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
+          for (Index i = scaled_start; i < scaled_end; ++i) {
+            if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
+          }
+          block_scale = RealScalar(1);
+        }
+        // Compute the norm and scale the new block to [-1:1].
+        RealScalar block_norm = RealScalar(0);
+        for (Index i = start; i <= end; ++i) block_norm = numext::maxi(block_norm, numext::abs(diag[i]));
+        for (Index i = start; i < end; ++i) block_norm = numext::maxi(block_norm, numext::abs(subdiag[i]));
+        if (block_norm > RealScalar(0) && block_norm != RealScalar(1)) {
+          block_scale = RealScalar(1) / block_norm;
+          for (Index i = start; i <= end; ++i) diag[i] *= block_scale;
+          for (Index i = start; i < end; ++i) subdiag[i] *= block_scale;
+        }
+        scaled_start = start;
+        scaled_end = end;
+      }
+    }
+
     internal::tridiagonal_qr_step<MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor>(
         diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
   }
+
+  // Unscale any remaining scaled block.
+  if (PerBlockScaling && block_scale != RealScalar(1)) {
+    for (Index i = scaled_start; i <= scaled_end; ++i) diag[i] /= block_scale;
+    for (Index i = scaled_start; i < scaled_end; ++i) {
+      if (!numext::is_exactly_zero(subdiag[i])) subdiag[i] /= block_scale;
+    }
+  }
   if (iter <= maxIterations * n)
     info = Success;
   else
     info = NoConvergence;
 
   // Sort eigenvalues and corresponding vectors.
-  // TODO make the sort optional ?
-  // TODO use a better sort algorithm !!
+  // TODO: make the sort optional and use a more efficient sorting algorithm.
   if (info == Success) {
     for (Index i = 0; i < n - 1; ++i) {
       Index k;
@@ -637,16 +727,38 @@ struct direct_selfadjoint_eigenvalues<SolverType, 3, false> {
 
     // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
     Scalar shift = mat.trace() / Scalar(3);
-    // TODO Avoid this copy. Currently it is necessary to suppress bogus values when determining maxCoeff and for
-    // computing the eigenvectors later
+    // TODO: avoid this copy. Currently necessary to suppress bogus values when determining maxCoeff and for
+    // computing the eigenvectors later.
     MatrixType scaledMat = mat.template selfadjointView<Lower>();
     scaledMat.diagonal().array() -= shift;
     Scalar scale = scaledMat.cwiseAbs().maxCoeff();
-    if (scale > 0) scaledMat /= scale;  // TODO for scale==0 we could save the remaining operations
+    if (scale > 0) scaledMat /= scale;  // TODO: skip remaining operations when scale==0.
 
     // compute the eigenvalues
     computeRoots(scaledMat, eivals);
 
+    // computeRoots produces theoretically sorted roots, but floating-point
+    // rounding in the trigonometric formulas can break the ordering.
+    // Enforce sorting with a 3-element compare-and-swap network.
+    {
+      Scalar tmp;
+      if (eivals(0) > eivals(1)) {
+        tmp = eivals(0);
+        eivals(0) = eivals(1);
+        eivals(1) = tmp;
+      }
+      if (eivals(1) > eivals(2)) {
+        tmp = eivals(1);
+        eivals(1) = eivals(2);
+        eivals(2) = tmp;
+      }
+      if (eivals(0) > eivals(1)) {
+        tmp = eivals(0);
+        eivals(0) = eivals(1);
+        eivals(1) = tmp;
+      }
+    }
+
     // compute the eigenvectors
     if (computeEigenvectors) {
       if ((eivals(2) - eivals(0)) <= Eigen::NumTraits<Scalar>::epsilon()) {
@@ -676,7 +788,7 @@ struct direct_selfadjoint_eigenvalues<SolverType, 3, false> {
         if (d0 <= 2 * Eigen::NumTraits<Scalar>::epsilon() * d1) {
           // If d0 is too small, then the two other eigenvalues are numerically the same,
           // and thus we only have to ortho-normalize the near orthogonal vector we saved above.
-          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l)) * eivecs.col(l);
+          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l)) * eivecs.col(k);
           eivecs.col(l).normalize();
         } else {
           tmp = scaledMat;
@@ -836,7 +948,7 @@ EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar*
 
     // apply the givens rotation to the unit matrix Q = Q * G
     if (matrixQ) {
-      // FIXME if StorageOrder == RowMajor this operation is not very efficient
+      // FIXME: this operation is inefficient for RowMajor storage order.
       Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > q(matrixQ, n, n);
       q.applyOnTheRight(k, k + 1, rot);
     }
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 9cc92011f7e..1dbe009b918 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -22,7 +22,11 @@ template <typename MatrixType>
 struct TridiagonalizationMatrixTReturnType;
 template <typename MatrixType>
 struct traits<TridiagonalizationMatrixTReturnType<MatrixType>> : public traits<typename MatrixType::PlainObject> {
-  typedef typename MatrixType::PlainObject ReturnType;  // FIXME shall it be a BandMatrix?
+  // matrixT() returns a dense n x n matrix. A band-stored alternative (e.g. a
+  // future matrixTBand() returning BandMatrix<Scalar, Dynamic, Dynamic, 1, 1>)
+  // would be ~3n storage instead of n^2, but changing this ReturnType in place
+  // would be API-breaking for callers that assume a dense matrix.
+  typedef typename MatrixType::PlainObject ReturnType;
   enum { Flags = 0 };
 };
 
@@ -306,7 +310,8 @@ typename Tridiagonalization<MatrixType>::SubDiagonalReturnType Tridiagonalizatio
 namespace internal {
 
 /** \internal
- * Performs a tridiagonal decomposition of the selfadjoint matrix \a matA in-place.
+ * Unblocked tridiagonal decomposition of the selfadjoint matrix \a matA in-place.
+ * Processes one column at a time using Level 2 BLAS operations (SYMV, SYR2).
  *
  * \param[in,out] matA On input the selfadjoint matrix. Only the \b lower triangular part is referenced.
  *                     On output, the strict upper part is left unchanged, and the lower triangular part
@@ -329,7 +334,7 @@ namespace internal {
  * \sa Tridiagonalization::packedMatrix()
  */
 template <typename MatrixType, typename CoeffVectorType>
-EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) {
+EIGEN_DEVICE_FUNC void tridiagonalization_inplace_unblocked(MatrixType& matA, CoeffVectorType& hCoeffs) {
   using numext::conj;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
@@ -364,6 +369,159 @@ EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorT
   }
 }
 
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+/** \internal
+ * Blocked tridiagonal decomposition (analogous to LAPACK's dsytrd/dlatrd).
+ * Processes columns in panels of BlockSize, accumulating Householder reflectors
+ * and deferring the symmetric rank-2k update to use Level 3 BLAS (triangular GEMM).
+ * Falls back to the unblocked algorithm for the last (partial) panel.
+ */
+template <typename MatrixType, typename CoeffVectorType>
+void tridiagonalization_inplace_blocked(MatrixType& matA, CoeffVectorType& hCoeffs, Index nb = 16) {
+  using numext::conj;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  const Index n = matA.rows();
+  eigen_assert(n == matA.cols());
+  eigen_assert(n == hCoeffs.size() + 1);
+  eigen_assert(nb >= 2 && nb < n);
+
+  enum {
+    StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+    RhsStorageOrder = (StorageOrder == ColMajor) ? RowMajor : ColMajor
+  };
+
+  // Workspace: W matrix (n x nb) for deferred update vectors, temp vector (nb) for GEMV, betas (nb).
+  typedef Matrix<Scalar, Dynamic, Dynamic, StorageOrder> WorkMatrixType;
+  WorkMatrixType W(n, nb);
+  Matrix<Scalar, Dynamic, 1> temp(nb);
+  Matrix<RealScalar, Dynamic, 1> betas(nb);
+
+  // Pre-allocate GEMM blocking workspace for the largest trailing matrix (first panel).
+  // Reused across all panels to avoid repeated heap allocations.
+  typedef gemm_blocking_space<StorageOrder, Scalar, Scalar, Dynamic, Dynamic, Dynamic> BlockingType;
+  const Index maxTrailingSize = n - nb;
+  BlockingType blocking(maxTrailingSize, maxTrailingSize, nb, 1, false);
+
+  Index j0 = 0;
+  for (; j0 + nb < n - 1; j0 += nb) {
+    const Index j_end = j0 + nb;
+
+    // ---- Panel factorization (dlatrd) ----
+    // Process columns j0..j_end-1, computing Householder vectors (stored in matA)
+    // and update vectors (stored in W). The rank-2k update to the trailing
+    // submatrix is deferred until after the panel.
+    for (Index j = j0; j < j_end; ++j) {
+      const Index local_j = j - j0;
+      const Index remainingSize = n - j - 1;
+
+      // Step 1: Update column j for deferred rank-2 updates from columns j0..j-1.
+      // A(j:n-1, j) -= V * W(j,:)^H + W * V(j,:)^H
+      // where V = matA(j:n-1, j0:j-1) holds Householder vectors,
+      // and W(j:n-1, 0:local_j-1) holds the corresponding update vectors.
+      if (local_j > 0) {
+        auto col_j = matA.col(j).segment(j, n - j);
+        col_j.noalias() -= matA.block(j, j0, n - j, local_j) * W.row(j).head(local_j).adjoint();
+        col_j.noalias() -= W.block(j, 0, n - j, local_j) * matA.row(j).segment(j0, local_j).adjoint();
+        // Keep diagonal real (for complex scalars; no-op for real).
+        matA.coeffRef(j, j) = numext::real(matA.coeff(j, j));
+      }
+
+      // Step 2: Compute Householder reflector for column j.
+      RealScalar beta;
+      Scalar h;
+      matA.col(j).tail(remainingSize).makeHouseholderInPlace(h, beta);
+      betas(local_j) = beta;
+      matA.col(j).coeffRef(j + 1) = Scalar(1);
+
+      auto v = matA.col(j).tail(remainingSize);
+      auto w = W.col(local_j).tail(remainingSize);
+
+      // Step 3: Compute w = conj(h) * A_eff * v where A_eff accounts for deferred updates.
+      // Start with SYMV on the stored (not yet updated) trailing submatrix.
+      w.noalias() =
+          matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>() * (conj(h) * v);
+
+      // GEMV corrections for deferred rank-2 updates within this panel.
+      if (local_j > 0) {
+        auto V_prev = matA.block(j + 1, j0, remainingSize, local_j);
+        auto W_prev = W.block(j + 1, 0, remainingSize, local_j);
+
+        // w -= conj(h) * V_prev * (W_prev^H * v)
+        temp.head(local_j).noalias() = W_prev.adjoint() * v;
+        w.noalias() -= conj(h) * (V_prev * temp.head(local_j));
+
+        // w -= conj(h) * W_prev * (V_prev^H * v)
+        temp.head(local_j).noalias() = V_prev.adjoint() * v;
+        w.noalias() -= conj(h) * (W_prev * temp.head(local_j));
+      }
+
+      // Step 4: Half-dot correction: w -= 0.5 * conj(h) * (w^H * v) * v
+      w += (conj(h) * RealScalar(-0.5) * w.dot(v)) * v;
+
+      hCoeffs.coeffRef(j) = h;
+    }
+
+    // ---- Apply rank-2k update to trailing submatrix ----
+    // A(j_end:n-1, j_end:n-1) -= V_trail * W_trail^H + W_trail * V_trail^H
+    // using Level 3 BLAS (triangular GEMM).
+    const Index trailingSize = n - j_end;
+    if (trailingSize > 0) {
+      const Scalar* V_data = &matA.coeffRef(j_end, j0);
+      const Scalar* W_data = &W.coeffRef(j_end, 0);
+      Scalar* C_data = &matA.coeffRef(j_end, j_end);
+      const Index V_stride = matA.outerStride();
+      const Index W_stride = W.outerStride();
+      const Index C_stride = matA.outerStride();
+
+      // C -= V * W^H
+      general_matrix_matrix_triangular_product<Index, Scalar, StorageOrder, false, Scalar, RhsStorageOrder,
+                                               NumTraits<Scalar>::IsComplex, StorageOrder, 1,
+                                               Lower>::run(trailingSize, nb, V_data, V_stride, W_data, W_stride, C_data,
+                                                           1, C_stride, Scalar(-1), blocking);
+
+      // C -= W * V^H
+      general_matrix_matrix_triangular_product<Index, Scalar, StorageOrder, false, Scalar, RhsStorageOrder,
+                                               NumTraits<Scalar>::IsComplex, StorageOrder, 1,
+                                               Lower>::run(trailingSize, nb, W_data, W_stride, V_data, V_stride, C_data,
+                                                           1, C_stride, Scalar(-1), blocking);
+    }
+
+    // Restore subdiagonal entries (overwritten with 1 for Householder vectors).
+    for (Index j = j0; j < j_end; ++j) {
+      matA.coeffRef(j + 1, j) = betas(j - j0);
+    }
+  }
+
+  // ---- Process remaining columns with unblocked algorithm ----
+  if (j0 < n - 1) {
+    const Index remaining = n - j0;
+    auto trailing = matA.bottomRightCorner(remaining, remaining);
+    auto hCoeffs_tail = hCoeffs.segment(j0, remaining - 1);
+    tridiagonalization_inplace_unblocked(trailing, hCoeffs_tail);
+  }
+}
+#endif  // !EIGEN_GPU_COMPILE_PHASE
+
+/** \internal
+ * Dispatches to blocked or unblocked tridiagonalization based on matrix size.
+ * On GPU, always uses the unblocked algorithm.
+ */
+template <typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) {
+  Index n = matA.rows();
+  eigen_assert(n == matA.cols());
+  eigen_assert(n == hCoeffs.size() + 1 || n == 1);
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+  if ((MatrixType::RowsAtCompileTime == Dynamic || MatrixType::ColsAtCompileTime == Dynamic) && n >= 96) {
+    tridiagonalization_inplace_blocked(matA, hCoeffs);
+    return;
+  }
+#endif
+  tridiagonalization_inplace_unblocked(matA, hCoeffs);
+}
+
 // forward declaration, implementation at the end of this file
 template <typename MatrixType, int Size = MatrixType::ColsAtCompileTime,
           bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index a00ed178ddd..02f3fb2c896 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -130,6 +130,18 @@ class AngleAxis : public RotationBase<AngleAxis<Scalar_>, 3> {
   EIGEN_DEVICE_FUNC AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
   EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix(void) const;
 
+  /** Applies the rotation to a 3D vector using Rodrigues' formula directly,
+   * without constructing the full rotation matrix. */
+  template <typename OtherVectorType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const OtherVectorType& v) const {
+    EIGEN_USING_STD(sin)
+    EIGEN_USING_STD(cos)
+    // Rodrigues' rotation formula: v' = v*cos(θ) + (k×v)*sin(θ) + k*(k·v)*(1-cos(θ))
+    const Scalar c = cos(m_angle);
+    const Scalar s = sin(m_angle);
+    return v * c + m_axis.cross(v) * s + m_axis * (m_axis.dot(v) * (Scalar(1) - c));
+  }
+
   /** \returns \c *this with scalar type casted to \a NewScalarType
    *
    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
@@ -197,9 +209,7 @@ EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const Quaterni
 template <typename Scalar>
 template <typename Derived>
 EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat) {
-  // Since a direct conversion would not be really faster,
-  // let's use the robust Quaternion implementation:
-  return *this = QuaternionType(mat);
+  return fromRotationMatrix(mat);
 }
 
 /**
@@ -208,7 +218,65 @@ EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBa
 template <typename Scalar>
 template <typename Derived>
 EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat) {
-  return *this = QuaternionType(mat);
+  EIGEN_USING_STD(atan2)
+  EIGEN_USING_STD(sqrt)
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<Scalar, typename Derived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  eigen_assert(mat.cols() == 3 && mat.rows() == 3);
+
+  const typename internal::nested_eval<Derived, 3>::type m(mat);
+
+  // Skew-symmetric part (R - R^T) gives 2 * sin(angle) * axis.
+  const Scalar sx = m.coeff(2, 1) - m.coeff(1, 2);
+  const Scalar sy = m.coeff(0, 2) - m.coeff(2, 0);
+  const Scalar sz = m.coeff(1, 0) - m.coeff(0, 1);
+  const Scalar s = sqrt(sx * sx + sy * sy + sz * sz);  // = 2*sin(angle)
+
+  // trace = 1 + 2*cos(angle)
+  const Scalar c = m.trace() - Scalar(1);  // = 2*cos(angle)
+
+  // Use atan2 for the angle: accurate at all angles including near 0 and pi.
+  m_angle = atan2(s, c);
+
+  // Use the skew-symmetric part only when sin(angle) is large enough for
+  // accurate axis extraction. Near angle=0 or angle=pi, sin(angle) is small
+  // and the axis must be computed differently.
+  const Scalar sin_threshold = sqrt(NumTraits<Scalar>::epsilon());
+  if (s > sin_threshold) {
+    // General case: axis from skew-symmetric part.
+    const Scalar inv_s = Scalar(1) / s;
+    m_axis << sx * inv_s, sy * inv_s, sz * inv_s;
+  } else if (c > Scalar(0)) {
+    // Near identity (angle ≈ 0): axis is arbitrary, use (1,0,0).
+    m_axis << Scalar(1), Scalar(0), Scalar(0);
+  } else {
+    // Near angle = pi: extract axis from the symmetric part (R + I) / 2.
+    // The axis is the eigenvector corresponding to eigenvalue 1.
+    // Use the column of (R + I) with the largest diagonal entry for robustness.
+    const Scalar d0 = m.coeff(0, 0);
+    const Scalar d1 = m.coeff(1, 1);
+    const Scalar d2 = m.coeff(2, 2);
+    if (d0 >= d1 && d0 >= d2) {
+      // x is the largest component
+      const Scalar x = sqrt(numext::maxi(d0 - d1 - d2 + Scalar(1), Scalar(0)) * Scalar(0.5));
+      const Scalar inv_2x = Scalar(0.5) / (x + NumTraits<Scalar>::epsilon());
+      m_axis << x, (m.coeff(0, 1) + m.coeff(1, 0)) * inv_2x, (m.coeff(0, 2) + m.coeff(2, 0)) * inv_2x;
+    } else if (d1 >= d2) {
+      // y is the largest component
+      const Scalar y = sqrt(numext::maxi(d1 - d0 - d2 + Scalar(1), Scalar(0)) * Scalar(0.5));
+      const Scalar inv_2y = Scalar(0.5) / (y + NumTraits<Scalar>::epsilon());
+      m_axis << (m.coeff(0, 1) + m.coeff(1, 0)) * inv_2y, y, (m.coeff(1, 2) + m.coeff(2, 1)) * inv_2y;
+    } else {
+      // z is the largest component
+      const Scalar z = sqrt(numext::maxi(d2 - d0 - d1 + Scalar(1), Scalar(0)) * Scalar(0.5));
+      const Scalar inv_2z = Scalar(0.5) / (z + NumTraits<Scalar>::epsilon());
+      m_axis << (m.coeff(0, 2) + m.coeff(2, 0)) * inv_2z, (m.coeff(1, 2) + m.coeff(2, 1)) * inv_2z, z;
+    }
+    m_axis.normalize();
+  }
+
+  return *this;
 }
 
 /** Constructs and \returns an equivalent 3x3 rotation matrix.
diff --git a/Eigen/src/Geometry/EulerAngles.h b/Eigen/src/Geometry/EulerAngles.h
index ad6b821bea1..8e5052833a3 100644
--- a/Eigen/src/Geometry/EulerAngles.h
+++ b/Eigen/src/Geometry/EulerAngles.h
@@ -37,7 +37,7 @@ namespace Eigen {
  * For proper Euler angle configurations (a0 == a2), the returned angles are in the ranges [-pi:pi]x[0:pi]x[-pi:pi].
  *
  * The approach used is also described here:
- * https://d3cw3dd2w32x2b.cloudfront.net/wp-content/uploads/2012/07/euler-angles.pdf
+ * https://web.archive.org/web/20240715191429/https://d3cw3dd2w32x2b.cloudfront.net/wp-content/uploads/2012/07/euler-angles.pdf
  *
  * \sa class AngleAxis
  */
@@ -91,8 +91,26 @@ EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> Matr
     //
     //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
 
-    Scalar s1 = numext::sin(res[0]);
-    Scalar c1 = numext::cos(res[0]);
+    // Recover sin(res[0]) and cos(res[0]) from the atan2 arguments directly,
+    // avoiding a redundant sin+cos evaluation. s2 = hypot(coeff(j,i), coeff(k,i))
+    // is the norm of the atan2 arguments (with sign adjustment for !odd).
+    Scalar s1, c1;
+    if (s2 > NumTraits<Scalar>::epsilon()) {
+      Scalar inv_s2 = Scalar(1) / s2;
+      if (odd) {
+        // res[0] = atan2(coeff(j,i), coeff(k,i))
+        s1 = coeff(j, i) * inv_s2;
+        c1 = coeff(k, i) * inv_s2;
+      } else {
+        // res[0] = atan2(-coeff(j,i), -coeff(k,i))
+        s1 = -coeff(j, i) * inv_s2;
+        c1 = -coeff(k, i) * inv_s2;
+      }
+    } else {
+      // Gimbal lock (s2 ≈ 0): recover sin/cos from the computed angle.
+      s1 = numext::sin(res[0]);
+      c1 = numext::cos(res[0]);
+    }
     res[2] = numext::atan2(c1 * coeff(j, k) - s1 * coeff(k, k), c1 * coeff(j, j) - s1 * coeff(k, j));
   } else {
     // Tait-Bryan angles (all three axes are different; typically used for yaw-pitch-roll calculations).
@@ -102,6 +120,11 @@ EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> Matr
     //  c2s3    s2s1s3 + c1c3     s2c1s3 - s1c3
     // -s2      c2s1              c2c1
 
+    // Recover sin(res[0]) and cos(res[0]) from the atan2 arguments directly:
+    //   res[0] = atan2(coeff(j,k), coeff(k,k))
+    //   sin(res[0]) = coeff(j,k) / hypot(coeff(j,k), coeff(k,k))
+    //   cos(res[0]) = coeff(k,k) / hypot(coeff(j,k), coeff(k,k))
+    Scalar n1 = numext::hypot(coeff(j, k), coeff(k, k));
     res[0] = numext::atan2(coeff(j, k), coeff(k, k));
 
     Scalar c2 = numext::hypot(coeff(i, i), coeff(i, j));
@@ -109,8 +132,17 @@ EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> Matr
     // range [-pi/2, pi/2]
     res[1] = numext::atan2(-coeff(i, k), c2);
 
-    Scalar s1 = numext::sin(res[0]);
-    Scalar c1 = numext::cos(res[0]);
+    Scalar s1, c1;
+    if (n1 > NumTraits<Scalar>::epsilon()) {
+      Scalar inv_n1 = Scalar(1) / n1;
+      s1 = coeff(j, k) * inv_n1;
+      c1 = coeff(k, k) * inv_n1;
+    } else {
+      // Gimbal lock: coeff(j,k) and coeff(k,k) are both near zero.
+      // Fall back to sin/cos of the computed angle.
+      s1 = numext::sin(res[0]);
+      c1 = numext::cos(res[0]);
+    }
     res[2] = numext::atan2(s1 * coeff(k, i) - c1 * coeff(j, i), c1 * coeff(j, j) - s1 * coeff(k, j));
   }
   if (!odd) {
@@ -133,8 +165,8 @@ EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> Matr
  * \sa class AngleAxis
  */
 template <typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>
-MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const {
+EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> MatrixBase<Derived>::eulerAngles(
+    Index a0, Index a1, Index a2) const {
   /* Implemented from Graphics Gems IV */
   EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
 
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 795af0d8d6e..22361e9989a 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -80,14 +80,12 @@ class Homogeneous : public MatrixBase<Homogeneous<MatrixType, Direction_> >, int
 
   template <typename Rhs>
   EIGEN_DEVICE_FUNC inline const Product<Homogeneous, Rhs> operator*(const MatrixBase<Rhs>& rhs) const {
-    eigen_assert(int(Direction) == Horizontal);
     return Product<Homogeneous, Rhs>(*this, rhs.derived());
   }
 
   template <typename Lhs>
   friend EIGEN_DEVICE_FUNC inline const Product<Lhs, Homogeneous> operator*(const MatrixBase<Lhs>& lhs,
                                                                             const Homogeneous& rhs) {
-    eigen_assert(int(Direction) == Vertical);
     return Product<Lhs, Homogeneous>(lhs.derived(), rhs);
   }
 
@@ -249,7 +247,7 @@ struct homogeneous_left_product_impl<Homogeneous<MatrixType, Vertical>, Lhs>
 
   template <typename Dest>
   EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
-    // FIXME investigate how to allow lazy evaluation of this product when possible
+    // FIXME: investigate how to allow lazy evaluation of this product when possible.
     dst = Block < const LhsMatrixTypeNested, LhsMatrixTypeNested::RowsAtCompileTime,
     LhsMatrixTypeNested::ColsAtCompileTime == Dynamic
         ? Dynamic
@@ -280,7 +278,7 @@ struct homogeneous_right_product_impl<Homogeneous<MatrixType, Horizontal>, Rhs>
 
   template <typename Dest>
   EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
-    // FIXME investigate how to allow lazy evaluation of this product when possible
+    // FIXME: investigate how to allow lazy evaluation of this product when possible.
     dst = m_lhs * Block < const RhsNested,
     RhsNested::RowsAtCompileTime == Dynamic ? Dynamic : RhsNested::RowsAtCompileTime - 1,
     RhsNested::ColsAtCompileTime > (m_rhs, 0, 0, m_rhs.rows() - 1, m_rhs.cols());
@@ -394,8 +392,6 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg, Vertical>, DenseShape, Homo
   }
 };
 
-// TODO: the following specialization is to address a regression from 3.2 to 3.3
-// In the future, this path should be optimized.
 template <typename Lhs, typename RhsArg, int ProductTag>
 struct generic_product_impl<Lhs, Homogeneous<RhsArg, Vertical>, TriangularShape, HomogeneousShape, ProductTag> {
   template <typename Dest>
diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h
index 0fa0319a03d..1bdb811ead1 100644
--- a/Eigen/src/Geometry/Hyperplane.h
+++ b/Eigen/src/Geometry/Hyperplane.h
@@ -111,7 +111,7 @@ class Hyperplane {
    * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
    * so an arbitrary choice is made.
    */
-  // FIXME to be consistent with the rest this could be implemented as a static Through function ??
+  // FIXME: for consistency, consider implementing as a static Through function.
   EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized) {
     normal() = parametrized.direction().unitOrthogonal();
     offset() = -parametrized.origin().dot(normal());
diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h
index fc708ee2237..758c82e0d8c 100644
--- a/Eigen/src/Geometry/OrthoMethods.h
+++ b/Eigen/src/Geometry/OrthoMethods.h
@@ -200,7 +200,7 @@ struct unitOrthogonal_selector<Derived, 3> {
   EIGEN_DEVICE_FUNC static inline VectorType run(const Derived& src) {
     VectorType perp;
     /* Let us compute the crossed product of *this with a vector
-     * that is not too close to being colinear to *this.
+     * that is not too close to being collinear to *this.
      */
 
     /* unless the x and y coords are both close to zero, we can
@@ -213,7 +213,7 @@ struct unitOrthogonal_selector<Derived, 3> {
       perp.coeffRef(2) = 0;
     }
     /* if both x and y are close to zero, then the vector is close
-     * to the z-axis, so it's far from colinear to the x-axis for instance.
+     * to the z-axis, so it's far from collinear to the x-axis for instance.
      * So we take the crossed product with (1,0,0) and normalize it.
      */
     else {
@@ -242,7 +242,7 @@ struct unitOrthogonal_selector<Derived, 2> {
  * \returns a unit vector which is orthogonal to \c *this
  *
  * The size of \c *this must be at least 2. If the size is exactly 2,
- * then the returned vector is a counter clock wise rotation of \c *this, i.e., (-y,x).normalized().
+ * then the returned vector is a counter-clockwise rotation of \c *this, i.e., (-y,x).normalized().
  *
  * \sa cross()
  */
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index f2d2d051fe6..b409833f986 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -115,13 +115,6 @@ class QuaternionBase : public RotationBase<Derived, 3> {
   template <class OtherDerived>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
 
-  // disabled this copy operator as it is giving very strange compilation errors when compiling
-  // test_stdvector with GCC 4.4.2. This looks like a GCC bug though, so feel free to re-enable it if it's
-  // useful; however notice that we already have the templated operator= above and e.g. in MatrixBase
-  // we didn't have to add, in addition to templated operator=, such a non-templated copy operator.
-  //  Derived& operator=(const QuaternionBase& other)
-  //  { return operator=<Derived>(other); }
-
   EIGEN_DEVICE_FUNC Derived& operator=(const AngleAxisType& aa);
   template <class OtherDerived>
   EIGEN_DEVICE_FUNC Derived& operator=(const MatrixBase<OtherDerived>& m);
@@ -697,19 +690,10 @@ EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::setFromTwoVectors(con
   Scalar c = v1.dot(v0);
 
   // if dot == -1, vectors are nearly opposites
-  // => accurately compute the rotation axis by computing the
-  //    intersection of the two planes. This is done by solving:
-  //       x^T v0 = 0
-  //       x^T v1 = 0
-  //    under the constraint:
-  //       ||x|| = 1
-  //    which yields a singular value problem
+  // => any axis perpendicular to v0 will do for a ~180 degree rotation.
   if (c < Scalar(-1) + NumTraits<Scalar>::dummy_precision()) {
     c = numext::maxi(c, Scalar(-1));
-    Matrix<Scalar, 2, 3> m;
-    m << v0.transpose(), v1.transpose();
-    JacobiSVD<Matrix<Scalar, 2, 3>, ComputeFullV> svd(m);
-    Vector3 axis = svd.matrixV().col(2);
+    Vector3 axis = v0.unitOrthogonal();
 
     Scalar w2 = (Scalar(1) + c) * Scalar(0.5);
     this->w() = sqrt(w2);
@@ -797,7 +781,7 @@ EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromT
 template <class Derived>
 EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse()
     const {
-  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
+  // FIXME: consider renaming to multiplicativeInverse() and renaming conjugate() to inverse() or opposite().
   Scalar n2 = this->squaredNorm();
   if (n2 > Scalar(0))
     return Quaternion<Scalar>(conjugate().coeffs() / n2);
@@ -861,6 +845,7 @@ EIGEN_DEVICE_FUNC Quaternion<typename internal::traits<Derived>::Scalar> Quatern
   Scalar scale1;
 
   if (absD >= one) {
+    // Near-parallel quaternions: use lerp to avoid division by ~zero sinTheta.
     scale0 = Scalar(1) - t;
     scale1 = t;
   } else {
@@ -890,7 +875,7 @@ struct quaternionbase_assign_impl<Other, 3, 3> {
     // Ken Shoemake, 1987 SIGGRAPH course notes
     Scalar t = mat.trace();
     if (t > Scalar(0)) {
-      t = sqrt(t + Scalar(1.0));
+      t = sqrt(numext::maxi(t + Scalar(1.0), Scalar(0)));
       q.w() = Scalar(0.5) * t;
       t = Scalar(0.5) / t;
       q.x() = (mat.coeff(2, 1) - mat.coeff(1, 2)) * t;
@@ -903,7 +888,8 @@ struct quaternionbase_assign_impl<Other, 3, 3> {
       Index j = (i + 1) % 3;
       Index k = (j + 1) % 3;
 
-      t = sqrt(mat.coeff(i, i) - mat.coeff(j, j) - mat.coeff(k, k) + Scalar(1.0));
+      // Guard against slightly negative argument from non-orthogonal matrices.
+      t = sqrt(numext::maxi(mat.coeff(i, i) - mat.coeff(j, j) - mat.coeff(k, k) + Scalar(1.0), Scalar(0)));
       q.coeffs().coeffRef(i) = Scalar(0.5) * t;
       t = Scalar(0.5) / t;
       q.w() = (mat.coeff(k, j) - mat.coeff(j, k)) * t;
diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h
index 59180253a51..515bd270ed0 100644
--- a/Eigen/src/Geometry/Rotation2D.h
+++ b/Eigen/src/Geometry/Rotation2D.h
@@ -23,7 +23,7 @@ namespace Eigen {
  *
  * \tparam Scalar_ the scalar type, i.e., the type of the coefficients
  *
- * This class is equivalent to a single scalar representing a counter clock wise rotation
+ * This class is equivalent to a single scalar representing a counter-clockwise rotation
  * as a single angle in radian. It provides some additional features such as the automatic
  * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
  * interface to Quaternion in order to facilitate the writing of generic algorithms
@@ -57,7 +57,7 @@ class Rotation2D : public RotationBase<Rotation2D<Scalar_>, 2> {
   Scalar m_angle;
 
  public:
-  /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
+  /** Construct a 2D counter-clockwise rotation from the angle \a a in radian. */
   EIGEN_DEVICE_FUNC explicit inline Rotation2D(const Scalar& a) : m_angle(a) {}
 
   /** Default constructor without initialization. The represented rotation is undefined. */
diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h
index a0604cee16a..5bb7c45d997 100644
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -80,7 +80,7 @@ class UniformScaling {
   }
 
   /** Concatenates a uniform scaling and a linear transformation matrix */
-  // TODO returns an expression
+  // TODO: return an expression instead of a dense matrix.
   template <typename Derived>
   inline typename Eigen::internal::plain_matrix_type<Derived>::type operator*(const MatrixBase<Derived>& other) const {
     return other * m_factor;
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index a5d7b608376..987cc636537 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -198,7 +198,7 @@ class Transform {
     Options = Options_,
     Dim = Dim_,       ///< space dimension in which the transformation holds
     HDim = Dim_ + 1,  ///< size of a respective homogeneous vector
-    Rows = int(Mode) == (AffineCompact) ? Dim : HDim
+    Rows = int(Mode) == int(AffineCompact) ? Dim : HDim
   };
   /** the scalar type of the coefficients */
   typedef Scalar_ Scalar;
@@ -365,6 +365,15 @@ class Transform {
    * \sa MatrixBase::operator(Index,Index) */
   EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col) { return m_matrix(row, col); }
 
+#ifdef EIGEN_MULTIDIMENSIONAL_SUBSCRIPT
+  /** shortcut for m_matrix[row,col];
+   * \sa MatrixBase::operator(Index,Index) const */
+  EIGEN_DEVICE_FUNC inline Scalar operator[](Index row, Index col) const { return m_matrix[row, col]; }
+  /** shortcut for m_matrix[row,col];
+   * \sa MatrixBase::operator(Index,Index) */
+  EIGEN_DEVICE_FUNC inline Scalar& operator[](Index row, Index col) { return m_matrix[row, col]; }
+#endif
+
   /** \returns a read-only expression of the transformation matrix */
   EIGEN_DEVICE_FUNC inline const MatrixType& matrix() const { return m_matrix; }
   /** \returns a writable expression of the transformation matrix */
@@ -736,7 +745,7 @@ Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::op
 
 /** \returns a QMatrix from \c *this assuming the dimension is 2.
  *
- * \warning this conversion might loss data if \c *this is not affine
+ * \warning this conversion might lose data if \c *this is not affine
  *
  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
  */
diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h
index 8ed63449a3c..246986e162d 100644
--- a/Eigen/src/Geometry/Umeyama.h
+++ b/Eigen/src/Geometry/Umeyama.h
@@ -141,6 +141,13 @@ typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type um
     // Eq. (36)-(37)
     const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n;
 
+    if (src_var <= Scalar(0)) {
+      // Degenerate: source points have zero variance (all nearly identical).
+      // Scaling is undefined; return the best-fit pure translation.
+      Rt.col(m).head(m) = dst_mean - src_mean;
+      return Rt;
+    }
+
     // Eq. (42)
     const Scalar c = Scalar(1) / src_var * svd.singularValues().dot(S);
 
diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 8b92304912f..faa5594a07f 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -20,35 +20,6 @@ namespace Eigen {
 
 namespace internal {
 
-/** \internal */
-// template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
-// void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const
-// CoeffsType& hCoeffs)
-// {
-//   typedef typename VectorsType::Scalar Scalar;
-//   const Index nbVecs = vectors.cols();
-//   eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
-//
-//   for(Index i = 0; i < nbVecs; i++)
-//   {
-//     Index rs = vectors.rows() - i;
-//     // Warning, note that hCoeffs may alias with vectors.
-//     // It is then necessary to copy it before modifying vectors(i,i).
-//     typename CoeffsType::Scalar h = hCoeffs(i);
-//     // This hack permits to pass through nested Block<> and Transpose<> expressions.
-//     Scalar *Vii_ptr = const_cast<Scalar*>(vectors.data() + vectors.outerStride()*i + vectors.innerStride()*i);
-//     Scalar Vii = *Vii_ptr;
-//     *Vii_ptr = Scalar(1);
-//     triFactor.col(i).head(i).noalias() = -h * vectors.block(i, 0, rs, i).adjoint()
-//                                        * vectors.col(i).tail(rs);
-//     *Vii_ptr = Vii;
-//     // FIXME add .noalias() once the triangular product can work inplace
-//     triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
-//                              * triFactor.col(i).head(i);
-//     triFactor(i,i) = hCoeffs(i);
-//   }
-// }
-
 /** \internal */
 // This variant avoid modifications in vectors
 template <typename TriangularFactorType, typename VectorsType, typename CoeffsType>
@@ -65,14 +36,8 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
       triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() *
                                             vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
 
-      // FIXME use the following line with .noalias() once the triangular product can work inplace
-      // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template
-      // triangularView<Upper>();
-      for (Index j = nbVecs - 1; j > i; --j) {
-        typename TriangularFactorType::Scalar z = triFactor(i, j);
-        triFactor(i, j) = z * triFactor(j, j);
-        if (nbVecs - j - 1 > 0) triFactor.row(i).tail(nbVecs - j - 1) += z * triFactor.row(j).tail(nbVecs - j - 1);
-      }
+      triFactor.row(i).tail(rt) =
+          (triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt, rt).template triangularView<Upper>()).eval();
     }
     triFactor(i, i) = hCoeffs(i);
   }
@@ -100,14 +65,43 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec
          (VectorsType::MaxColsAtCompileTime == 1 && MatrixType::MaxColsAtCompileTime != 1) ? RowMajor : ColMajor,
          VectorsType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime>
       tmp = V.adjoint() * mat;
-  // FIXME add .noalias() once the triangular product can work inplace
   if (forward)
-    tmp = T.template triangularView<Upper>() * tmp;
+    tmp = (T.template triangularView<Upper>() * tmp).eval();
   else
-    tmp = T.template triangularView<Upper>().adjoint() * tmp;
+    tmp = (T.template triangularView<Upper>().adjoint() * tmp).eval();
   mat.noalias() -= V * tmp;
 }
 
+/** \internal
+ * if forward then perform   mat = mat * H0 * H1 * H2
+ * otherwise perform         mat = mat * H2 * H1 * H0
+ */
+template <typename MatrixType, typename VectorsType, typename CoeffsType>
+void apply_block_householder_on_the_right(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs,
+                                          bool forward) {
+  enum { TFactorSize = VectorsType::ColsAtCompileTime };
+  Index nbVecs = vectors.cols();
+  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, RowMajor> T(nbVecs, nbVecs);
+
+  if (forward)
+    make_block_householder_triangular_factor(T, vectors, hCoeffs);
+  else
+    make_block_householder_triangular_factor(T, vectors, hCoeffs.conjugate());
+  const TriangularView<const VectorsType, UnitLower> V(vectors);
+
+  // A -= (A * V) * T * V^*   (forward)
+  // A -= (A * V) * T^* * V^* (backward)
+  Matrix<typename MatrixType::Scalar, MatrixType::RowsAtCompileTime, VectorsType::ColsAtCompileTime,
+         (MatrixType::MaxRowsAtCompileTime == 1 && VectorsType::MaxColsAtCompileTime != 1) ? ColMajor : RowMajor,
+         MatrixType::MaxRowsAtCompileTime, VectorsType::MaxColsAtCompileTime>
+      tmp = mat * V;
+  if (forward)
+    tmp = (tmp * T.template triangularView<Upper>()).eval();
+  else
+    tmp = (tmp * T.template triangularView<Upper>().adjoint()).eval();
+  mat.noalias() -= tmp * V.adjoint();
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 96b1daf5fa9..f77e54f15d7 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -17,9 +17,17 @@
 namespace Eigen {
 
 namespace internal {
-template <int n>
+template <int N>
 struct decrement_size {
-  enum { ret = n == Dynamic ? n : n - 1 };
+  static constexpr int ret = N - 1;
+};
+template <>
+struct decrement_size<0> {
+  static constexpr int ret = 0;
+};
+template <>
+struct decrement_size<Dynamic> {
+  static constexpr int ret = Dynamic;
 };
 }  // namespace internal
 
@@ -65,12 +73,11 @@ template <typename EssentialPart>
 EIGEN_DEVICE_FUNC void MatrixBase<Derived>::makeHouseholder(EssentialPart& essential, Scalar& tau,
                                                             RealScalar& beta) const {
   using numext::conj;
-  using numext::sqrt;
 
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart)
-  VectorBlock<const Derived, EssentialPart::SizeAtCompileTime> tail(derived(), 1, size() - 1);
+  const VectorBlock<const Derived, EssentialPart::SizeAtCompileTime> tail(derived(), 1, size() - 1);
 
-  RealScalar tailSqNorm = size() == 1 ? RealScalar(0) : tail.squaredNorm();
+  RealScalar tailSqNorm = size() == 1 ? RealScalar(0) : tail.unwind().squaredNorm();
   Scalar c0 = coeff(0);
   const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
 
@@ -79,9 +86,9 @@ EIGEN_DEVICE_FUNC void MatrixBase<Derived>::makeHouseholder(EssentialPart& essen
     beta = numext::real(c0);
     essential.setZero();
   } else {
-    beta = sqrt(numext::abs2(c0) + tailSqNorm);
+    beta = numext::sqrt(numext::abs2(c0) + tailSqNorm);
     if (numext::real(c0) >= RealScalar(0)) beta = -beta;
-    essential = tail / (c0 - beta);
+    essential = tail.unwind() / (c0 - beta);
     tau = conj((beta - c0) / beta);
   }
 }
@@ -111,10 +118,10 @@ EIGEN_DEVICE_FUNC void MatrixBase<Derived>::applyHouseholderOnTheLeft(const Esse
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace, cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows() - 1,
                                                                                         cols());
-    tmp.noalias() = essential.adjoint() * bottom;
-    tmp += this->row(0);
-    this->row(0) -= tau * tmp;
-    bottom.noalias() -= tau * essential * tmp;
+    tmp.noalias() = essential.adjoint() * bottom.unwind();
+    tmp = tau * (tmp + this->row(0));
+    this->row(0) = this->row(0) - tmp;
+    bottom.unwind().noalias() -= essential * tmp;
   }
 }
 
@@ -143,10 +150,10 @@ EIGEN_DEVICE_FUNC void MatrixBase<Derived>::applyHouseholderOnTheRight(const Ess
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace, rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(),
                                                                                        cols() - 1);
-    tmp.noalias() = right * essential;
-    tmp += this->col(0);
-    this->col(0) -= tau * tmp;
-    right.noalias() -= tau * tmp * essential.adjoint();
+    tmp.noalias() = right.unwind() * essential;
+    tmp = tau * (tmp + this->col(0));
+    this->col(0) = this->col(0) - tmp;
+    right.unwind().noalias() -= tmp * essential.adjoint();
   }
 }
 
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index d49c96156d8..d7eb4771292 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -281,10 +281,7 @@ class HouseholderSequence : public EigenBase<HouseholderSequence<VectorsType, Co
       for (Index k = 0; k < cols() - vecs; ++k) dst.col(k).tail(rows() - k - 1).setZero();
     } else if (m_length > BlockSize) {
       dst.setIdentity(rows(), rows());
-      if (m_reverse)
-        applyThisOnTheLeft(dst, workspace, true);
-      else
-        applyThisOnTheLeft(dst, workspace, true);
+      applyThisOnTheLeft(dst, workspace, true);
     } else {
       dst.setIdentity(rows(), rows());
       for (Index k = vecs - 1; k >= 0; --k) {
@@ -309,14 +306,49 @@ class HouseholderSequence : public EigenBase<HouseholderSequence<VectorsType, Co
   /** \internal */
   template <typename Dest, typename Workspace>
   inline void applyThisOnTheRight(Dest& dst, Workspace& workspace) const {
-    workspace.resize(dst.rows());
-    for (Index k = 0; k < m_length; ++k) {
-      Index actual_k = m_reverse ? m_length - k - 1 : k;
-      dst.rightCols(rows() - m_shift - actual_k)
-          .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+    // Use the block path when the reflectors are long enough for GEMM to outperform GEMV.
+    // The threshold on rows() (the reflector length) is higher than for the left-side path because
+    // the right-side block application has more overhead from the tmp = mat * V product layout.
+    if (m_length >= BlockSize && rows() - m_shift >= 4 * BlockSize) {
+      applyBlockOnTheRight(dst);
+    } else {
+      workspace.resize(dst.rows());
+      for (Index k = 0; k < m_length; ++k) {
+        Index actual_k = m_reverse ? m_length - k - 1 : k;
+        dst.rightCols(rows() - m_shift - actual_k)
+            .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+      }
+    }
+  }
+
+ private:
+  /** \internal Block Householder application on the right, kept out-of-line
+   * to avoid template bloat pessimizing the scalar path above. */
+  template <typename Dest>
+  EIGEN_DONT_INLINE void applyBlockOnTheRight(Dest& dst) const {
+    // Make sure we have at least 2 useful blocks, otherwise it is pointless:
+    Index blockSize = m_length < Index(2 * BlockSize) ? (m_length + 1) / 2 : Index(BlockSize);
+    for (Index i = 0; i < m_length; i += blockSize) {
+      // Right-side application processes blocks in opposite order to left-side:
+      // forward (m_reverse=false): first block first; reversed: last block first.
+      Index end = m_reverse ? m_length - i : (std::min)(m_length, i + blockSize);
+      Index k = m_reverse ? (std::max)(Index(0), end - blockSize) : i;
+      Index bs = end - k;
+      Index start = k + m_shift;
+
+      typedef Block<internal::remove_all_t<VectorsType>, Dynamic, Dynamic> SubVectorsType;
+      SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side == OnTheRight ? k : start,
+                               Side == OnTheRight ? start : k, Side == OnTheRight ? bs : m_vectors.rows() - start,
+                               Side == OnTheRight ? m_vectors.cols() - start : bs);
+      std::conditional_t<Side == OnTheRight, Transpose<SubVectorsType>, SubVectorsType&> sub_vecs(sub_vecs1);
+
+      Index dstCols = rows() - m_shift - k;
+      auto sub_dst = dst.rightCols(dstCols);
+      internal::apply_block_householder_on_the_right(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
     }
   }
 
+ public:
   /** \internal */
   template <typename Dest>
   inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const {
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 904d853f901..d4a187ea3e3 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -86,14 +86,14 @@ class DiagonalPreconditioner {
   }
 
   template <typename Rhs>
-  inline const Solve<DiagonalPreconditioner, Rhs> solve(const MatrixBase<Rhs>& b) const {
+  inline Solve<DiagonalPreconditioner, Rhs> solve(const MatrixBase<Rhs>& b) const {
     eigen_assert(m_isInitialized && "DiagonalPreconditioner is not initialized.");
     eigen_assert(m_invdiag.size() == b.rows() &&
                  "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
     return Solve<DiagonalPreconditioner, Rhs>(*this, b.derived());
   }
 
-  ComputationInfo info() { return Success; }
+  ComputationInfo info() const { return Success; }
 
  protected:
   Vector m_invdiag;
@@ -166,7 +166,7 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<Scalar_>
     return factorize(mat);
   }
 
-  ComputationInfo info() { return Success; }
+  ComputationInfo info() const { return Success; }
 
  protected:
 };
@@ -205,7 +205,7 @@ class IdentityPreconditioner {
     return b;
   }
 
-  ComputationInfo info() { return Success; }
+  ComputationInfo info() const { return Success; }
 };
 
 }  // end namespace Eigen
diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 8fdeb849bbd..fff86bcab33 100644
--- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -34,7 +34,6 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, const Precondition
   typedef typename Dest::RealScalar RealScalar;
   typedef typename Dest::Scalar Scalar;
   typedef Matrix<Scalar, Dynamic, 1> VectorType;
-  RealScalar tol = tol_error;
   Index maxIters = iters;
 
   Index n = mat.cols();
@@ -48,6 +47,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, const Precondition
     x.setZero();
     return true;
   }
+
+  RealScalar tol = tol_error * rhs_norm;
+
   Scalar rho(1);
   Scalar alpha(0);
   Scalar w(1);
@@ -166,6 +168,7 @@ struct traits<BiCGSTAB<MatrixType_, Preconditioner_> > {
  */
 template <typename MatrixType_, typename Preconditioner_>
 class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<MatrixType_, Preconditioner_> > {
+ protected:
   typedef IterativeSolverBase<BiCGSTAB> Base;
   using Base::m_error;
   using Base::m_info;
diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index 5bb0efe8a55..9130a3cb775 100644
--- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -150,6 +150,7 @@ struct traits<ConjugateGradient<MatrixType_, UpLo_, Preconditioner_> > {
   */
 template <typename MatrixType_, int UpLo_, typename Preconditioner_>
 class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<MatrixType_, UpLo_, Preconditioner_> > {
+ protected:
   typedef IterativeSolverBase<ConjugateGradient> Base;
   using Base::m_error;
   using Base::m_info;
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index dd40058ab7e..a426381e749 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -264,7 +264,7 @@ void IncompleteCholesky<Scalar, UpLo_, OrderingType>::factorize(const MatrixType
     else
       m_scale(j) = 1;
 
-  // TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
+  // TODO: disable scaling when roughly uniform to speed up solve().
 
   // Scale and compute the shift for the matrix
   RealScalar mindiag = NumTraits<RealScalar>::highest();
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 11ce5e5aabb..61e8d3cda8a 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -142,7 +142,11 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<Scalar_, StorageInde
   /** \brief Reports whether previous computation was successful.
    *
    * \returns \c Success if computation was successful,
-   *          \c NumericalIssue if the matrix.appears to be negative.
+   *          \c NumericalIssue if a zero pivot was encountered during
+   *          factorization (the resulting preconditioner is unlikely to be
+   *          usable; the input matrix typically has zero diagonal entries
+   *          that cannot be moved by a static row permutation, e.g. it is
+   *          structurally singular).
    */
   ComputationInfo info() const {
     eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
@@ -157,7 +161,11 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<Scalar_, StorageInde
 
   /**
    * Compute an incomplete LU factorization with dual threshold on the matrix mat
-   * No pivoting is done in this version
+   * No partial pivoting is done in this version. A static row permutation
+   * (maximum bipartite matching) is computed in \c analyzePattern so that
+   * the permuted matrix has a structurally nonzero diagonal whenever one
+   * exists; without it, the lack of pivoting makes ILUT silently produce
+   * a useless preconditioner on matrices with zero diagonal entries.
    *
    **/
   template <typename MatrixType>
@@ -172,7 +180,7 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<Scalar_, StorageInde
 
   template <typename Rhs, typename Dest>
   void _solve_impl(const Rhs& b, Dest& x) const {
-    x = m_Pinv * b;
+    x = m_PinvPr * b;
     x = m_lu.template triangularView<UnitLower>().solve(x);
     x = m_lu.template triangularView<Upper>().solve(x);
     x = m_P * x;
@@ -184,6 +192,23 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<Scalar_, StorageInde
     inline bool operator()(const Index& row, const Index& col, const Scalar&) const { return row != col; }
   };
 
+  template <typename MatrixType>
+  Index computeRowMatching(const MatrixType& amat);
+
+  // Produce a column-major CSC sparsity pattern for `amat` (integers only —
+  // the scalar values are never read or copied). When `amat` is already a
+  // compressed column-major SparseMatrix, `outer`/`inner` point directly into
+  // its index storage; otherwise the pattern is materialized into the local
+  // `outer_buf`/`inner_buf` arrays and pointers are set into them.
+  static void patternColMajor(const SparseMatrix<Scalar, ColMajor, StorageIndex>& amat,
+                              Matrix<StorageIndex, Dynamic, 1>& outer_buf, Matrix<StorageIndex, Dynamic, 1>& inner_buf,
+                              const StorageIndex*& outer, const StorageIndex*& inner);
+
+  template <typename MatrixType>
+  static void patternColMajor(const MatrixType& amat, Matrix<StorageIndex, Dynamic, 1>& outer_buf,
+                              Matrix<StorageIndex, Dynamic, 1>& inner_buf, const StorageIndex*& outer,
+                              const StorageIndex*& inner);
+
  protected:
   FactorType m_lu;
   RealScalar m_droptol;
@@ -191,8 +216,10 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<Scalar_, StorageInde
   bool m_analysisIsOk;
   bool m_factorizationIsOk;
   ComputationInfo m_info;
-  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_P;     // Fill-reducing permutation
-  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pinv;  // Inverse permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_P;       // Fill-reducing permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pinv;    // Inverse permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pr;      // Static row permutation (matching-based)
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_PinvPr;  // Cached composition m_Pinv * m_Pr for solve
 };
 
 /**
@@ -235,21 +262,221 @@ const typename IncompleteLUT<Scalar, StorageIndex>::FactorType IncompleteLUT<Sca
   return m_lu.template triangularView<Upper>();
 }
 
+// Specialization: amat is already a column-major SparseMatrix.
+// Share its index storage directly when compressed; otherwise materialize the
+// indices (without copying any scalar values) into outer_buf/inner_buf.
+template <typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar, StorageIndex>::patternColMajor(const SparseMatrix<Scalar, ColMajor, StorageIndex>& amat,
+                                                          Matrix<StorageIndex, Dynamic, 1>& outer_buf,
+                                                          Matrix<StorageIndex, Dynamic, 1>& inner_buf,
+                                                          const StorageIndex*& outer, const StorageIndex*& inner) {
+  if (amat.isCompressed()) {
+    outer = amat.outerIndexPtr();
+    inner = amat.innerIndexPtr();
+    return;
+  }
+  const Index n = amat.cols();
+  const StorageIndex* a_outer = amat.outerIndexPtr();
+  const StorageIndex* a_inner_nz = amat.innerNonZeroPtr();
+  const StorageIndex* a_inner = amat.innerIndexPtr();
+  outer_buf.resize(n + 1);
+  outer_buf(0) = 0;
+  for (Index j = 0; j < n; ++j) outer_buf(j + 1) = outer_buf(j) + a_inner_nz[j];
+  inner_buf.resize(outer_buf(n));
+  for (Index j = 0; j < n; ++j) {
+    const StorageIndex* src = a_inner + a_outer[j];
+    std::copy(src, src + a_inner_nz[j], inner_buf.data() + outer_buf(j));
+  }
+  outer = outer_buf.data();
+  inner = inner_buf.data();
+}
+
+// Generic fallback: any other sparse input (row-major, expressions). Build a
+// column-major pattern via inner iterators; no scalar values are read.
+template <typename Scalar, typename StorageIndex>
+template <typename MatrixType_>
+void IncompleteLUT<Scalar, StorageIndex>::patternColMajor(const MatrixType_& amat,
+                                                          Matrix<StorageIndex, Dynamic, 1>& outer_buf,
+                                                          Matrix<StorageIndex, Dynamic, 1>& inner_buf,
+                                                          const StorageIndex*& outer, const StorageIndex*& inner) {
+  using internal::convert_index;
+  const Index n = amat.cols();
+  outer_buf.setZero(n + 1);
+  for (Index i = 0; i < amat.outerSize(); ++i)
+    for (typename MatrixType_::InnerIterator it(amat, i); it; ++it) ++outer_buf(it.col() + 1);
+  for (Index j = 0; j < n; ++j) outer_buf(j + 1) += outer_buf(j);
+  inner_buf.resize(outer_buf(n));
+  Matrix<StorageIndex, Dynamic, 1> head = outer_buf.head(n);
+  for (Index i = 0; i < amat.outerSize(); ++i)
+    for (typename MatrixType_::InnerIterator it(amat, i); it; ++it)
+      inner_buf(head(it.col())++) = convert_index<StorageIndex>(it.row());
+  outer = outer_buf.data();
+  inner = inner_buf.data();
+}
+
+// Compute a row permutation m_Pr such that (m_Pr * amat) has a structurally
+// nonzero diagonal wherever one exists. Returns the number of matched columns.
+// Uses a maximum bipartite cardinality matching on the sparsity pattern, with
+// a greedy initialization that prefers the natural diagonal so that matrices
+// already having a nonzero diagonal yield the identity permutation.
+template <typename Scalar, typename StorageIndex>
+template <typename MatrixType_>
+Index IncompleteLUT<Scalar, StorageIndex>::computeRowMatching(const MatrixType_& amat) {
+  using internal::convert_index;
+  const Index n = amat.rows();
+  // We only need the column-major sparsity pattern; never read scalar values.
+  // Share amat's index storage when it is already a compressed column-major
+  // SparseMatrix, otherwise build a value-free pattern into local arrays.
+  Matrix<StorageIndex, Dynamic, 1> outer_buf;
+  Matrix<StorageIndex, Dynamic, 1> inner_buf;
+  const StorageIndex* outer = nullptr;
+  const StorageIndex* inner = nullptr;
+  patternColMajor(amat, outer_buf, inner_buf, outer, inner);
+
+  const StorageIndex kUnmatched = StorageIndex(-1);
+  // match_row[j] = original row matched to column j; match_col[i] = column matched to row i.
+  std::vector<StorageIndex> match_row(n, kUnmatched);
+  std::vector<StorageIndex> match_col(n, kUnmatched);
+
+  // The matching uses the stored sparsity pattern only and is independent of
+  // numerical values. This preserves the analyzePattern/factorize contract:
+  // the same analysis is reusable for any matrix sharing this stored pattern.
+  // Phase 1: greedy diagonal preference.
+  for (Index j = 0; j < n; ++j) {
+    for (Index k = outer[j]; k < outer[j + 1]; ++k) {
+      if (Index(inner[k]) == j) {
+        match_row[j] = convert_index<StorageIndex>(j);
+        match_col[j] = convert_index<StorageIndex>(j);
+        break;
+      }
+    }
+  }
+  // Phase 2: greedy off-diagonal pickup of any free row.
+  for (Index j = 0; j < n; ++j) {
+    if (match_row[j] != kUnmatched) continue;
+    for (Index k = outer[j]; k < outer[j + 1]; ++k) {
+      Index i = inner[k];
+      if (match_col[i] == kUnmatched) {
+        match_row[j] = convert_index<StorageIndex>(i);
+        match_col[i] = convert_index<StorageIndex>(j);
+        break;
+      }
+    }
+  }
+  // Phase 3: augmenting paths for any column still unmatched.
+  std::vector<StorageIndex> visited(n, kUnmatched);
+  // Iterative DFS: the stack frames are (column, edge index, chosen row).
+  // stack_chosen_row[k] is the row that frame k will commit to if a path is found.
+  std::vector<Index> stack_col;
+  std::vector<Index> stack_pos;
+  std::vector<Index> stack_chosen_row;
+  stack_col.reserve(n);
+  stack_pos.reserve(n);
+  stack_chosen_row.reserve(n);
+
+  for (Index start = 0; start < n; ++start) {
+    if (match_row[start] != kUnmatched) continue;
+    StorageIndex epoch = convert_index<StorageIndex>(start);
+    stack_col.clear();
+    stack_pos.clear();
+    stack_chosen_row.clear();
+    stack_col.push_back(start);
+    stack_pos.push_back(outer[start]);
+    stack_chosen_row.push_back(-1);
+
+    while (!stack_col.empty()) {
+      Index j = stack_col.back();
+      Index pos = stack_pos.back();
+      Index col_end = outer[j + 1];
+      bool advanced = false;
+
+      while (pos < col_end) {
+        Index i = inner[pos];
+        ++pos;
+        if (visited[i] == epoch) continue;
+        visited[i] = epoch;
+
+        if (match_col[i] == kUnmatched) {
+          // Found an augmenting path: commit it.
+          stack_chosen_row.back() = i;
+          stack_pos.back() = pos;
+          for (size_t k = 0; k < stack_col.size(); ++k) {
+            Index col = stack_col[k];
+            Index row = stack_chosen_row[k];
+            match_row[col] = convert_index<StorageIndex>(row);
+            match_col[row] = convert_index<StorageIndex>(col);
+          }
+          stack_col.clear();
+          break;
+        } else {
+          // Descend into the column currently matched to row i.
+          stack_chosen_row.back() = i;
+          stack_pos.back() = pos;
+          Index next_col = match_col[i];
+          stack_col.push_back(next_col);
+          stack_pos.push_back(outer[next_col]);
+          stack_chosen_row.push_back(-1);
+          advanced = true;
+          break;
+        }
+      }
+
+      if (!advanced && !stack_col.empty()) {
+        stack_col.pop_back();
+        stack_pos.pop_back();
+        stack_chosen_row.pop_back();
+      }
+    }
+  }
+
+  // Build the row permutation. Matched columns get their matching row;
+  // any leftover columns are filled in identity-fashion with the leftover rows.
+  m_Pr.resize(n);
+  std::vector<bool> col_used(n, false), row_used(n, false);
+  Index matched = 0;
+  for (Index j = 0; j < n; ++j) {
+    if (match_row[j] != kUnmatched) {
+      m_Pr.indices()(match_row[j]) = convert_index<StorageIndex>(j);
+      col_used[j] = true;
+      row_used[match_row[j]] = true;
+      ++matched;
+    }
+  }
+  Index next_col = 0;
+  for (Index i = 0; i < n; ++i) {
+    if (row_used[i]) continue;
+    while (next_col < n && col_used[next_col]) ++next_col;
+    m_Pr.indices()(i) = convert_index<StorageIndex>(next_col);
+    ++next_col;
+  }
+  return matched;
+}
+
 template <typename Scalar, typename StorageIndex>
 template <typename MatrixType_>
 void IncompleteLUT<Scalar, StorageIndex>::analyzePattern(const MatrixType_& amat) {
-  // Compute the Fill-reducing permutation
+  eigen_assert((amat.rows() == amat.cols()) && "The factorization should be done on a square matrix");
+  // 1. Compute a static row permutation that makes the diagonal structurally
+  //    nonzero. This is a workaround for the lack of partial pivoting in ILUT.
+  //    For matrices that already have a nonzero diagonal, this returns the
+  //    identity permutation and is essentially free.
+  computeRowMatching(amat);
+
+  // 2. Compute the Fill-reducing permutation on the row-permuted matrix.
   // Since ILUT does not perform any numerical pivoting,
   // it is highly preferable to keep the diagonal through symmetric permutations.
   // To this end, let's symmetrize the pattern and perform AMD on it.
-  SparseMatrix<Scalar, ColMajor, StorageIndex> mat1 = amat;
-  SparseMatrix<Scalar, ColMajor, StorageIndex> mat2 = amat.transpose();
-  // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
-  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
+  SparseMatrix<Scalar, ColMajor, StorageIndex> mat1 = m_Pr * amat;
+  SparseMatrix<Scalar, ColMajor, StorageIndex> mat2 = mat1.transpose();
+  // FIXME: for a nearly symmetric pattern, mat2+mat1 is appropriate;
+  //        for a highly non-symmetric pattern, mat2*mat1 should be preferred.
   SparseMatrix<Scalar, ColMajor, StorageIndex> AtA = mat2 + mat1;
   AMDOrdering<StorageIndex> ordering;
   ordering(AtA, m_P);
   m_Pinv = m_P.inverse();  // cache the inverse permutation
+  // Cache the composition m_Pinv * m_Pr so _solve_impl applies a single
+  // permutation to the RHS instead of two.
+  m_PinvPr = m_Pinv * m_Pr;
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
   m_isInitialized = true;
@@ -271,10 +498,13 @@ void IncompleteLUT<Scalar, StorageIndex>::factorize(const MatrixType_& amat) {
   VectorI ju(n);  // column position of the values in u -- maximum size  is n
   VectorI jr(n);  // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
 
-  // Apply the fill-reducing permutation
+  // Apply the static row permutation (from analyzePattern), then the
+  // fill-reducing symmetric permutation.
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  SparseMatrix<Scalar, RowMajor, StorageIndex> row_permuted_mat = m_Pr * amat;
   SparseMatrix<Scalar, RowMajor, StorageIndex> mat;
-  mat = amat.twistedBy(m_Pinv);
+  mat = row_permuted_mat.twistedBy(m_Pinv);
+  Index zero_pivots = 0;
 
   // Initialization
   jr.fill(-1);
@@ -415,7 +645,10 @@ void IncompleteLUT<Scalar, StorageIndex>::factorize(const MatrixType_& amat) {
 
     // store the diagonal element
     // apply a shifting rule to avoid zero pivots (we are doing an incomplete factorization)
-    if (u(ii) == Scalar(0)) u(ii) = sqrt(m_droptol) * rownorm;
+    if (u(ii) == Scalar(0)) {
+      u(ii) = sqrt(m_droptol) * rownorm;
+      ++zero_pivots;
+    }
     m_lu.insertBackByOuterInnerUnordered(ii, ii) = u(ii);
 
     // sort the U-part of the row
@@ -441,7 +674,11 @@ void IncompleteLUT<Scalar, StorageIndex>::factorize(const MatrixType_& amat) {
   m_lu.makeCompressed();
 
   m_factorizationIsOk = true;
-  m_info = Success;
+  // If we had to shift any zero pivot, the factorization is not faithful to
+  // the input matrix and the resulting preconditioner may be useless.
+  // Report this to the caller via NumericalIssue rather than silently
+  // returning Success.
+  m_info = (zero_pivots == 0) ? Success : NumericalIssue;
 }
 
 }  // end namespace Eigen
diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
index 182f3190a3b..b5fa4aac299 100644
--- a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
@@ -145,6 +145,7 @@ struct traits<LeastSquaresConjugateGradient<MatrixType_, Preconditioner_> > {
 template <typename MatrixType_, typename Preconditioner_>
 class LeastSquaresConjugateGradient
     : public IterativeSolverBase<LeastSquaresConjugateGradient<MatrixType_, Preconditioner_> > {
+ protected:
   typedef IterativeSolverBase<LeastSquaresConjugateGradient> Base;
   using Base::m_error;
   using Base::m_info;
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 2686a5237af..f24a851f3fe 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const
 
 namespace internal {
 /** \jacobi_module
- * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y:
+ * Applies the clockwise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y:
  * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right )
  * \f$
  *
@@ -305,7 +305,7 @@ struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTi
     typedef typename packet_traits<OtherScalar>::type OtherPacket;
 
     constexpr int RequiredAlignment =
-        (std::max)(unpacket_traits<Packet>::alignment, unpacket_traits<OtherPacket>::alignment);
+        (std::max<int>)(unpacket_traits<Packet>::alignment, unpacket_traits<OtherPacket>::alignment);
     constexpr Index PacketSize = packet_traits<Scalar>::size;
 
     /*** dynamic-size vectorized paths ***/
@@ -335,8 +335,8 @@ struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTi
         for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
           Packet xi = pload<Packet>(px);
           Packet yi = pload<Packet>(py);
-          pstore(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
-          pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
+          pstore(px, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
+          pstore(py, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
           px += PacketSize;
           py += PacketSize;
         }
@@ -347,18 +347,18 @@ struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTi
           Packet xi1 = ploadu<Packet>(px + PacketSize);
           Packet yi = pload<Packet>(py);
           Packet yi1 = pload<Packet>(py + PacketSize);
-          pstoreu(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
-          pstoreu(px + PacketSize, padd(pm.pmul(pc, xi1), pcj.pmul(ps, yi1)));
-          pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
-          pstore(py + PacketSize, psub(pcj.pmul(pc, yi1), pm.pmul(ps, xi1)));
+          pstoreu(px, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
+          pstoreu(px + PacketSize, pm.pmadd(pc, xi1, pcj.pmul(ps, yi1)));
+          pstore(py, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
+          pstore(py + PacketSize, pcj.pmsub(pc, yi1, pm.pmul(ps, xi1)));
           px += Peeling * PacketSize;
           py += Peeling * PacketSize;
         }
         if (alignedEnd != peelingEnd) {
           Packet xi = ploadu<Packet>(x + peelingEnd);
           Packet yi = pload<Packet>(y + peelingEnd);
-          pstoreu(x + peelingEnd, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
-          pstore(y + peelingEnd, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
+          pstoreu(x + peelingEnd, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
+          pstore(y + peelingEnd, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
         }
       }
 
@@ -381,8 +381,8 @@ struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTi
       for (Index i = 0; i < size; i += PacketSize) {
         Packet xi = pload<Packet>(px);
         Packet yi = pload<Packet>(py);
-        pstore(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
-        pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
+        pstore(px, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
+        pstore(py, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
         px += PacketSize;
         py += PacketSize;
       }
@@ -420,6 +420,47 @@ EIGEN_DEVICE_FUNC void inline apply_rotation_in_the_plane(DenseBase<VectorX>& xp
       x, incrx, y, incry, size, c, s);
 }
 
+template <typename MatrixType, typename RealScalar, typename Index>
+EIGEN_DONT_INLINE void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
+                                           JacobiRotation<RealScalar>* j_left, JacobiRotation<RealScalar>* j_right) {
+  // Extract 2x2 submatrix into scalars (avoids Matrix construction on stack).
+  const RealScalar m00 = numext::real(matrix.coeff(p, p));
+  const RealScalar m01 = numext::real(matrix.coeff(p, q));
+  const RealScalar m10 = numext::real(matrix.coeff(q, p));
+  const RealScalar m11 = numext::real(matrix.coeff(q, q));
+
+  // Compute the symmetrizing rotation rot1 such that rot1 * [m] is symmetric.
+  const RealScalar t = m00 + m11;
+  const RealScalar d = m10 - m01;
+
+  RealScalar c1, s1;
+  if (numext::abs(d) < (std::numeric_limits<RealScalar>::min)()) {
+    c1 = RealScalar(1);
+    s1 = RealScalar(0);
+  } else {
+    // If d!=0, then t/d cannot overflow because the magnitudes of the
+    // entries forming d are not too small compared to the ones forming t.
+    RealScalar u = t / d;
+    s1 = RealScalar(1) / numext::sqrt(RealScalar(1) + numext::abs2(u));
+    c1 = u * s1;
+  }
+
+  // Apply rot1 to the 2x2 submatrix inline (avoids rotation dispatch overhead).
+  // Result is symmetric, so we only need 3 values: a00, a01 (== a10), a11.
+  const RealScalar a00 = c1 * m00 + s1 * m10;
+  const RealScalar a01 = c1 * m01 + s1 * m11;
+  const RealScalar a11 = -s1 * m01 + c1 * m11;
+
+  // Compute the diagonalizing rotation j_right from the symmetrized matrix.
+  j_right->makeJacobi(a00, a01, a11);
+
+  // Compose j_left = rot1 * j_right^T inline (avoids template machinery overhead).
+  const RealScalar jr_c = j_right->c();
+  const RealScalar jr_s = j_right->s();
+  j_left->c() = c1 * jr_c + s1 * jr_s;
+  j_left->s() = s1 * jr_c - c1 * jr_s;
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
index 12cf6c2b2b3..f07ad8d128c 100644
--- a/Eigen/src/KLUSupport/KLUSupport.h
+++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -112,32 +112,7 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
     eigen_assert(m_isInitialized && "Decomposition is not initialized.");
     return m_info;
   }
-#if 0  // not implemented yet
-    inline const LUMatrixType& matrixL() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_l;
-    }
-
-    inline const LUMatrixType& matrixU() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_u;
-    }
-
-    inline const IntColVectorType& permutationP() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_p;
-    }
-
-    inline const IntRowVectorType& permutationQ() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_q;
-    }
-#endif
-  /** Computes the sparse Cholesky decomposition of \a matrix
+  /** Computes the sparse LU factorization of \a matrix
    *  Note that the matrix should be column-major, and in compressed format for best performance.
    *  \sa SparseMatrix::makeCompressed().
    */
@@ -150,7 +125,7 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
     factorize_impl();
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -172,7 +147,7 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
    */
   inline const klu_common &kluCommon() const { return m_common; }
 
-  /** Provides access to the control settings array used by UmfPack.
+  /** Provides access to the control settings array used by KLU.
    *
    * If this array contains NaN's, the default values are used.
    *
@@ -182,7 +157,7 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the pattern analysis has been performed.
    *
    * \sa analyzePattern(), compute()
    */
@@ -200,12 +175,6 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
   template <typename BDerived, typename XDerived>
   bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
 
-#if 0  // not implemented yet
-    Scalar determinant() const;
-
-    void extractData() const;
-#endif
-
  protected:
   void init() {
     m_info = InvalidInput;
@@ -255,14 +224,6 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
     }
   }
 
-  // cached data to reduce reallocation, etc.
-#if 0  // not implemented yet
-    mutable LUMatrixType m_l;
-    mutable LUMatrixType m_u;
-    mutable IntColVectorType m_p;
-    mutable IntRowVectorType m_q;
-#endif
-
   KLUMatrixType m_dummy;
   KLUMatrixRef mp_matrix;
 
@@ -278,45 +239,6 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
   KLU(const KLU &) {}
 };
 
-#if 0  // not implemented yet
-template<typename MatrixType>
-void KLU<MatrixType>::extractData() const
-{
-  if (m_extractedDataAreDirty)
-  {
-     eigen_assert(false && "KLU: extractData Not Yet Implemented");
-
-    // get size of the data
-    int lnz, unz, rows, cols, nz_udiag;
-    umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
-
-    // allocate data
-    m_l.resize(rows,(std::min)(rows,cols));
-    m_l.resizeNonZeros(lnz);
-
-    m_u.resize((std::min)(rows,cols),cols);
-    m_u.resizeNonZeros(unz);
-
-    m_p.resize(rows);
-    m_q.resize(cols);
-
-    // extract
-    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
-                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
-                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
-
-    m_extractedDataAreDirty = false;
-  }
-}
-
-template<typename MatrixType>
-typename KLU<MatrixType>::Scalar KLU<MatrixType>::determinant() const
-{
-  eigen_assert(false && "KLU: extractData Not Yet Implemented");
-  return Scalar();
-}
-#endif
-
 template <typename MatrixType>
 template <typename BDerived, typename XDerived>
 bool KLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const {
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h
index ae4fee38b48..0427c01f44f 100644
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h
@@ -25,10 +25,7 @@ EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper(c
 
 template <typename Derived, int DeterminantType = Derived::RowsAtCompileTime>
 struct determinant_impl {
-  static inline typename traits<Derived>::Scalar run(const Derived& m) {
-    if (Derived::ColsAtCompileTime == Dynamic && m.rows() == 0) return typename traits<Derived>::Scalar(1);
-    return m.partialPivLu().determinant();
-  }
+  static inline typename traits<Derived>::Scalar run(const Derived& m) { return internal::partial_lu_determinant(m); }
 };
 
 template <typename Derived>
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 786cd76da04..098b16fe2c0 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -60,11 +60,23 @@ struct traits<FullPivLU<MatrixType_, PermutationIndex_> > : traits<MatrixType_>
  * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
  */
 template <typename MatrixType_, typename PermutationIndex_>
-class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> > {
+class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >,
+                  public RankRevealingBase<FullPivLU<MatrixType_, PermutationIndex_> > {
  public:
   typedef MatrixType_ MatrixType;
   typedef SolverBase<FullPivLU> Base;
+  typedef RankRevealingBase<FullPivLU> RankRevealingBase_;
   friend class SolverBase<FullPivLU>;
+  friend class RankRevealingBase<FullPivLU>;
+  using RankRevealingBase_::dimensionOfKernel;
+  using RankRevealingBase_::isInjective;
+  using RankRevealingBase_::isInvertible;
+  using RankRevealingBase_::isSurjective;
+  using RankRevealingBase_::maxPivot;
+  using RankRevealingBase_::nonzeroPivots;
+  using RankRevealingBase_::rank;
+  using RankRevealingBase_::setThreshold;
+  using RankRevealingBase_::threshold;
 
   EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
   enum {
@@ -148,23 +160,6 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
     return m_lu;
   }
 
-  /** \returns the number of nonzero pivots in the LU decomposition.
-   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
-   * So that notion isn't really intrinsically interesting, but it is
-   * still useful when implementing algorithms.
-   *
-   * \sa rank()
-   */
-  inline Index nonzeroPivots() const {
-    eigen_assert(m_isInitialized && "LU is not initialized.");
-    return m_nonzero_pivots;
-  }
-
-  /** \returns the absolute value of the biggest pivot, i.e. the biggest
-   *          diagonal coefficient of U.
-   */
-  RealScalar maxPivot() const { return m_maxpivot; }
-
   /** \returns the permutation matrix P
    *
    * \sa permutationQ()
@@ -247,7 +242,7 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
    * \sa TriangularView::solve(), kernel(), inverse()
    */
   template <typename Rhs>
-  inline const Solve<FullPivLU, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<FullPivLU, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
@@ -278,113 +273,10 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
    */
   typename internal::traits<MatrixType>::Scalar determinant() const;
 
-  /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
-   * who need to determine when pivots are to be considered nonzero. This is not used for the
-   * LU decomposition itself.
-   *
-   * When it needs to get the threshold value, Eigen calls threshold(). By default, this
-   * uses a formula to automatically determine a reasonable threshold.
-   * Once you have called the present method setThreshold(const RealScalar&),
-   * your value is used instead.
-   *
-   * \param threshold The new value to use as the threshold.
-   *
-   * A pivot will be considered nonzero if its absolute value is strictly greater than
-   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
-   * where maxpivot is the biggest pivot.
-   *
-   * If you want to come back to the default behavior, call setThreshold(Default_t)
-   */
-  FullPivLU& setThreshold(const RealScalar& threshold) {
-    m_usePrescribedThreshold = true;
-    m_prescribedThreshold = threshold;
-    return *this;
-  }
-
-  /** Allows to come back to the default behavior, letting Eigen use its default formula for
-   * determining the threshold.
-   *
-   * You should pass the special object Eigen::Default as parameter here.
-   * \code lu.setThreshold(Eigen::Default); \endcode
-   *
-   * See the documentation of setThreshold(const RealScalar&).
-   */
-  FullPivLU& setThreshold(Default_t) {
-    m_usePrescribedThreshold = false;
-    return *this;
-  }
-
-  /** Returns the threshold that will be used by certain methods such as rank().
-   *
-   * See the documentation of setThreshold(const RealScalar&).
-   */
-  RealScalar threshold() const {
-    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-    return m_usePrescribedThreshold ? m_prescribedThreshold
-                                    // this formula comes from experimenting (see "LU precision tuning" thread on the
-                                    // list) and turns out to be identical to Higham's formula used already in LDLt.
-                                    : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize());
-  }
-
-  /** \returns the rank of the matrix of which *this is the LU decomposition.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline Index rank() const {
+  /** \returns the absolute value of the i-th pivot coefficient (for RankRevealingBase). */
+  RealScalar pivotCoeff(Index i) const {
     using std::abs;
-    eigen_assert(m_isInitialized && "LU is not initialized.");
-    RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
-    Index result = 0;
-    for (Index i = 0; i < m_nonzero_pivots; ++i) result += (abs(m_lu.coeff(i, i)) > premultiplied_threshold);
-    return result;
-  }
-
-  /** \returns the dimension of the kernel of the matrix of which *this is the LU decomposition.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline Index dimensionOfKernel() const {
-    eigen_assert(m_isInitialized && "LU is not initialized.");
-    return cols() - rank();
-  }
-
-  /** \returns true if the matrix of which *this is the LU decomposition represents an injective
-   *          linear map, i.e. has trivial kernel; false otherwise.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isInjective() const {
-    eigen_assert(m_isInitialized && "LU is not initialized.");
-    return rank() == cols();
-  }
-
-  /** \returns true if the matrix of which *this is the LU decomposition represents a surjective
-   *          linear map; false otherwise.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isSurjective() const {
-    eigen_assert(m_isInitialized && "LU is not initialized.");
-    return rank() == rows();
-  }
-
-  /** \returns true if the matrix of which *this is the LU decomposition is invertible.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isInvertible() const {
-    eigen_assert(m_isInitialized && "LU is not initialized.");
-    return isInjective() && (m_lu.rows() == m_lu.cols());
+    return abs(m_lu.coeff(i, i));
   }
 
   /** \returns the inverse of the matrix of which *this is the LU decomposition.
@@ -394,7 +286,7 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
    *
    * \sa MatrixBase::inverse()
    */
-  inline const Inverse<FullPivLU> inverse() const {
+  inline Inverse<FullPivLU> inverse() const {
     eigen_assert(m_isInitialized && "LU is not initialized.");
     eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the inverse of a non-square matrix!");
     return Inverse<FullPivLU>(*this);
@@ -423,15 +315,13 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
   PermutationQType m_q;
   IntColVectorType m_rowsTranspositions;
   IntRowVectorType m_colsTranspositions;
-  Index m_nonzero_pivots;
   RealScalar m_l1_norm;
-  RealScalar m_maxpivot, m_prescribedThreshold;
   signed char m_det_pq;
-  bool m_isInitialized, m_usePrescribedThreshold;
+  bool m_isInitialized;
 };
 
 template <typename MatrixType, typename PermutationIndex>
-FullPivLU<MatrixType, PermutationIndex>::FullPivLU() : m_isInitialized(false), m_usePrescribedThreshold(false) {}
+FullPivLU<MatrixType, PermutationIndex>::FullPivLU() : m_isInitialized(false) {}
 
 template <typename MatrixType, typename PermutationIndex>
 FullPivLU<MatrixType, PermutationIndex>::FullPivLU(Index rows, Index cols)
@@ -440,8 +330,7 @@ FullPivLU<MatrixType, PermutationIndex>::FullPivLU(Index rows, Index cols)
       m_q(cols),
       m_rowsTranspositions(rows),
       m_colsTranspositions(cols),
-      m_isInitialized(false),
-      m_usePrescribedThreshold(false) {}
+      m_isInitialized(false) {}
 
 template <typename MatrixType, typename PermutationIndex>
 template <typename InputType>
@@ -451,8 +340,7 @@ FullPivLU<MatrixType, PermutationIndex>::FullPivLU(const EigenBase<InputType>& m
       m_q(matrix.cols()),
       m_rowsTranspositions(matrix.rows()),
       m_colsTranspositions(matrix.cols()),
-      m_isInitialized(false),
-      m_usePrescribedThreshold(false) {
+      m_isInitialized(false) {
   compute(matrix.derived());
 }
 
@@ -464,8 +352,7 @@ FullPivLU<MatrixType, PermutationIndex>::FullPivLU(EigenBase<InputType>& matrix)
       m_q(matrix.cols()),
       m_rowsTranspositions(matrix.rows()),
       m_colsTranspositions(matrix.cols()),
-      m_isInitialized(false),
-      m_usePrescribedThreshold(false) {
+      m_isInitialized(false) {
   computeInPlace();
 }
 
@@ -486,8 +373,8 @@ void FullPivLU<MatrixType, PermutationIndex>::computeInPlace() {
   m_colsTranspositions.resize(m_lu.cols());
   Index number_of_transpositions = 0;  // number of NONTRIVIAL transpositions, i.e. m_rowsTranspositions[i]!=i
 
-  m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
-  m_maxpivot = RealScalar(0);
+  this->m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
+  this->m_maxpivot = RealScalar(0);
 
   for (Index k = 0; k < size; ++k) {
     // First, we need to find the pivot.
@@ -506,7 +393,7 @@ void FullPivLU<MatrixType, PermutationIndex>::computeInPlace() {
     if (numext::is_exactly_zero(biggest_in_corner)) {
       // before exiting, make sure to initialize the still uninitialized transpositions
       // in a sane state without destroying what we already have.
-      m_nonzero_pivots = k;
+      this->m_nonzero_pivots = k;
       for (Index i = k; i < size; ++i) {
         m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
         m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
@@ -516,7 +403,7 @@ void FullPivLU<MatrixType, PermutationIndex>::computeInPlace() {
 
     RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(
         m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
-    if (abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
+    if (abs_pivot > this->m_maxpivot) this->m_maxpivot = abs_pivot;
 
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
@@ -571,7 +458,7 @@ MatrixType FullPivLU<MatrixType, PermutationIndex>::reconstructedMatrix() const
   const Index smalldim = (std::min)(m_lu.rows(), m_lu.cols());
   // LU
   MatrixType res(m_lu.rows(), m_lu.cols());
-  // FIXME the .toDenseMatrix() should not be needed...
+  // FIXME: the .toDenseMatrix() calls should not be needed.
   res = m_lu.leftCols(smalldim).template triangularView<UnitLower>().toDenseMatrix() *
         m_lu.topRows(smalldim).template triangularView<Upper>().toDenseMatrix();
 
@@ -632,10 +519,10 @@ struct kernel_retval<FullPivLU<MatrixType_, PermutationIndex_> >
       if (abs(dec().matrixLU().coeff(i, i)) > premultiplied_threshold) pivots.coeffRef(p++) = i;
     eigen_internal_assert(p == rank());
 
-    // we construct a temporaty trapezoid matrix m, by taking the U matrix and
-    // permuting the rows and cols to bring the nonnegligible pivots to the top of
-    // the main diagonal. We need that to be able to apply our triangular solvers.
-    // FIXME when we get triangularView-for-rectangular-matrices, this can be simplified
+    // Construct a temporary trapezoid matrix m by taking the U matrix and permuting
+    // the rows and cols to bring the nonnegligible pivots to the top of the main diagonal.
+    // This is needed to apply our triangular solvers.
+    // FIXME: simplify once triangularView supports rectangular matrices.
     Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Options, MaxSmallDimAtCompileTime,
            MatrixType::MaxColsAtCompileTime>
         m(dec().matrixLU().block(0, 0, rank(), cols));
@@ -816,8 +703,7 @@ struct Assignment<
  */
 template <typename Derived>
 template <typename PermutationIndex>
-inline const FullPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex> MatrixBase<Derived>::fullPivLu()
-    const {
+inline FullPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex> MatrixBase<Derived>::fullPivLu() const {
   return FullPivLU<PlainObject, PermutationIndex>(eval());
 }
 
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index fe8859e9ac3..58ac305bc4e 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -276,7 +276,7 @@ struct Assignment<DstXprType, Inverse<XprType>,
  * \sa computeInverseAndDetWithCheck()
  */
 template <typename Derived>
-EIGEN_DEVICE_FUNC inline const Inverse<Derived> MatrixBase<Derived>::inverse() const {
+EIGEN_DEVICE_FUNC inline Inverse<Derived> MatrixBase<Derived>::inverse() const {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
   eigen_assert(rows() == cols());
   return Inverse<Derived>(derived());
@@ -308,8 +308,9 @@ inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(ResultType& inver
                                                                typename ResultType::Scalar& determinant,
                                                                bool& invertible,
                                                                const RealScalar& absDeterminantThreshold) const {
-  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, ResultType)
   eigen_assert(rows() == cols());
+  inverse.resize(rows(), cols());
   // for 2x2, it's worth giving a chance to avoid evaluating.
   // for larger sizes, evaluating has negligible cost and limits code size.
   typedef std::conditional_t<RowsAtCompileTime == 2,
@@ -343,8 +344,6 @@ template <typename ResultType>
 inline void MatrixBase<Derived>::computeInverseWithCheck(ResultType& inverse, bool& invertible,
                                                          const RealScalar& absDeterminantThreshold) const {
   Scalar determinant;
-  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
-  eigen_assert(rows() == cols());
   computeInverseAndDetWithCheck(inverse, determinant, invertible, absDeterminantThreshold);
 }
 
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 7ea14f5761d..5a2205144fc 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -28,9 +28,6 @@ struct traits<PartialPivLU<MatrixType_, PermutationIndex_> > : traits<MatrixType
 
 template <typename T, typename Derived>
 struct enable_if_ref;
-// {
-//   typedef Derived type;
-// };
 
 template <typename T, typename Derived>
 struct enable_if_ref<Ref<T>, Derived> {
@@ -181,7 +178,7 @@ class PartialPivLU : public SolverBase<PartialPivLU<MatrixType_, PermutationInde
    * \sa TriangularView::solve(), inverse(), computeInverse()
    */
   template <typename Rhs>
-  inline const Solve<PartialPivLU, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<PartialPivLU, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
@@ -199,7 +196,7 @@ class PartialPivLU : public SolverBase<PartialPivLU<MatrixType_, PermutationInde
    *
    * \sa MatrixBase::inverse(), LU::inverse()
    */
-  inline const Inverse<PartialPivLU> inverse() const {
+  inline Inverse<PartialPivLU> inverse() const {
     eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
     return Inverse<PartialPivLU>(*this);
   }
@@ -341,9 +338,10 @@ struct partial_lu_impl {
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows, cols);
-    // For small compile-time matrices it is worth processing the last row separately:
+    // For small compile-time matrices and square runtime matrices it is worth processing the last row separately:
     //  speedup: +100% for 2x2, +10% for others.
-    const Index endk = UnBlockedAtCompileTime ? size - 1 : size;
+    const bool process_last_row_separately = UnBlockedAtCompileTime || rows == cols;
+    const Index endk = process_last_row_separately ? size - 1 : size;
     nb_transpositions = 0;
     Index first_zero_pivot = -1;
     for (Index k = 0; k < endk; ++k) {
@@ -369,13 +367,14 @@ struct partial_lu_impl {
         first_zero_pivot = k;
       }
 
-      if (k < rows - 1)
+      // Skip the trailing update for rectangular panels with no remaining columns.
+      if (rrows > 0 && rcols > 0)
         lu.bottomRightCorner(fix<RRows>(rrows), fix<RCols>(rcols)).noalias() -=
             lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols));
     }
 
     // special handling of the last entry
-    if (UnBlockedAtCompileTime) {
+    if (process_last_row_separately) {
       Index k = endk;
       row_transpositions[k] = PivIndex(k);
       if (numext::is_exactly_zero(Scoring()(lu(k, k))) && first_zero_pivot == -1) first_zero_pivot = k;
@@ -430,7 +429,6 @@ struct partial_lu_impl {
       //                          A00 | A01 | A02
       // lu  = A_0 | A_1 | A_2 =  A10 | A11 | A12
       //                          A20 | A21 | A22
-      BlockType A_0 = lu.block(0, 0, rows, k);
       BlockType A_2 = lu.block(0, k + bs, rows, tsize);
       BlockType A11 = lu.block(k, k, bs, bs);
       BlockType A12 = lu.block(k, k + bs, bs, tsize);
@@ -446,9 +444,12 @@ struct partial_lu_impl {
 
       nb_transpositions += nb_transpositions_in_panel;
       // update permutations and apply them to A_0
-      for (Index i = k; i < k + bs; ++i) {
-        Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
-        A_0.row(i).swap(A_0.row(piv));
+      if (k > 0) {
+        BlockType A_0 = lu.block(0, 0, rows, k);
+        for (Index i = k; i < k + bs; ++i) {
+          Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
+          A_0.row(i).swap(A_0.row(piv));
+        }
       }
 
       if (trows) {
@@ -486,6 +487,29 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions,
                  nb_transpositions);
 }
 
+/** \internal returns the determinant computed from an in-place partial-pivoting
+ * LU decomposition of \a m without constructing a PartialPivLU object.
+ */
+template <typename Derived>
+typename traits<Derived>::Scalar partial_lu_determinant(const Derived& m) {
+  typedef typename traits<Derived>::Scalar Scalar;
+  if (m.rows() == 0) return Scalar(1);
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  typedef typename plain_matrix_type<Derived>::type PlainObject;
+  typedef Transpositions<PlainObject::RowsAtCompileTime, PlainObject::MaxRowsAtCompileTime, DefaultPermutationIndex>
+      TranspositionType;
+
+  eigen_assert(m.rows() < NumTraits<DefaultPermutationIndex>::highest());
+  PlainObject lu(m);
+
+  TranspositionType row_transpositions(lu.rows());
+  typename TranspositionType::StorageIndex nb_transpositions;
+  partial_lu_inplace(lu, row_transpositions, nb_transpositions);
+
+  return Scalar((nb_transpositions % 2) ? -1 : 1) * lu.diagonal().prod();
+}
+
 }  // end namespace internal
 
 template <typename MatrixType, typename PermutationIndex>
@@ -562,8 +586,8 @@ struct Assignment<
  */
 template <typename Derived>
 template <typename PermutationIndex>
-inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
-MatrixBase<Derived>::partialPivLu() const {
+inline PartialPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex> MatrixBase<Derived>::partialPivLu()
+    const {
   return PartialPivLU<PlainObject, PermutationIndex>(eval());
 }
 
@@ -577,7 +601,7 @@ MatrixBase<Derived>::partialPivLu() const {
  */
 template <typename Derived>
 template <typename PermutationIndex>
-inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex> MatrixBase<Derived>::lu() const {
+inline PartialPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex> MatrixBase<Derived>::lu() const {
   return PartialPivLU<PlainObject, PermutationIndex>(eval());
 }
 
diff --git a/Eigen/src/MetisSupport/MetisSupport.h b/Eigen/src/MetisSupport/MetisSupport.h
index 6c7bf946503..961c8408981 100644
--- a/Eigen/src/MetisSupport/MetisSupport.h
+++ b/Eigen/src/MetisSupport/MetisSupport.h
@@ -38,7 +38,7 @@ class MetisOrdering {
     IndexVector visited(m);
     visited.setConstant(-1);
     for (StorageIndex j = 0; j < m; j++) {
-      // Compute the union structure of of A(j,:) and At(j,:)
+      // Compute the union structure of A(j,:) and At(j,:)
       visited(j) = j;  // Do not include the diagonal element
       // Get the nonzeros in row/column j of A
       for (typename MatrixType::InnerIterator it(A, j); it; ++it) {
diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index f1ea2ee5bd0..02be2828e48 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -713,7 +713,7 @@ static void init_scoring(
   for (c = n_col - 1; c >= 0; c--) {
     deg = Col[c].length;
     if (deg == 0) {
-      /* this is a empty column, kill and order it last */
+      /* this is an empty column, kill and order it last */
       Col[c].shared2.order = --n_col2;
       Col[c].kill_principal();
     }
diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h
index 1a650077685..8ede4de1335 100644
--- a/Eigen/src/OrderingMethods/Ordering.h
+++ b/Eigen/src/OrderingMethods/Ordering.h
@@ -22,7 +22,7 @@ namespace internal {
  * \ingroup OrderingMethods_Module
  * \param[in] A the input non-symmetric matrix
  * \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A.
- * FIXME: The values should not be considered here
+ * FIXME: only the sparsity pattern should be used here; values should be ignored.
  */
 template <typename MatrixType>
 void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) {
@@ -53,7 +53,7 @@ class AMDOrdering {
    * This routine is much faster if the input matrix is column-major
    */
   template <typename MatrixType>
-  void operator()(const MatrixType& mat, PermutationType& perm) {
+  void operator()(const MatrixType& mat, PermutationType& perm) const {
     // Compute the symmetric pattern
     SparseMatrix<typename MatrixType::Scalar, ColMajor, StorageIndex> symm;
     internal::ordering_helper_at_plus_a(mat, symm);
@@ -65,7 +65,7 @@ class AMDOrdering {
 
   /** Compute the permutation with a selfadjoint matrix */
   template <typename SrcType, unsigned int SrcUpLo>
-  void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm) {
+  void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm) const {
     SparseMatrix<typename SrcType::Scalar, ColMajor, StorageIndex> C;
     C = mat;
 
@@ -90,7 +90,7 @@ class NaturalOrdering {
 
   /** Compute the permutation vector from a column-major sparse matrix */
   template <typename MatrixType>
-  void operator()(const MatrixType& /*mat*/, PermutationType& perm) {
+  void operator()(const MatrixType& /*mat*/, PermutationType& perm) const {
     perm.resize(0);
   }
 };
@@ -113,7 +113,7 @@ class COLAMDOrdering {
    * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
    */
   template <typename MatrixType>
-  void operator()(const MatrixType& mat, PermutationType& perm) {
+  void operator()(const MatrixType& mat, PermutationType& perm) const {
     eigen_assert(mat.isCompressed() &&
                  "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it "
                  "to COLAMDOrdering");
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 67c1167750d..90df3cdcdf1 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -147,7 +147,7 @@ class PardisoImpl : public SparseSolverBase<Derived> {
    * See the PARDISO manual to know how to use it. */
   ParameterType& pardisoParameterArray() { return m_iparm; }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -157,7 +157,8 @@ class PardisoImpl : public SparseSolverBase<Derived> {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
@@ -184,7 +185,7 @@ class PardisoImpl : public SparseSolverBase<Derived> {
     bool symmetric = std::abs(m_type) < 10;
     m_iparm[0] = 1;                   // No solver default
     m_iparm[1] = 2;                   // use Metis for the ordering
-    m_iparm[2] = 0;                   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
+    m_iparm[2] = 0;                   // Reserved. Set to zero. (Was number of processors / OMP_NUM_THREADS.)
     m_iparm[3] = 0;                   // No iterative-direct algorithm
     m_iparm[4] = 0;                   // No user fill-in reducing permutation
     m_iparm[5] = 0;                   // Write solution into x, b is left unchanged
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 092c29d6186..73865647342 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -51,11 +51,23 @@ struct traits<ColPivHouseholderQR<MatrixType_, PermutationIndex_>> : traits<Matr
  * \sa MatrixBase::colPivHouseholderQr()
  */
 template <typename MatrixType_, typename PermutationIndex_>
-class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, PermutationIndex_>> {
+class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, PermutationIndex_>>,
+                            public RankRevealingBase<ColPivHouseholderQR<MatrixType_, PermutationIndex_>> {
  public:
   typedef MatrixType_ MatrixType;
   typedef SolverBase<ColPivHouseholderQR> Base;
+  typedef RankRevealingBase<ColPivHouseholderQR> RankRevealingBase_;
   friend class SolverBase<ColPivHouseholderQR>;
+  friend class RankRevealingBase<ColPivHouseholderQR>;
+  using RankRevealingBase_::dimensionOfKernel;
+  using RankRevealingBase_::isInjective;
+  using RankRevealingBase_::isInvertible;
+  using RankRevealingBase_::isSurjective;
+  using RankRevealingBase_::maxPivot;
+  using RankRevealingBase_::nonzeroPivots;
+  using RankRevealingBase_::rank;
+  using RankRevealingBase_::setThreshold;
+  using RankRevealingBase_::threshold;
   typedef PermutationIndex_ PermutationIndex;
   EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
 
@@ -82,7 +94,6 @@ class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, P
     m_colNormsUpdated.resize(cols);
     m_colNormsDirect.resize(cols);
     m_isInitialized = false;
-    m_usePrescribedThreshold = false;
   }
 
  public:
@@ -100,8 +111,7 @@ class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, P
         m_temp(),
         m_colNormsUpdated(),
         m_colNormsDirect(),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false) {}
+        m_isInitialized(false) {}
 
   /** \brief Default Constructor with memory preallocation
    *
@@ -158,7 +168,7 @@ class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, P
    * Output: \verbinclude ColPivHouseholderQR_solve.out
    */
   template <typename Rhs>
-  inline const Solve<ColPivHouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<ColPivHouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   HouseholderSequenceType householderQ() const;
@@ -252,65 +262,10 @@ class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, P
    */
   typename MatrixType::Scalar signDeterminant() const;
 
-  /** \returns the rank of the matrix of which *this is the QR decomposition.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline Index rank() const {
+  /** \returns the absolute value of the i-th pivot coefficient (for RankRevealingBase). */
+  RealScalar pivotCoeff(Index i) const {
     using std::abs;
-    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-    RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
-    Index result = 0;
-    for (Index i = 0; i < m_nonzero_pivots; ++i) result += (abs(m_qr.coeff(i, i)) > premultiplied_threshold);
-    return result;
-  }
-
-  /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline Index dimensionOfKernel() const {
-    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-    return cols() - rank();
-  }
-
-  /** \returns true if the matrix of which *this is the QR decomposition represents an injective
-   *          linear map, i.e. has trivial kernel; false otherwise.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isInjective() const {
-    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-    return rank() == cols();
-  }
-
-  /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
-   *          linear map; false otherwise.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isSurjective() const {
-    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-    return rank() == rows();
-  }
-
-  /** \returns true if the matrix of which *this is the QR decomposition is invertible.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isInvertible() const {
-    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-    return isInjective() && isSurjective();
+    return abs(m_qr.coeff(i, i));
   }
 
   /** \returns the inverse of the matrix of which *this is the QR decomposition.
@@ -318,7 +273,7 @@ class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, P
    * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
    *       Use isInvertible() to first determine whether this matrix is invertible.
    */
-  inline const Inverse<ColPivHouseholderQR> inverse() const {
+  inline Inverse<ColPivHouseholderQR> inverse() const {
     eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
     return Inverse<ColPivHouseholderQR>(*this);
   }
@@ -332,71 +287,6 @@ class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, P
    */
   const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
 
-  /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
-   * who need to determine when pivots are to be considered nonzero. This is not used for the
-   * QR decomposition itself.
-   *
-   * When it needs to get the threshold value, Eigen calls threshold(). By default, this
-   * uses a formula to automatically determine a reasonable threshold.
-   * Once you have called the present method setThreshold(const RealScalar&),
-   * your value is used instead.
-   *
-   * \param threshold The new value to use as the threshold.
-   *
-   * A pivot will be considered nonzero if its absolute value is strictly greater than
-   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
-   * where maxpivot is the biggest pivot.
-   *
-   * If you want to come back to the default behavior, call setThreshold(Default_t)
-   */
-  ColPivHouseholderQR& setThreshold(const RealScalar& threshold) {
-    m_usePrescribedThreshold = true;
-    m_prescribedThreshold = threshold;
-    return *this;
-  }
-
-  /** Allows to come back to the default behavior, letting Eigen use its default formula for
-   * determining the threshold.
-   *
-   * You should pass the special object Eigen::Default as parameter here.
-   * \code qr.setThreshold(Eigen::Default); \endcode
-   *
-   * See the documentation of setThreshold(const RealScalar&).
-   */
-  ColPivHouseholderQR& setThreshold(Default_t) {
-    m_usePrescribedThreshold = false;
-    return *this;
-  }
-
-  /** Returns the threshold that will be used by certain methods such as rank().
-   *
-   * See the documentation of setThreshold(const RealScalar&).
-   */
-  RealScalar threshold() const {
-    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-    return m_usePrescribedThreshold ? m_prescribedThreshold
-                                    // this formula comes from experimenting (see "LU precision tuning" thread on the
-                                    // list) and turns out to be identical to Higham's formula used already in LDLt.
-                                    : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
-  }
-
-  /** \returns the number of nonzero pivots in the QR decomposition.
-   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
-   * So that notion isn't really intrinsically interesting, but it is
-   * still useful when implementing algorithms.
-   *
-   * \sa rank()
-   */
-  inline Index nonzeroPivots() const {
-    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-    return m_nonzero_pivots;
-  }
-
-  /** \returns the absolute value of the biggest pivot, i.e. the biggest
-   *          diagonal coefficient of R.
-   */
-  RealScalar maxPivot() const { return m_maxpivot; }
-
   /** \brief Reports whether the QR factorization was successful.
    *
    * \note This function always returns \c Success. It is provided for compatibility
@@ -430,9 +320,7 @@ class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, P
   RowVectorType m_temp;
   RealRowVectorType m_colNormsUpdated;
   RealRowVectorType m_colNormsDirect;
-  bool m_isInitialized, m_usePrescribedThreshold;
-  RealScalar m_prescribedThreshold, m_maxpivot;
-  Index m_nonzero_pivots;
+  bool m_isInitialized;
   Index m_det_p;
 };
 
@@ -514,8 +402,8 @@ void ColPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
       numext::abs2<RealScalar>(m_colNormsUpdated.maxCoeff() * NumTraits<RealScalar>::epsilon()) / RealScalar(rows);
   RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<RealScalar>::epsilon());
 
-  m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
-  m_maxpivot = RealScalar(0);
+  this->m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
+  this->m_maxpivot = RealScalar(0);
 
   for (Index k = 0; k < size; ++k) {
     // first, we look up in our table m_colNormsUpdated which column has the biggest norm
@@ -525,7 +413,8 @@ void ColPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
 
     // Track the number of meaningful pivots but do not stop the decomposition to make
     // sure that the initial matrix is properly reproduced. See bug 941.
-    if (m_nonzero_pivots == size && biggest_col_sq_norm < threshold_helper * RealScalar(rows - k)) m_nonzero_pivots = k;
+    if (this->m_nonzero_pivots == size && biggest_col_sq_norm < threshold_helper * RealScalar(rows - k))
+      this->m_nonzero_pivots = k;
 
     // apply the transposition to the columns
     m_colsTranspositions.coeffRef(k) = static_cast<PermutationIndex>(biggest_col_index);
@@ -544,7 +433,7 @@ void ColPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
     m_qr.coeffRef(k, k) = beta;
 
     // remember the maximum absolute value of diagonal coefficients
-    if (abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
+    if (abs(beta) > this->m_maxpivot) this->m_maxpivot = abs(beta);
 
     // apply the householder transformation
     m_qr.bottomRightCorner(rows - k, cols - k - 1)
@@ -664,7 +553,7 @@ ColPivHouseholderQR<MatrixType, PermutationIndex>::householderQ() const {
  */
 template <typename Derived>
 template <typename PermutationIndexType>
-const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndexType>
+ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndexType>
 MatrixBase<Derived>::colPivHouseholderQr() const {
   return ColPivHouseholderQR<PlainObject, PermutationIndexType>(eval());
 }
diff --git a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
index 37ac55fa1a7..881bc3c329a 100644
--- a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@@ -97,7 +97,9 @@ struct ColPivHouseholderQR_LAPACKE_impl {
 
     maxpivot = qr.diagonal().cwiseAbs().maxCoeff();
     hCoeffs.adjointInPlace();
-    RealScalar defaultThreshold = NumTraits<RealScalar>::epsilon() * RealScalar(qr.diagonalSize());
+    // Higham's backward error bound (Theorem 19.4): ||ΔA||₂ ≤ c·min(m,n)·u·||A||₂.
+    // The factor of 4 covers the constant c (typically 3–6 worst-case).
+    RealScalar defaultThreshold = NumTraits<RealScalar>::epsilon() * RealScalar(4 * qr.diagonalSize());
     RealScalar threshold = usePrescribedThreshold ? prescribedThreshold : defaultThreshold;
     RealScalar premultiplied_threshold = maxpivot * threshold;
     nonzero_pivots = (qr.diagonal().cwiseAbs().array() > premultiplied_threshold).count();
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 960ccb1e9ff..3a5be592949 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -140,7 +140,7 @@ class CompleteOrthogonalDecomposition
    *
    */
   template <typename Rhs>
-  inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<CompleteOrthogonalDecomposition, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   HouseholderSequenceType householderQ(void) const;
@@ -293,7 +293,7 @@ class CompleteOrthogonalDecomposition
    * \warning: Do not compute \c this->pseudoInverse()*rhs to solve a linear systems.
    * It is more efficient and numerically stable to call \c this->solve(rhs).
    */
-  inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const {
+  inline Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const {
     eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
     return Inverse<CompleteOrthogonalDecomposition>(*this);
   }
@@ -638,7 +638,7 @@ CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::householderQ() co
  */
 template <typename Derived>
 template <typename PermutationIndex>
-const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
+CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
 MatrixBase<Derived>::completeOrthogonalDecomposition() const {
   return CompleteOrthogonalDecomposition<PlainObject>(eval());
 }
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index d1734445952..547bad0870f 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -60,11 +60,23 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex
  * \sa MatrixBase::fullPivHouseholderQr()
  */
 template <typename MatrixType_, typename PermutationIndex_>
-class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_, PermutationIndex_> > {
+class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_, PermutationIndex_> >,
+                             public RankRevealingBase<FullPivHouseholderQR<MatrixType_, PermutationIndex_> > {
  public:
   typedef MatrixType_ MatrixType;
   typedef SolverBase<FullPivHouseholderQR> Base;
+  typedef RankRevealingBase<FullPivHouseholderQR> RankRevealingBase_;
   friend class SolverBase<FullPivHouseholderQR>;
+  friend class RankRevealingBase<FullPivHouseholderQR>;
+  using RankRevealingBase_::dimensionOfKernel;
+  using RankRevealingBase_::isInjective;
+  using RankRevealingBase_::isInvertible;
+  using RankRevealingBase_::isSurjective;
+  using RankRevealingBase_::maxPivot;
+  using RankRevealingBase_::nonzeroPivots;
+  using RankRevealingBase_::rank;
+  using RankRevealingBase_::setThreshold;
+  using RankRevealingBase_::threshold;
   typedef PermutationIndex_ PermutationIndex;
   EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
 
@@ -105,8 +117,7 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
         m_cols_transpositions(),
         m_cols_permutation(),
         m_temp(),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false) {}
+        m_isInitialized(false) {}
 
   /** \brief Default Constructor with memory preallocation
    *
@@ -121,8 +132,7 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
         m_cols_transpositions((std::min)(rows, cols)),
         m_cols_permutation(cols),
         m_temp(cols),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false) {}
+        m_isInitialized(false) {}
 
   /** \brief Constructs a QR factorization from a given matrix
    *
@@ -144,8 +154,7 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
         m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
         m_cols_permutation(matrix.cols()),
         m_temp(matrix.cols()),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false) {
+        m_isInitialized(false) {
     compute(matrix.derived());
   }
 
@@ -164,8 +173,7 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
         m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
         m_cols_permutation(matrix.cols()),
         m_temp(matrix.cols()),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false) {
+        m_isInitialized(false) {
     computeInPlace();
   }
 
@@ -186,7 +194,7 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
    * Output: \verbinclude FullPivHouseholderQR_solve.out
    */
   template <typename Rhs>
-  inline const Solve<FullPivHouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<FullPivHouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   /** \returns Expression object representing the matrix Q
@@ -273,65 +281,10 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
    */
   typename MatrixType::Scalar signDeterminant() const;
 
-  /** \returns the rank of the matrix of which *this is the QR decomposition.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline Index rank() const {
+  /** \returns the absolute value of the i-th pivot coefficient (for RankRevealingBase). */
+  RealScalar pivotCoeff(Index i) const {
     using std::abs;
-    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-    RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
-    Index result = 0;
-    for (Index i = 0; i < m_nonzero_pivots; ++i) result += (abs(m_qr.coeff(i, i)) > premultiplied_threshold);
-    return result;
-  }
-
-  /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline Index dimensionOfKernel() const {
-    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-    return cols() - rank();
-  }
-
-  /** \returns true if the matrix of which *this is the QR decomposition represents an injective
-   *          linear map, i.e. has trivial kernel; false otherwise.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isInjective() const {
-    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-    return rank() == cols();
-  }
-
-  /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
-   *          linear map; false otherwise.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isSurjective() const {
-    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-    return rank() == rows();
-  }
-
-  /** \returns true if the matrix of which *this is the QR decomposition is invertible.
-   *
-   * \note This method has to determine which pivots should be considered nonzero.
-   *       For that, it uses the threshold value that you can control by calling
-   *       setThreshold(const RealScalar&).
-   */
-  inline bool isInvertible() const {
-    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-    return isInjective() && isSurjective();
+    return abs(m_qr.coeff(i, i));
   }
 
   /** \returns the inverse of the matrix of which *this is the QR decomposition.
@@ -339,7 +292,7 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
    * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
    *       Use isInvertible() to first determine whether this matrix is invertible.
    */
-  inline const Inverse<FullPivHouseholderQR> inverse() const {
+  inline Inverse<FullPivHouseholderQR> inverse() const {
     eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
     return Inverse<FullPivHouseholderQR>(*this);
   }
@@ -353,71 +306,6 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
    */
   const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
 
-  /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
-   * who need to determine when pivots are to be considered nonzero. This is not used for the
-   * QR decomposition itself.
-   *
-   * When it needs to get the threshold value, Eigen calls threshold(). By default, this
-   * uses a formula to automatically determine a reasonable threshold.
-   * Once you have called the present method setThreshold(const RealScalar&),
-   * your value is used instead.
-   *
-   * \param threshold The new value to use as the threshold.
-   *
-   * A pivot will be considered nonzero if its absolute value is strictly greater than
-   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
-   * where maxpivot is the biggest pivot.
-   *
-   * If you want to come back to the default behavior, call setThreshold(Default_t)
-   */
-  FullPivHouseholderQR& setThreshold(const RealScalar& threshold) {
-    m_usePrescribedThreshold = true;
-    m_prescribedThreshold = threshold;
-    return *this;
-  }
-
-  /** Allows to come back to the default behavior, letting Eigen use its default formula for
-   * determining the threshold.
-   *
-   * You should pass the special object Eigen::Default as parameter here.
-   * \code qr.setThreshold(Eigen::Default); \endcode
-   *
-   * See the documentation of setThreshold(const RealScalar&).
-   */
-  FullPivHouseholderQR& setThreshold(Default_t) {
-    m_usePrescribedThreshold = false;
-    return *this;
-  }
-
-  /** Returns the threshold that will be used by certain methods such as rank().
-   *
-   * See the documentation of setThreshold(const RealScalar&).
-   */
-  RealScalar threshold() const {
-    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-    return m_usePrescribedThreshold ? m_prescribedThreshold
-                                    // this formula comes from experimenting (see "LU precision tuning" thread on the
-                                    // list) and turns out to be identical to Higham's formula used already in LDLt.
-                                    : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
-  }
-
-  /** \returns the number of nonzero pivots in the QR decomposition.
-   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
-   * So that notion isn't really intrinsically interesting, but it is
-   * still useful when implementing algorithms.
-   *
-   * \sa rank()
-   */
-  inline Index nonzeroPivots() const {
-    eigen_assert(m_isInitialized && "LU is not initialized.");
-    return m_nonzero_pivots;
-  }
-
-  /** \returns the absolute value of the biggest pivot, i.e. the biggest
-   *          diagonal coefficient of U.
-   */
-  RealScalar maxPivot() const { return m_maxpivot; }
-
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
   void _solve_impl(const RhsType& rhs, DstType& dst) const;
@@ -437,9 +325,7 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
   IntDiagSizeVectorType m_cols_transpositions;
   PermutationType m_cols_permutation;
   RowVectorType m_temp;
-  bool m_isInitialized, m_usePrescribedThreshold;
-  RealScalar m_prescribedThreshold, m_maxpivot;
-  Index m_nonzero_pivots;
+  bool m_isInitialized;
   RealScalar m_precision;
   Index m_det_p;
 };
@@ -512,8 +398,8 @@ void FullPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
 
   RealScalar biggest(0);
 
-  m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
-  m_maxpivot = RealScalar(0);
+  this->m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
+  this->m_maxpivot = RealScalar(0);
 
   for (Index k = 0; k < size; ++k) {
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
@@ -531,7 +417,7 @@ void FullPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
 
     // if the corner is negligible, then we have less than full rank, and we can finish early
     if (internal::isMuchSmallerThan(biggest_in_corner, biggest, m_precision)) {
-      m_nonzero_pivots = k;
+      this->m_nonzero_pivots = k;
       for (Index i = k; i < size; i++) {
         m_rows_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
         m_cols_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
@@ -556,7 +442,7 @@ void FullPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
     m_qr.coeffRef(k, k) = beta;
 
     // remember the maximum absolute value of diagonal coefficients
-    if (abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
+    if (abs(beta) > this->m_maxpivot) this->m_maxpivot = abs(beta);
 
     m_qr.bottomRightCorner(rows - k, cols - k - 1)
         .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows - k - 1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k + 1));
@@ -575,8 +461,7 @@ template <typename RhsType, typename DstType>
 void FullPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType& rhs, DstType& dst) const {
   const Index l_rank = rank();
 
-  // FIXME introduce nonzeroPivots() and use it here. and more generally,
-  // make the same improvements in this dec as in FullPivLU.
+  // FIXME: use nonzeroPivots() here and apply the same improvements as in FullPivLU.
   if (l_rank == 0) {
     dst.setZero();
     return;
@@ -703,11 +588,6 @@ struct FullPivHouseholderQRMatrixQReturnType
   typename IntDiagSizeVectorType::Nested m_rowsTranspositions;
 };
 
-// template<typename MatrixType>
-// struct evaluator<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
-//  : public evaluator<ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> > >
-// {};
-
 }  // end namespace internal
 
 template <typename MatrixType, typename PermutationIndex>
@@ -723,7 +603,7 @@ FullPivHouseholderQR<MatrixType, PermutationIndex>::matrixQ() const {
  */
 template <typename Derived>
 template <typename PermutationIndex>
-const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
+FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
 MatrixBase<Derived>::fullPivHouseholderQr() const {
   return FullPivHouseholderQR<PlainObject, PermutationIndex>(eval());
 }
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 497085dbff9..c65c28822a6 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -156,7 +156,7 @@ class HouseholderQR : public SolverBase<HouseholderQR<MatrixType_>> {
    * Output: \verbinclude HouseholderQR_solve.out
    */
   template <typename Rhs>
-  inline const Solve<HouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<HouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
@@ -384,15 +384,19 @@ void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename
   }
 }
 
-// TODO: add a corresponding public API for updating a QR factorization
 /** \internal
- * Basically a modified copy of @c Eigen::internal::householder_qr_inplace_unblocked that
- * performs a rank-1 update of the QR matrix in compact storage. This function assumes, that
- * the first @c k-1 columns of the matrix @c mat contain the QR decomposition of \f$A^N\f$ up to
- * column k-1. Then the QR decomposition of the k-th column (given by @c newColumn) is computed by
- * applying the k-1 Householder projectors on it and finally compute the projector \f$H_k\f$ of
- * it. On exit the matrix @c mat and the vector @c hCoeffs contain the QR decomposition of the
- * first k columns of \f$A^N\f$. The \a tempData argument must point to at least mat.cols() scalars.  */
+ * Column-insert / column-replace helper for a compact-storage Householder QR.
+ * Given a matrix @c mat and @c hCoeffs holding the QR factorization of the first @c k columns of
+ * some matrix A, this function replaces column @c k of that factorization with @c newColumn: it
+ * applies the existing k Householder reflectors (stored in columns 0..k-1 of @c mat and in
+ * @c hCoeffs.head(k)) to @c newColumn, then computes the k-th reflector in place. On exit
+ * @c mat.leftCols(k+1) and @c hCoeffs.head(k+1) hold the QR factorization of A.leftCols(k) with
+ * @c newColumn inserted at position @c k. @c tempData must point to at least @c mat.cols() scalars.
+ *
+ * Despite the historical "rank-1 update" label, this is not a full QR update in the
+ * Gill-Golub-Murray-Saunders sense: there is no public API for @c QR(A + u vT) or for column/row
+ * delete. See libeigen/eigen#3072 for a tracker of that feature gap. Currently only NNLS relies on
+ * this helper; the signature is tuned to its active-set bookkeeping. */
 template <typename MatrixQR, typename HCoeffs, typename VectorQR>
 void householder_qr_inplace_update(MatrixQR& mat, HCoeffs& hCoeffs, const VectorQR& newColumn,
                                    typename MatrixQR::Index k, typename MatrixQR::Scalar* tempData) {
@@ -534,7 +538,7 @@ void HouseholderQR<MatrixType>::computeInPlace() {
  * \sa class HouseholderQR
  */
 template <typename Derived>
-const HouseholderQR<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::householderQr() const {
+HouseholderQR<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::householderQr() const {
   return HouseholderQR<PlainObject>(eval());
 }
 
diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 31327948822..99598f58128 100644
--- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -28,6 +28,11 @@ namespace internal {
 template <typename SPQRType>
 struct traits<SPQRMatrixQReturnType<SPQRType> > {
   typedef typename SPQRType::MatrixType ReturnType;
+  typedef typename ReturnType::Scalar Scalar;
+  typedef typename ReturnType::StorageIndex StorageIndex;
+  typedef typename ReturnType::StorageKind StorageKind;
+  static constexpr int RowsAtCompileTime = Dynamic;
+  static constexpr int ColsAtCompileTime = Dynamic;
 };
 template <typename SPQRType>
 struct traits<SPQRMatrixQTransposeReturnType<SPQRType> > {
@@ -120,7 +125,9 @@ class SPQR : public SparseSolverBase<SPQR<MatrixType_> > {
     cholmod_l_free_sparse(&m_cR, &m_cc);
     cholmod_l_free_dense(&m_HTau, &m_cc);
     std::free(m_E);
+    m_E = nullptr;
     std::free(m_HPinv);
+    m_HPinv = nullptr;
   }
 
   void compute(const MatrixType_& matrix) {
@@ -150,6 +157,12 @@ class SPQR : public SparseSolverBase<SPQR<MatrixType_> > {
       m_isInitialized = false;
       return;
     }
+    if (!m_E && !initIdentityPermutation(m_cR->ncol)) {
+      SPQR_free();
+      m_info = NumericalIssue;
+      m_isInitialized = false;
+      return;
+    }
     m_info = Success;
     m_isInitialized = true;
     m_isRUpToDate = false;
@@ -193,7 +206,7 @@ class SPQR : public SparseSolverBase<SPQR<MatrixType_> > {
 
   /** \returns the sparse triangular factor R. It is a sparse matrix
    */
-  const MatrixType matrixR() const {
+  const MatrixType& matrixR() const {
     eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
     if (!m_isRUpToDate) {
       m_R = viewAsEigen<Scalar, StorageIndex>(*m_cR);
@@ -255,6 +268,16 @@ class SPQR : public SparseSolverBase<SPQR<MatrixType_> > {
   mutable cholmod_common m_cc;              // Workspace and parameters
   bool m_useDefaultThreshold;               // Use default threshold
   Index m_rows;
+
+  bool initIdentityPermutation(StorageIndex size) {  // false only when malloc fails
+    if (m_E || size == 0) return true;  // keep an existing m_E; nothing to build for size 0
+    // SuiteSparse can omit the permutation array when no column reordering is applied.
+    m_E = static_cast<StorageIndex*>(std::malloc(sizeof(StorageIndex) * size));
+    if (!m_E) return false;  // allocation failed, m_E stays null
+    for (StorageIndex i = 0; i < size; ++i) m_E[i] = i;  // identity: entry i holds i
+    return true;
+  }
+
   template <typename, typename>
   friend struct SPQR_QProduct;
 };
@@ -267,33 +290,102 @@ struct SPQR_QProduct : ReturnByValue<SPQR_QProduct<SPQRType, Derived> > {
   SPQR_QProduct(const SPQRType& spqr, const Derived& other, bool transpose)
       : m_spqr(spqr), m_other(other), m_transpose(transpose) {}
 
-  inline Index rows() const { return m_transpose ? m_spqr.rows() : m_spqr.cols(); }
+  inline Index rows() const { return m_spqr.rows(); }
   inline Index cols() const { return m_other.cols(); }
   // Assign to a vector
   template <typename ResType>
   void evalTo(ResType& res) const {
+    evalToImpl(res, m_other);
+  }
+
+ private:
+  template <typename ResType, typename OtherDerived,
+            typename std::enable_if<(int(OtherDerived::Flags) & DirectAccessBit) == DirectAccessBit, int>::type = 0>
+  void evalToImpl(ResType& res, const MatrixBase<OtherDerived>& otherExpr) const {
     cholmod_dense y_cd;
     cholmod_dense* x_cd;
     int method = m_transpose ? SPQR_QTX : SPQR_QX;
     cholmod_common* cc = m_spqr.cholmodCommon();
-    y_cd = viewAsCholmod(m_other.const_cast_derived());
+    y_cd = viewAsCholmod(otherExpr.const_cast_derived());
     x_cd = SuiteSparseQR_qmult<Scalar>(method, m_spqr.m_H, m_spqr.m_HTau, m_spqr.m_HPinv, &y_cd, cc);
     res = Matrix<Scalar, ResType::RowsAtCompileTime, ResType::ColsAtCompileTime>::Map(
         reinterpret_cast<Scalar*>(x_cd->x), x_cd->nrow, x_cd->ncol);
     cholmod_l_free_dense(&x_cd, cc);
   }
+
+  template <typename ResType, typename OtherDerived,
+            typename std::enable_if<(int(OtherDerived::Flags) & DirectAccessBit) == 0, int>::type = 0>
+  void evalToImpl(ResType& res, const MatrixBase<OtherDerived>& otherExpr) const {
+    cholmod_dense y_cd;   // view of the evaluated operand
+    cholmod_dense* x_cd;  // result returned by SuiteSparseQR_qmult
+    int method = m_transpose ? SPQR_QTX : SPQR_QX;
+    cholmod_common* cc = m_spqr.cholmodCommon();
+    typename OtherDerived::PlainObject other = otherExpr;  // no direct access: evaluate into a plain object
+    y_cd = viewAsCholmod(other);
+    x_cd = SuiteSparseQR_qmult<Scalar>(method, m_spqr.m_H, m_spqr.m_HTau, m_spqr.m_HPinv, &y_cd, cc);
+    res = Matrix<Scalar, ResType::RowsAtCompileTime, ResType::ColsAtCompileTime>::Map(
+        reinterpret_cast<Scalar*>(x_cd->x), x_cd->nrow, x_cd->ncol);
+    cholmod_l_free_dense(&x_cd, cc);  // release the qmult result after assigning it to res
+  }
+
+  template <typename ResType, typename OtherDerived>
+  void evalToImpl(ResType& res, const SparseMatrixBase<OtherDerived>& otherExpr) const {
+    cholmod_sparse y_cs;   // view of the evaluated operand
+    cholmod_sparse* x_cs;  // result returned by SuiteSparseQR_qmult
+    int method = m_transpose ? SPQR_QTX : SPQR_QX;
+    cholmod_common* cc = m_spqr.cholmodCommon();
+    typename OtherDerived::PlainObject other = otherExpr;  // evaluate the sparse expression
+    other.makeCompressed();  // compress the local copy before viewing it
+    y_cs = viewAsCholmod(other);
+    x_cs = SuiteSparseQR_qmult<Scalar>(method, m_spqr.m_H, m_spqr.m_HTau, m_spqr.m_HPinv, &y_cs, cc);
+    res = viewAsEigen<Scalar, StorageIndex>(*x_cs);
+    cholmod_l_free_sparse(&x_cs, cc);  // release the qmult result after assigning it to res
+  }
+
+  template <typename ResType, typename OtherScalar, int OtherOptions, typename OtherStorageIndex,
+            typename std::enable_if<internal::is_same<OtherStorageIndex, StorageIndex>::value, int>::type = 0>
+  void evalToImpl(ResType& res, const SparseMatrix<OtherScalar, OtherOptions, OtherStorageIndex>& otherExpr) const {
+    cholmod_sparse y_cs;   // view of the operand
+    cholmod_sparse* x_cs;  // result returned by SuiteSparseQR_qmult
+    int method = m_transpose ? SPQR_QTX : SPQR_QX;
+    cholmod_common* cc = m_spqr.cholmodCommon();
+    const SparseMatrix<OtherScalar, OtherOptions, OtherStorageIndex>* otherPtr = &otherExpr;
+    SparseMatrix<OtherScalar, OtherOptions, OtherStorageIndex> other;  // filled only for uncompressed input
+
+    if (!otherExpr.isCompressed()) {  // otherExpr is const, so compress a copy instead
+      other = otherExpr;
+      other.makeCompressed();
+      otherPtr = &other;  // use the compressed copy from here on
+    }
+
+    y_cs = viewAsCholmod(*otherPtr);
+    x_cs = SuiteSparseQR_qmult<Scalar>(method, m_spqr.m_H, m_spqr.m_HTau, m_spqr.m_HPinv, &y_cs, cc);
+    res = viewAsEigen<Scalar, StorageIndex>(*x_cs);
+    cholmod_l_free_sparse(&x_cs, cc);  // release the qmult result after assigning it to res
+  }
+
+ public:
   const SPQRType& m_spqr;
   const Derived& m_other;
   bool m_transpose;
 };
 template <typename SPQRType>
-struct SPQRMatrixQReturnType {
+struct SPQRMatrixQReturnType : public EigenBase<SPQRMatrixQReturnType<SPQRType> > {
+  typedef typename SPQRType::Scalar Scalar;
+  static constexpr int RowsAtCompileTime = Dynamic;
+  static constexpr int ColsAtCompileTime = Dynamic;
   SPQRMatrixQReturnType(const SPQRType& spqr) : m_spqr(spqr) {}
   template <typename Derived>
   SPQR_QProduct<SPQRType, Derived> operator*(const MatrixBase<Derived>& other) {
     return SPQR_QProduct<SPQRType, Derived>(m_spqr, other.derived(), false);
   }
+  template <typename Derived>
+  SPQR_QProduct<SPQRType, Derived> operator*(const SparseMatrixBase<Derived>& other) {
+    return SPQR_QProduct<SPQRType, Derived>(m_spqr, other.derived(), false);
+  }
   SPQRMatrixQTransposeReturnType<SPQRType> adjoint() const { return SPQRMatrixQTransposeReturnType<SPQRType>(m_spqr); }
+  inline Index rows() const { return m_spqr.rows(); }
+  inline Index cols() const { return m_spqr.rows(); }
   // To use for operations with the transpose of Q
   SPQRMatrixQTransposeReturnType<SPQRType> transpose() const {
     return SPQRMatrixQTransposeReturnType<SPQRType>(m_spqr);
@@ -308,8 +400,47 @@ struct SPQRMatrixQTransposeReturnType {
   SPQR_QProduct<SPQRType, Derived> operator*(const MatrixBase<Derived>& other) {
     return SPQR_QProduct<SPQRType, Derived>(m_spqr, other.derived(), true);
   }
+  template <typename Derived>
+  SPQR_QProduct<SPQRType, Derived> operator*(const SparseMatrixBase<Derived>& other) {
+    return SPQR_QProduct<SPQRType, Derived>(m_spqr, other.derived(), true);
+  }
   const SPQRType& m_spqr;
 };
 
+namespace internal {
+
+template <typename SPQRType>
+struct evaluator_traits<SPQRMatrixQReturnType<SPQRType> > {
+  typedef typename SPQRType::MatrixType MatrixType;
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseShape Shape;  // the Q expression is given a sparse shape
+};
+
+template <typename DstXprType, typename SPQRType>
+struct Assignment<DstXprType, SPQRMatrixQReturnType<SPQRType>,
+                  internal::assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Sparse2Sparse> {
+  typedef SPQRMatrixQReturnType<SPQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+
+  static void run(DstXprType& dst, const SrcXprType& src, const internal::assign_op<Scalar, Scalar>& /*func*/) {
+    typename DstXprType::PlainObject idMat(src.rows(), src.cols());
+    idMat.setIdentity();
+    dst = src.m_spqr.matrixQ() * idMat;  // Q is obtained as the product Q * I
+  }
+};
+
+template <typename DstXprType, typename SPQRType>
+struct Assignment<DstXprType, SPQRMatrixQReturnType<SPQRType>,
+                  internal::assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Sparse2Dense> {
+  typedef SPQRMatrixQReturnType<SPQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+
+  static void run(DstXprType& dst, const SrcXprType& src, const internal::assign_op<Scalar, Scalar>& /*func*/) {
+    dst = src.m_spqr.matrixQ() * DstXprType::Identity(src.rows(), src.cols());  // dense case: Q * I
+  }
+};
+
+}  // namespace internal
+
 }  // End namespace Eigen
 #endif
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 6fab905e54b..2a8499a0879 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -19,27 +19,15 @@
 
 #ifndef EIGEN_BDCSVD_H
 #define EIGEN_BDCSVD_H
-// #define EIGEN_BDCSVD_DEBUG_VERBOSE
-// #define EIGEN_BDCSVD_SANITY_CHECKS
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-#undef eigen_internal_assert
-#define eigen_internal_assert(X) assert(X);
-#endif
 
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"
 
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-#include <iostream>
-#endif
+// Internal D&C implementation, templated only on RealScalar.
+#include "BDCSVDImpl.h"
 
 namespace Eigen {
 
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-IOFormat bdcsvdfmt(8, 0, ", ", "\n", "  [", "]");
-#endif
-
 template <typename MatrixType_, int Options>
 class BDCSVD;
 
@@ -99,8 +87,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
   typedef typename Base::Index Index;
   enum {
     Options = Options_,
-    QRDecomposition = Options & internal::QRPreconditionerBits,
-    ComputationOptions = Options & internal::ComputationOptionsBits,
+    QRDecomposition = internal::get_qr_preconditioner(Options),
+    ComputationOptions = internal::get_computation_options(Options),
     RowsAtCompileTime = Base::RowsAtCompileTime,
     ColsAtCompileTime = Base::ColsAtCompileTime,
     DiagSizeAtCompileTime = Base::DiagSizeAtCompileTime,
@@ -127,7 +115,7 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * The default constructor is useful in cases in which the user intends to
    * perform decompositions via BDCSVD::compute(const MatrixType&).
    */
-  BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) {}
+  BDCSVD() : m_isTranspose(false), m_numIters(0) {}
 
   /** \brief Default Constructor with memory preallocation
    *
@@ -135,9 +123,7 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * according to the specified problem size and \a Options template parameter.
    * \sa BDCSVD()
    */
-  BDCSVD(Index rows, Index cols) : m_algoswap(16), m_numIters(0) {
-    allocate(rows, cols, internal::get_computation_options(Options));
-  }
+  BDCSVD(Index rows, Index cols) : m_numIters(0) { allocate(rows, cols, internal::get_computation_options(Options)); }
 
   /** \brief Default Constructor with memory preallocation
    *
@@ -155,7 +141,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * \deprecated Will be removed in the next major Eigen version. Options should
    * be specified in the \a Options template parameter.
    */
-  EIGEN_DEPRECATED BDCSVD(Index rows, Index cols, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD(Index rows, Index cols, unsigned int computationOptions) : m_numIters(0) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, rows, cols);
     allocate(rows, cols, computationOptions);
   }
@@ -166,10 +153,24 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * \param matrix the matrix to decompose
    */
   template <typename Derived>
-  BDCSVD(const MatrixBase<Derived>& matrix) : m_algoswap(16), m_numIters(0) {
+  BDCSVD(const MatrixBase<Derived>& matrix) : m_numIters(0) {
     compute_impl(matrix, internal::get_computation_options(Options));
   }
 
+  /** \brief Constructor performing the SVD of an upper bidiagonal matrix given its diagonal and superdiagonal.
+   *
+   * This skips the bidiagonalization step and directly runs the divide-and-conquer algorithm.
+   * The input vectors must be real-valued. For an n x n bidiagonal matrix, \a diagonal has n entries
+   * and \a superdiagonal has n-1 entries.
+   *
+   * \param diagonal the diagonal entries of the bidiagonal matrix
+   * \param superdiagonal the superdiagonal entries of the bidiagonal matrix
+   */
+  template <typename DerivedD, typename DerivedE>
+  BDCSVD(const MatrixBase<DerivedD>& diagonal, const MatrixBase<DerivedE>& superdiagonal) : m_numIters(0) {
+    compute_bidiagonal_impl(diagonal, superdiagonal, internal::get_computation_options(Options));
+  }
+
   /** \brief Constructor performing the decomposition of given matrix using specified options
    *         for computing unitaries.
    *
@@ -183,8 +184,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * be specified in the \a Options template parameter.
    */
   template <typename Derived>
-  EIGEN_DEPRECATED BDCSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions)
-      : m_algoswap(16), m_numIters(0) {
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions) : m_numIters(0) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
     compute_impl(matrix, computationOptions);
   }
@@ -211,53 +212,53 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * be specified in the \a Options template parameter.
    */
   template <typename Derived>
-  EIGEN_DEPRECATED BDCSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
     return compute_impl(matrix, computationOptions);
   }
 
+  /** \brief Compute the SVD of an upper bidiagonal matrix given its diagonal and superdiagonal.
+   *
+   * This skips the bidiagonalization step and directly runs the divide-and-conquer algorithm.
+   * The input vectors must be real-valued. For an n x n bidiagonal matrix, \a diagonal has n entries
+   * and \a superdiagonal has n-1 entries.
+   *
+   * \param diagonal the diagonal entries of the bidiagonal matrix
+   * \param superdiagonal the superdiagonal entries of the bidiagonal matrix
+   */
+  template <typename DerivedD, typename DerivedE>
+  BDCSVD& compute(const MatrixBase<DerivedD>& diagonal, const MatrixBase<DerivedE>& superdiagonal) {
+    return compute_bidiagonal_impl(diagonal, superdiagonal, m_computationOptions);
+  }
+
   void setSwitchSize(int s) {
     eigen_assert(s >= 3 && "BDCSVD the size of the algo switch has to be at least 3.");
-    m_algoswap = s;
+    m_impl.setAlgoSwap(s);
   }
 
  private:
   template <typename Derived>
   BDCSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
-  void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
-  void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
-  void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals,
-                       ArrayRef shifts, ArrayRef mus);
-  void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals,
-                   const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat);
-  void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals,
-                       const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V);
-  void deflation43(Index firstCol, Index shift, Index i, Index size);
-  void deflation44(Index firstColu, Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
-  void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
+  template <typename DerivedD, typename DerivedE>
+  BDCSVD& compute_bidiagonal_impl(const MatrixBase<DerivedD>& diagonal, const MatrixBase<DerivedE>& superdiagonal,
+                                  unsigned int computationOptions);
   template <typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
   void copyUV(const HouseholderU& householderU, const HouseholderV& householderV, const NaiveU& naiveU,
               const NaiveV& naivev);
-  void structured_update(Block<MatrixXr, Dynamic, Dynamic> A, const MatrixXr& B, Index n1);
-  static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
-                              const ArrayRef& diagShifted, RealScalar shift);
-  template <typename SVDType>
-  void computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW, Index firstColW, Index shift);
 
  protected:
   void allocate(Index rows, Index cols, unsigned int computationOptions);
-  MatrixXr m_naiveU, m_naiveV;
-  MatrixXr m_computed;
-  Index m_nRec;
-  ArrayXr m_workspace;
-  ArrayXi m_workspaceI;
-  int m_algoswap;
-  bool m_isTranspose, m_compU, m_compV, m_useQrDecomp;
-  JacobiSVD<MatrixType, ComputationOptions> smallSvd;
+  internal::bdcsvd_impl<RealScalar> m_impl;
+  bool m_isTranspose, m_useQrDecomp;
+  JacobiSVD<MatrixX> smallSvd;
   HouseholderQR<MatrixX> qrDecomp;
   internal::UpperBidiagonalization<MatrixX> bid;
   MatrixX copyWorkspace;
   MatrixX reducedTriangle;
+  // Reused workspace for HouseholderSequence::applyThisOnTheLeft in copyUV().
+  // Without this, each apply allocates a fresh row vector.
+  Matrix<Scalar, 1, Dynamic, RowMajor> m_householderWorkspace;
 
   using Base::m_computationOptions;
   using Base::m_computeThinU;
@@ -278,14 +279,16 @@ template <typename MatrixType, int Options>
 void BDCSVD<MatrixType, Options>::allocate(Index rows, Index cols, unsigned int computationOptions) {
   if (Base::allocate(rows, cols, computationOptions)) return;
 
-  if (cols < m_algoswap)
+  if (cols < m_impl.algoSwap())
     smallSvd.allocate(rows, cols, Options == 0 ? computationOptions : internal::get_computation_options(Options));
 
-  m_computed = MatrixXr::Zero(diagSize() + 1, diagSize());
-  m_compU = computeV();
-  m_compV = computeU();
   m_isTranspose = (cols > rows);
-  if (m_isTranspose) std::swap(m_compU, m_compV);
+
+  bool compU = computeV();
+  bool compV = computeU();
+  if (m_isTranspose) std::swap(compU, compV);
+
+  m_impl.allocate(diagSize(), compU, compV);
 
   // kMinAspectRatio is the crossover point that determines if we perform R-Bidiagonalization
   // or bidiagonalize the input matrix directly.
@@ -302,30 +305,16 @@ void BDCSVD<MatrixType, Options>::allocate(Index rows, Index cols, unsigned int
   copyWorkspace = MatrixX(m_isTranspose ? cols : rows, m_isTranspose ? rows : cols);
   bid = internal::UpperBidiagonalization<MatrixX>(m_useQrDecomp ? diagSize() : copyWorkspace.rows(),
                                                   m_useQrDecomp ? diagSize() : copyWorkspace.cols());
-
-  if (m_compU)
-    m_naiveU = MatrixXr::Zero(diagSize() + 1, diagSize() + 1);
-  else
-    m_naiveU = MatrixXr::Zero(2, diagSize() + 1);
-
-  if (m_compV) m_naiveV = MatrixXr::Zero(diagSize(), diagSize());
-
-  m_workspace.resize((diagSize() + 1) * (diagSize() + 1) * 3);
-  m_workspaceI.resize(3 * diagSize());
 }  // end allocate
 
 template <typename MatrixType, int Options>
 template <typename Derived>
-BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
-                                                                       unsigned int computationOptions) {
+EIGEN_DONT_INLINE BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(
+    const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
   EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
                       Input matrix must have the same Scalar type as the BDCSVD object.);
 
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "\n\n\n================================================================================================="
-               "=====================\n\n\n";
-#endif
   using std::abs;
 
   allocate(matrix.rows(), matrix.cols(), computationOptions);
@@ -333,7 +322,7 @@ BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const Mat
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
 
   //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
-  if (matrix.cols() < m_algoswap) {
+  if (matrix.cols() < m_impl.algoSwap()) {
     smallSvd.compute(matrix);
     m_isInitialized = true;
     m_info = smallSvd.info();
@@ -375,12 +364,19 @@ BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const Mat
   }
 
   //**** step 2 - Divide & Conquer
-  m_naiveU.setZero();
-  m_naiveV.setZero();
-  // FIXME this line involves a temporary matrix
-  m_computed.topRows(diagSize()) = bid.bidiagonal().toDenseMatrix().transpose();
-  m_computed.template bottomRows<1>().setZero();
-  divide(0, diagSize() - 1, 0, 0, 0);
+  m_impl.naiveU().setZero();
+  m_impl.naiveV().setZero();
+  // The transposed bidiagonal has only the main diagonal and one sub-diagonal;
+  // fill those directly instead of materializing a dense temporary.
+  // Note: BandMatrix::diagonal<N>() const has a latent type bug (returns
+  // Block<CoefficientsType, ...> instead of Block<const CoefficientsType, ...>),
+  // so use the index-based overload which is correctly const-qualified.
+  m_impl.computed().setZero();
+  m_impl.computed().topRows(diagSize()).diagonal() = bid.bidiagonal().diagonal();
+  m_impl.computed().topRows(diagSize()).template diagonal<-1>() = bid.bidiagonal().diagonal(1);
+  m_impl.divide(0, diagSize() - 1, 0, 0, 0);
+  m_info = m_impl.info();
+  m_numIters = m_impl.numIters();
   if (m_info != Success && m_info != NoConvergence) {
     m_isInitialized = true;
     return *this;
@@ -388,7 +384,7 @@ BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const Mat
 
   //**** step 3 - Copy singular values and vectors
   for (int i = 0; i < diagSize(); i++) {
-    RealScalar a = abs(m_computed.coeff(i, i));
+    RealScalar a = abs(m_impl.computed().coeff(i, i));
     m_singularValues.coeffRef(i) = a * scale;
     if (a < considerZero) {
       m_nonzeroSingularValues = i;
@@ -402,9 +398,9 @@ BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const Mat
 
   //**** step 4 - Finalize unitaries U and V
   if (m_isTranspose)
-    copyUV(bid.householderV(), bid.householderU(), m_naiveV, m_naiveU);
+    copyUV(bid.householderV(), bid.householderU(), m_impl.naiveV(), m_impl.naiveU());
   else
-    copyUV(bid.householderU(), bid.householderV(), m_naiveU, m_naiveV);
+    copyUV(bid.householderU(), bid.householderV(), m_impl.naiveU(), m_impl.naiveV());
 
   if (m_useQrDecomp) {
     if (m_isTranspose && computeV())
@@ -419,1024 +415,151 @@ BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const Mat
 
 template <typename MatrixType, int Options>
 template <typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
-void BDCSVD<MatrixType, Options>::copyUV(const HouseholderU& householderU, const HouseholderV& householderV,
-                                         const NaiveU& naiveU, const NaiveV& naiveV) {
-  // Note exchange of U and V: m_matrixU is set from m_naiveV and vice versa
+EIGEN_DONT_INLINE void BDCSVD<MatrixType, Options>::copyUV(const HouseholderU& householderU,
+                                                           const HouseholderV& householderV, const NaiveU& naiveU,
+                                                           const NaiveV& naiveV) {
+  // Note exchange of U and V: m_matrixU is set from naiveV and m_matrixV from naiveU.
+  // Cast the diagSize x diagSize block (rather than the full naive matrix) to avoid materializing
+  // a full-size temporary when Scalar != RealScalar; reuse m_householderWorkspace across the two
+  // applyThisOnTheLeft calls so each does not allocate a fresh row vector.
   if (computeU()) {
     Index Ucols = m_computeThinU ? diagSize() : rows();
     m_matrixU = MatrixX::Identity(rows(), Ucols);
     m_matrixU.topLeftCorner(diagSize(), diagSize()) =
-        naiveV.template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
-    // FIXME the following conditionals involve temporary buffers
-    if (m_useQrDecomp)
-      m_matrixU.topLeftCorner(householderU.cols(), diagSize()).applyOnTheLeft(householderU);
-    else
-      m_matrixU.applyOnTheLeft(householderU);
+        naiveV.topLeftCorner(diagSize(), diagSize()).template cast<Scalar>();
+    if (m_useQrDecomp) {
+      auto sub = m_matrixU.topLeftCorner(householderU.cols(), diagSize());
+      householderU.applyThisOnTheLeft(sub, m_householderWorkspace);
+    } else {
+      householderU.applyThisOnTheLeft(m_matrixU, m_householderWorkspace);
+    }
   }
   if (computeV()) {
     Index Vcols = m_computeThinV ? diagSize() : cols();
     m_matrixV = MatrixX::Identity(cols(), Vcols);
     m_matrixV.topLeftCorner(diagSize(), diagSize()) =
-        naiveU.template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
-    // FIXME the following conditionals involve temporary buffers
-    if (m_useQrDecomp)
-      m_matrixV.topLeftCorner(householderV.cols(), diagSize()).applyOnTheLeft(householderV);
-    else
-      m_matrixV.applyOnTheLeft(householderV);
-  }
-}
-
-/** \internal
- * Performs A = A * B exploiting the special structure of the matrix A. Splitting A as:
- *  A = [A1]
- *      [A2]
- * such that A1.rows()==n1, then we assume that at least half of the columns of A1 and A2 are zeros.
- * We can thus pack them prior to the the matrix product. However, this is only worth the effort if the matrix is large
- * enough.
- */
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::structured_update(Block<MatrixXr, Dynamic, Dynamic> A, const MatrixXr& B, Index n1) {
-  Index n = A.rows();
-  if (n > 100) {
-    // If the matrices are large enough, let's exploit the sparse structure of A by
-    // splitting it in half (wrt n1), and packing the non-zero columns.
-    Index n2 = n - n1;
-    Map<MatrixXr> A1(m_workspace.data(), n1, n);
-    Map<MatrixXr> A2(m_workspace.data() + n1 * n, n2, n);
-    Map<MatrixXr> B1(m_workspace.data() + n * n, n, n);
-    Map<MatrixXr> B2(m_workspace.data() + 2 * n * n, n, n);
-    Index k1 = 0, k2 = 0;
-    for (Index j = 0; j < n; ++j) {
-      if ((A.col(j).head(n1).array() != Literal(0)).any()) {
-        A1.col(k1) = A.col(j).head(n1);
-        B1.row(k1) = B.row(j);
-        ++k1;
-      }
-      if ((A.col(j).tail(n2).array() != Literal(0)).any()) {
-        A2.col(k2) = A.col(j).tail(n2);
-        B2.row(k2) = B.row(j);
-        ++k2;
-      }
+        naiveU.topLeftCorner(diagSize(), diagSize()).template cast<Scalar>();
+    if (m_useQrDecomp) {
+      auto sub = m_matrixV.topLeftCorner(householderV.cols(), diagSize());
+      householderV.applyThisOnTheLeft(sub, m_householderWorkspace);
+    } else {
+      householderV.applyThisOnTheLeft(m_matrixV, m_householderWorkspace);
     }
-
-    A.topRows(n1).noalias() = A1.leftCols(k1) * B1.topRows(k1);
-    A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
-  } else {
-    Map<MatrixXr, Aligned> tmp(m_workspace.data(), n, n);
-    tmp.noalias() = A * B;
-    A = tmp;
   }
 }
 
 template <typename MatrixType, int Options>
-template <typename SVDType>
-void BDCSVD<MatrixType, Options>::computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW,
-                                                  Index firstColW, Index shift) {
-  svd.compute(m_computed.block(firstCol, firstCol, n + 1, n));
-  m_info = svd.info();
-  if (m_info != Success && m_info != NoConvergence) return;
-  if (m_compU)
-    m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = svd.matrixU();
-  else {
-    m_naiveU.row(0).segment(firstCol, n + 1).real() = svd.matrixU().row(0);
-    m_naiveU.row(1).segment(firstCol, n + 1).real() = svd.matrixU().row(n);
-  }
-  if (m_compV) m_naiveV.block(firstRowW, firstColW, n, n).real() = svd.matrixV();
-  m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero();
-  m_computed.diagonal().segment(firstCol + shift, n) = svd.singularValues().head(n);
-}
+template <typename DerivedD, typename DerivedE>
+EIGEN_DONT_INLINE BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_bidiagonal_impl(
+    const MatrixBase<DerivedD>& diagonal, const MatrixBase<DerivedE>& superdiagonal, unsigned int computationOptions) {
+  EIGEN_STATIC_ASSERT(DerivedD::IsVectorAtCompileTime, THIS_METHOD_IS_ONLY_FOR_VECTORS);
+  EIGEN_STATIC_ASSERT(DerivedE::IsVectorAtCompileTime, THIS_METHOD_IS_ONLY_FOR_VECTORS);
+  EIGEN_STATIC_ASSERT((NumTraits<typename DerivedD::Scalar>::IsComplex == 0),
+                      THIS_FUNCTION_IS_NOT_FOR_COMPLEX_VALUED_MATRICES);
+  EIGEN_STATIC_ASSERT((NumTraits<typename DerivedE::Scalar>::IsComplex == 0),
+                      THIS_FUNCTION_IS_NOT_FOR_COMPLEX_VALUED_MATRICES);
 
-// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods
-// takes as argument the place of the submatrix we are currently working on.
-
-//@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
-//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU;
-// lastCol + 1 - firstCol is the size of the submatrix.
-//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section
-// 1 for more information on W)
-//@param firstColW : Same as firstRowW with the column.
-//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the
-// last column of the U submatrix
-// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the
-// reference paper.
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) {
-  // requires rows = cols + 1;
   using std::abs;
-  using std::pow;
-  using std::sqrt;
-  const Index n = lastCol - firstCol + 1;
-  const Index k = n / 2;
-  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
-  RealScalar alphaK;
-  RealScalar betaK;
-  RealScalar r0;
-  RealScalar lambda, phi, c0, s0;
-  VectorType l, f;
-  // We use the other algorithm which is more efficient for small
-  // matrices.
-  if (n < m_algoswap) {
-    // FIXME this block involves temporaries
-    if (m_compV) {
-      JacobiSVD<MatrixXr, ComputeFullU | ComputeFullV> baseSvd;
-      computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift);
-    } else {
-      JacobiSVD<MatrixXr, ComputeFullU> baseSvd;
-      computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift);
-    }
-    return;
-  }
-  // We use the divide and conquer algorithm
-  alphaK = m_computed(firstCol + k, firstCol + k);
-  betaK = m_computed(firstCol + k + 1, firstCol + k);
-  // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
-  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the
-  // right submatrix before the left one.
-  divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
-  if (m_info != Success && m_info != NoConvergence) return;
-  divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
-  if (m_info != Success && m_info != NoConvergence) return;
-
-  if (m_compU) {
-    lambda = m_naiveU(firstCol + k, firstCol + k);
-    phi = m_naiveU(firstCol + k + 1, lastCol + 1);
-  } else {
-    lambda = m_naiveU(1, firstCol + k);
-    phi = m_naiveU(0, lastCol + 1);
-  }
-  r0 = sqrt((abs(alphaK * lambda) * abs(alphaK * lambda)) + abs(betaK * phi) * abs(betaK * phi));
-  if (m_compU) {
-    l = m_naiveU.row(firstCol + k).segment(firstCol, k);
-    f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
-  } else {
-    l = m_naiveU.row(1).segment(firstCol, k);
-    f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
-  }
-  if (m_compV) m_naiveV(firstRowW + k, firstColW) = Literal(1);
-  if (r0 < considerZero) {
-    c0 = Literal(1);
-    s0 = Literal(0);
-  } else {
-    c0 = alphaK * lambda / r0;
-    s0 = betaK * phi / r0;
-  }
+  const Index n = diagonal.size();
+  eigen_assert((n == 0 || superdiagonal.size() == n - 1) && "superdiagonal must have size diagonal.size() - 1");
 
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(m_naiveU.allFinite());
-  eigen_internal_assert(m_naiveV.allFinite());
-  eigen_internal_assert(m_computed.allFinite());
-#endif
+  // For a bidiagonal matrix, rows == cols == n.
+  allocate(n, n, computationOptions);
 
-  if (m_compU) {
-    MatrixXr q1(m_naiveU.col(firstCol + k).segment(firstCol, k + 1));
-    // we shiftW Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--)
-      m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1);
-    // we shift q1 at the left with a factor c0
-    m_naiveU.col(firstCol).segment(firstCol, k + 1) = (q1 * c0);
-    // last column = q1 * - s0
-    m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * (-s0));
-    // first column = q2 * s0
-    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) =
-        m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0;
-    // q2 *= c0
-    m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0;
-  } else {
-    RealScalar q1 = m_naiveU(0, firstCol + k);
-    // we shift Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--) m_naiveU(0, i + 1) = m_naiveU(0, i);
-    // we shift q1 at the left with a factor c0
-    m_naiveU(0, firstCol) = (q1 * c0);
-    // last column = q1 * - s0
-    m_naiveU(0, lastCol + 1) = (q1 * (-s0));
-    // first column = q2 * s0
-    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) * s0;
-    // q2 *= c0
-    m_naiveU(1, lastCol + 1) *= c0;
-    m_naiveU.row(1).segment(firstCol + 1, k).setZero();
-    m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
+  if (n == 0) {
+    m_isInitialized = true;
+    m_info = Success;
+    m_nonzeroSingularValues = 0;
+    return *this;
   }
 
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(m_naiveU.allFinite());
-  eigen_internal_assert(m_naiveV.allFinite());
-  eigen_internal_assert(m_computed.allFinite());
-#endif
-
-  m_computed(firstCol + shift, firstCol + shift) = r0;
-  m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose().real();
-  m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose().real();
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  ArrayXr tmp1 = (m_computed.block(firstCol + shift, firstCol + shift, n, n)).jacobiSvd().singularValues();
-#endif
-  // Second part: try to deflate singular values in combined matrix
-  deflation(firstCol, lastCol, k, firstRowW, firstColW, shift);
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  ArrayXr tmp2 = (m_computed.block(firstCol + shift, firstCol + shift, n, n)).jacobiSvd().singularValues();
-  std::cout << "\n\nj1 = " << tmp1.transpose().format(bdcsvdfmt) << "\n";
-  std::cout << "j2 = " << tmp2.transpose().format(bdcsvdfmt) << "\n\n";
-  std::cout << "err:      " << ((tmp1 - tmp2).abs() > 1e-12 * tmp2.abs()).transpose() << "\n";
-  static int count = 0;
-  std::cout << "# " << ++count << "\n\n";
-  eigen_internal_assert((tmp1 - tmp2).matrix().norm() < 1e-14 * tmp2.matrix().norm());
-//   eigen_internal_assert(count<681);
-//   eigen_internal_assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all());
-#endif
-
-  // Third part: compute SVD of combined matrix
-  MatrixXr UofSVD, VofSVD;
-  VectorType singVals;
-  computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD);
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(UofSVD.allFinite());
-  eigen_internal_assert(VofSVD.allFinite());
-#endif
-
-  if (m_compU)
-    structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n + 2) / 2);
-  else {
-    Map<Matrix<RealScalar, 2, Dynamic>, Aligned> tmp(m_workspace.data(), 2, n + 1);
-    tmp.noalias() = m_naiveU.middleCols(firstCol, n + 1) * UofSVD;
-    m_naiveU.middleCols(firstCol, n + 1) = tmp;
+  // Compute the scaling factor (largest absolute entry) and reject non-finite inputs.
+  const RealScalar diagScale = diagonal.cwiseAbs().template maxCoeff<PropagateNaN>();
+  const RealScalar superdiagScale = n > 1 ? superdiagonal.cwiseAbs().template maxCoeff<PropagateNaN>() : RealScalar(0);
+  RealScalar scale = numext::maxi(diagScale, superdiagScale);
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    return *this;
   }
 
-  if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n + 1) / 2);
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(m_naiveU.allFinite());
-  eigen_internal_assert(m_naiveV.allFinite());
-  eigen_internal_assert(m_computed.allFinite());
-#endif
-
-  m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero();
-  m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals;
-}  // end divide
-
-// Compute SVD of m_computed.block(firstCol, firstCol, n + 1, n); this block only has non-zeros in
-// the first column and on the diagonal and has undergone deflation, so diagonal is in increasing
-// order except for possibly the (0,0) entry. The computed SVD is stored U, singVals and V, except
-// that if m_compV is false, then V is not computed. Singular values are sorted in decreasing order.
-//
-// TODO Opportunities for optimization: better root finding algo, better stopping criterion, better
-// handling of round-off errors, be consistent in ordering
-// For instance, to solve the secular equation using FMM, see
-// http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals,
-                                                MatrixXr& V) {
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
-  using std::abs;
-  ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
-  m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
-  ArrayRef diag = m_workspace.head(n);
-  diag(0) = Literal(0);
-
-  // Allocate space for singular values and vectors
-  singVals.resize(n);
-  U.resize(n + 1, n + 1);
-  if (m_compV) V.resize(n, n);
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  if (col0.hasNaN() || diag.hasNaN()) std::cout << "\n\nHAS NAN\n\n";
-#endif
-
-  // Many singular values might have been deflated, the zero ones have been moved to the end,
-  // but others are interleaved and we must ignore them at this stage.
-  // To this end, let's compute a permutation skipping them:
-  Index actual_n = n;
-  while (actual_n > 1 && numext::is_exactly_zero(diag(actual_n - 1))) {
-    --actual_n;
-    eigen_internal_assert(numext::is_exactly_zero(col0(actual_n)));
-  }
-  Index m = 0;  // size of the deflated problem
-  for (Index k = 0; k < actual_n; ++k)
-    if (abs(col0(k)) > considerZero) m_workspaceI(m++) = k;
-  Map<ArrayXi> perm(m_workspaceI.data(), m);
-
-  Map<ArrayXr> shifts(m_workspace.data() + 1 * n, n);
-  Map<ArrayXr> mus(m_workspace.data() + 2 * n, n);
-  Map<ArrayXr> zhat(m_workspace.data() + 3 * n, n);
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "computeSVDofM using:\n";
-  std::cout << "  z: " << col0.transpose() << "\n";
-  std::cout << "  d: " << diag.transpose() << "\n";
-#endif
-
-  // Compute singVals, shifts, and mus
-  computeSingVals(col0, diag, perm, singVals, shifts, mus);
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "  j:        "
-            << (m_computed.block(firstCol, firstCol, n, n)).jacobiSvd().singularValues().transpose().reverse()
-            << "\n\n";
-  std::cout << "  sing-val: " << singVals.transpose() << "\n";
-  std::cout << "  mu:       " << mus.transpose() << "\n";
-  std::cout << "  shift:    " << shifts.transpose() << "\n";
-
-  {
-    std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
-    std::cout << "    check1 (expect0) : "
-              << ((singVals.array() - (shifts + mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
-    eigen_internal_assert((((singVals.array() - (shifts + mus)) / singVals.array()).head(actual_n) >= 0).all());
-    std::cout << "    check2 (>0)      : " << ((singVals.array() - diag) / singVals.array()).head(actual_n).transpose()
-              << "\n\n";
-    eigen_internal_assert((((singVals.array() - diag) / singVals.array()).head(actual_n) >= 0).all());
-  }
-#endif
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(singVals.allFinite());
-  eigen_internal_assert(mus.allFinite());
-  eigen_internal_assert(shifts.allFinite());
-#endif
-
-  // Compute zhat
-  perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat);
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "  zhat: " << zhat.transpose() << "\n";
-#endif
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(zhat.allFinite());
-#endif
-
-  computeSingVecs(zhat, diag, perm, singVals, shifts, mus, U, V);
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "U^T U: " << (U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(), U.cols()))).norm() << "\n";
-  std::cout << "V^T V: " << (V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(), V.cols()))).norm() << "\n";
-#endif
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(m_naiveU.allFinite());
-  eigen_internal_assert(m_naiveV.allFinite());
-  eigen_internal_assert(m_computed.allFinite());
-  eigen_internal_assert(U.allFinite());
-  eigen_internal_assert(V.allFinite());
-//   eigen_internal_assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() <
-//   100*NumTraits<RealScalar>::epsilon() * n); eigen_internal_assert((V.transpose() * V -
-//   MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
-#endif
+  if (numext::is_exactly_zero(scale)) scale = Literal(1);
 
-  // Because of deflation, the singular values might not be completely sorted.
-  // Fortunately, reordering them is a O(n) problem
-  for (Index i = 0; i < actual_n - 1; ++i) {
-    if (singVals(i) > singVals(i + 1)) {
-      using std::swap;
-      swap(singVals(i), singVals(i + 1));
-      U.col(i).swap(U.col(i + 1));
-      if (m_compV) V.col(i).swap(V.col(i + 1));
+  //**** Small problem: build dense bidiagonal and delegate to JacobiSVD.
+  if (n < m_impl.algoSwap()) {
+    // Build the dense upper bidiagonal matrix.
+    MatrixX B = MatrixX::Zero(n, n);
+    B.diagonal() = diagonal.template cast<Scalar>() / Scalar(scale);
+    if (n > 1) B.diagonal(1) = superdiagonal.template cast<Scalar>() / Scalar(scale);
+    smallSvd.compute(B);
+    m_isInitialized = true;
+    m_info = smallSvd.info();
+    if (m_info == Success || m_info == NoConvergence) {
+      m_singularValues = smallSvd.singularValues() * scale;
+      m_nonzeroSingularValues = smallSvd.nonzeroSingularValues();
+      if (computeU()) m_matrixU = smallSvd.matrixU();
+      if (computeV()) m_matrixV = smallSvd.matrixV();
     }
+    return *this;
   }
 
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  {
-    bool singular_values_sorted =
-        (((singVals.segment(1, actual_n - 1) - singVals.head(actual_n - 1))).array() >= 0).all();
-    if (!singular_values_sorted)
-      std::cout << "Singular values are not sorted: " << singVals.segment(1, actual_n).transpose() << "\n";
-    eigen_internal_assert(singular_values_sorted);
+  //**** Fill m_computed with transposed bidiagonal format.
+  // D&C operates on B^T: m_computed(i,i) = d_i, m_computed(i+1,i) = e_i.
+  m_impl.naiveU().setZero();
+  m_impl.naiveV().setZero();
+  m_impl.computed().setZero();
+  for (Index i = 0; i < n; ++i) {
+    m_impl.computed()(i, i) = RealScalar(diagonal.coeff(i)) / scale;
   }
-#endif
-
-  // Reverse order so that singular values in increased order
-  // Because of deflation, the zeros singular-values are already at the end
-  singVals.head(actual_n).reverseInPlace();
-  U.leftCols(actual_n).rowwise().reverseInPlace();
-  if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n));
-  std::cout << "  * j:        " << jsvd.singularValues().transpose() << "\n\n";
-  std::cout << "  * sing-val: " << singVals.transpose() << "\n";
-//   std::cout << "  * err:      " << ((jsvd.singularValues()-singVals)>1e-13*singVals.norm()).transpose() << "\n";
-#endif
-}
-
-template <typename MatrixType, int Options>
-typename BDCSVD<MatrixType, Options>::RealScalar BDCSVD<MatrixType, Options>::secularEq(
-    RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const ArrayRef& diagShifted,
-    RealScalar shift) {
-  Index m = perm.size();
-  RealScalar res = Literal(1);
-  for (Index i = 0; i < m; ++i) {
-    Index j = perm(i);
-    // The following expression could be rewritten to involve only a single division,
-    // but this would make the expression more sensitive to overflow.
-    res += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu));
+  for (Index i = 0; i < n - 1; ++i) {
+    m_impl.computed()(i + 1, i) = RealScalar(superdiagonal.coeff(i)) / scale;
   }
-  return res;
-}
 
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
-                                                  VectorType& singVals, ArrayRef shifts, ArrayRef mus) {
-  using std::abs;
-  using std::sqrt;
-  using std::swap;
-
-  Index n = col0.size();
-  Index actual_n = n;
-  // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above
-  // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value.
-  while (actual_n > 1 && numext::is_exactly_zero(col0(actual_n - 1))) --actual_n;
-
-  for (Index k = 0; k < n; ++k) {
-    if (numext::is_exactly_zero(col0(k)) || actual_n == 1) {
-      // if col0(k) == 0, then entry is deflated, so singular value is on diagonal
-      // if actual_n==1, then the deflated problem is already diagonalized
-      singVals(k) = k == 0 ? col0(0) : diag(k);
-      mus(k) = Literal(0);
-      shifts(k) = k == 0 ? col0(0) : diag(k);
-      continue;
-    }
-
-    // otherwise, use secular equation to find singular value
-    RealScalar left = diag(k);
-    RealScalar right;  // was: = (k != actual_n-1) ? diag(k+1) : (diag(actual_n-1) + col0.matrix().norm());
-    if (k == actual_n - 1)
-      right = (diag(actual_n - 1) + col0.matrix().norm());
-    else {
-      // Skip deflated singular values,
-      // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside.
-      // This should be equivalent to using perm[]
-      Index l = k + 1;
-      while (numext::is_exactly_zero(col0(l))) {
-        ++l;
-        eigen_internal_assert(l < actual_n);
-      }
-      right = diag(l);
-    }
-
-    // first decide whether it's closer to the left end or the right end
-    RealScalar mid = left + (right - left) / Literal(2);
-    RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-    std::cout << "right-left = " << right - left << "\n";
-    //     std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left)
-    //                            << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right)   <<
-    //                            "\n";
-    std::cout << "     = " << secularEq(left + RealScalar(0.000001) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.1) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.2) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.3) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.4) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.49) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.5) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.51) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.6) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.7) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.8) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.9) * (right - left), col0, diag, perm, diag, 0) << " "
-              << secularEq(left + RealScalar(0.999999) * (right - left), col0, diag, perm, diag, 0) << "\n";
-#endif
-    RealScalar shift = (k == actual_n - 1 || fMid > Literal(0)) ? left : right;
-
-    // measure everything relative to shift
-    Map<ArrayXr> diagShifted(m_workspace.data() + 4 * n, n);
-    diagShifted = diag - shift;
-
-    if (k != actual_n - 1) {
-      // check that after the shift, f(mid) is still negative:
-      RealScalar midShifted = (right - left) / RealScalar(2);
-      // we can test exact equality here, because shift comes from `... ? left : right`
-      if (numext::equal_strict(shift, right)) midShifted = -midShifted;
-      RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
-      if (fMidShifted > 0) {
-        // fMid was erroneous, fix it:
-        shift = fMidShifted > Literal(0) ? left : right;
-        diagShifted = diag - shift;
-      }
-    }
+  m_isTranspose = false;
 
-    // initial guess
-    RealScalar muPrev, muCur;
-    // we can test exact equality here, because shift comes from `... ? left : right`
-    if (numext::equal_strict(shift, left)) {
-      muPrev = (right - left) * RealScalar(0.1);
-      if (k == actual_n - 1)
-        muCur = right - left;
-      else
-        muCur = (right - left) * RealScalar(0.5);
-    } else {
-      muPrev = -(right - left) * RealScalar(0.1);
-      muCur = -(right - left) * RealScalar(0.5);
-    }
-
-    RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift);
-    RealScalar fCur = secularEq(muCur, col0, diag, perm, diagShifted, shift);
-    if (abs(fPrev) < abs(fCur)) {
-      swap(fPrev, fCur);
-      swap(muPrev, muCur);
-    }
-
-    // rational interpolation: fit a function of the form a / mu + b through the two previous
-    // iterates and use its zero to compute the next iterate
-    bool useBisection = fPrev * fCur > Literal(0);
-    while (!numext::is_exactly_zero(fCur) &&
-           abs(muCur - muPrev) >
-               Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) &&
-           abs(fCur - fPrev) > NumTraits<RealScalar>::epsilon() && !useBisection) {
-      ++m_numIters;
-
-      // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
-      RealScalar a = (fCur - fPrev) / (Literal(1) / muCur - Literal(1) / muPrev);
-      RealScalar b = fCur - a / muCur;
-      // And find mu such that f(mu)==0:
-      RealScalar muZero = -a / b;
-      RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      eigen_internal_assert((numext::isfinite)(fZero));
-#endif
-
-      muPrev = muCur;
-      fPrev = fCur;
-      muCur = muZero;
-      fCur = fZero;
-
-      // we can test exact equality here, because shift comes from `... ? left : right`
-      if (numext::equal_strict(shift, left) && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
-      if (numext::equal_strict(shift, right) && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
-      if (abs(fCur) > abs(fPrev)) useBisection = true;
-    }
-
-    // fall back on bisection method if rational interpolation did not work
-    if (useBisection) {
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      std::cout << "useBisection for k = " << k << ", actual_n = " << actual_n << "\n";
-#endif
-      RealScalar leftShifted, rightShifted;
-      // we can test exact equality here, because shift comes from `... ? left : right`
-      if (numext::equal_strict(shift, left)) {
-        // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)),
-        // the factor 2 is to be more conservative
-        leftShifted =
-            numext::maxi<RealScalar>((std::numeric_limits<RealScalar>::min)(),
-                                     Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits<RealScalar>::max)()));
-
-        // check that we did it right:
-        eigen_internal_assert(
-            (numext::isfinite)((col0(k) / leftShifted) * (col0(k) / (diag(k) + shift + leftShifted))));
-        // I don't understand why the case k==0 would be special there:
-        // if (k == 0) rightShifted = right - left; else
-        rightShifted = (k == actual_n - 1)
-                           ? right
-                           : ((right - left) * RealScalar(0.51));  // theoretically we can take 0.5, but let's be safe
-      } else {
-        leftShifted = -(right - left) * RealScalar(0.51);
-        if (k + 1 < n)
-          rightShifted = -numext::maxi<RealScalar>((std::numeric_limits<RealScalar>::min)(),
-                                                   abs(col0(k + 1)) / sqrt((std::numeric_limits<RealScalar>::max)()));
-        else
-          rightShifted = -(std::numeric_limits<RealScalar>::min)();
-      }
-
-      RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
-      eigen_internal_assert(fLeft < Literal(0));
-
-#if defined EIGEN_BDCSVD_DEBUG_VERBOSE || defined EIGEN_BDCSVD_SANITY_CHECKS || defined EIGEN_INTERNAL_DEBUGGING
-      RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
-#endif
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      if (!(numext::isfinite)(fLeft))
-        std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n";
-      eigen_internal_assert((numext::isfinite)(fLeft));
-
-      if (!(numext::isfinite)(fRight))
-        std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
-        // eigen_internal_assert((numext::isfinite)(fRight));
-#endif
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      if (!(fLeft * fRight < 0)) {
-        std::cout << "f(leftShifted) using  leftShifted=" << leftShifted
-                  << " ;  diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; "
-                  << "left==shift=" << bool(left == shift) << " ; left-shift = " << (left - shift) << "\n";
-        std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  "
-                  << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted
-                  << "], shift=" << shift << " ,  f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift)
-                  << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n";
-      }
-#endif
-      eigen_internal_assert(fLeft * fRight < Literal(0));
-
-      if (fLeft < Literal(0)) {
-        while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() *
-                                                numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted))) {
-          RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
-          fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
-          eigen_internal_assert((numext::isfinite)(fMid));
-
-          if (fLeft * fMid < Literal(0)) {
-            rightShifted = midShifted;
-          } else {
-            leftShifted = midShifted;
-            fLeft = fMid;
-          }
-        }
-        muCur = (leftShifted + rightShifted) / Literal(2);
-      } else {
-        // We have a problem as shifting on the left or right give either a positive or negative value
-        // at the middle of [left,right]...
-        // Instead of abbording or entering an infinite loop,
-        // let's just use the middle as the estimated zero-crossing:
-        muCur = (right - left) * RealScalar(0.5);
-        // we can test exact equality here, because shift comes from `... ? left : right`
-        if (numext::equal_strict(shift, right)) muCur = -muCur;
-      }
-    }
-
-    singVals[k] = shift + muCur;
-    shifts[k] = shift;
-    mus[k] = muCur;
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-    if (k + 1 < n)
-      std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. "
-                << diag(k + 1) << "\n";
-#endif
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-    eigen_internal_assert(k == 0 || singVals[k] >= singVals[k - 1]);
-    eigen_internal_assert(singVals[k] >= diag(k));
-#endif
-
-    // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
-    // (deflation is supposed to avoid this from happening)
-    // - this does no seem to be necessary anymore -
-    // if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
-    // if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
-  }
-}
-
-// zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
-                                              const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus,
-                                              ArrayRef zhat) {
-  using std::sqrt;
-  Index n = col0.size();
-  Index m = perm.size();
-  if (m == 0) {
-    zhat.setZero();
-    return;
-  }
-  Index lastIdx = perm(m - 1);
-  // The offset permits to skip deflated entries while computing zhat
-  for (Index k = 0; k < n; ++k) {
-    if (numext::is_exactly_zero(col0(k)))  // deflated
-      zhat(k) = Literal(0);
-    else {
-      // see equation (3.6)
-      RealScalar dk = diag(k);
-      RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk));
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      if (prod < 0) {
-        std::cout << "k = " << k << " ;  z(k)=" << col0(k) << ", diag(k)=" << dk << "\n";
-        std::cout << "prod = "
-                  << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx)
-                  << " - " << dk << "))"
-                  << "\n";
-        std::cout << "     = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n";
-      }
-      eigen_internal_assert(prod >= 0);
-#endif
-
-      for (Index l = 0; l < m; ++l) {
-        Index i = perm(l);
-        if (i != k) {
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-          if (i >= k && (l == 0 || l - 1 >= m)) {
-            std::cout << "Error in perturbCol0\n";
-            std::cout << "  " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k)
-                      << " " << diag(k) << " "
-                      << "\n";
-            std::cout << "  " << diag(i) << "\n";
-            Index j = (i < k /*|| l==0*/) ? i : perm(l - 1);
-            std::cout << "  "
-                      << "j=" << j << "\n";
-          }
-#endif
-          Index j = i < k ? i : l > 0 ? perm(l - 1) : i;
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-          if (!(dk != Literal(0) || diag(i) != Literal(0))) {
-            std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n";
-          }
-          eigen_internal_assert(dk != Literal(0) || diag(i) != Literal(0));
-#endif
-          prod *= ((singVals(j) + dk) / ((diag(i) + dk))) * ((mus(j) + (shifts(j) - dk)) / ((diag(i) - dk)));
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-          eigen_internal_assert(prod >= 0);
-#endif
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-          if (i != k &&
-              numext::abs(((singVals(j) + dk) * (mus(j) + (shifts(j) - dk))) / ((diag(i) + dk) * (diag(i) - dk)) - 1) >
-                  0.9)
-            std::cout << "     "
-                      << ((singVals(j) + dk) * (mus(j) + (shifts(j) - dk))) / ((diag(i) + dk) * (diag(i) - dk))
-                      << " == (" << (singVals(j) + dk) << " * " << (mus(j) + (shifts(j) - dk)) << ") / ("
-                      << (diag(i) + dk) << " * " << (diag(i) - dk) << ")\n";
-#endif
-        }
-      }
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(lastIdx) + dk) << " * "
-                << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n";
-#endif
-      RealScalar tmp = sqrt(prod);
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      eigen_internal_assert((numext::isfinite)(tmp));
-#endif
-      zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
-    }
+  //**** Run D&C.
+  m_impl.divide(0, n - 1, 0, 0, 0);
+  m_info = m_impl.info();
+  m_numIters = m_impl.numIters();
+  if (m_info != Success && m_info != NoConvergence) {
+    m_isInitialized = true;
+    return *this;
   }
-}
 
-// compute singular vectors
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm,
-                                                  const VectorType& singVals, const ArrayRef& shifts,
-                                                  const ArrayRef& mus, MatrixXr& U, MatrixXr& V) {
-  Index n = zhat.size();
-  Index m = perm.size();
-
-  for (Index k = 0; k < n; ++k) {
-    if (numext::is_exactly_zero(zhat(k))) {
-      U.col(k) = VectorType::Unit(n + 1, k);
-      if (m_compV) V.col(k) = VectorType::Unit(n, k);
-    } else {
-      U.col(k).setZero();
-      for (Index l = 0; l < m; ++l) {
-        Index i = perm(l);
-        U(i, k) = zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
-      }
-      U(n, k) = Literal(0);
-      U.col(k).normalize();
-
-      if (m_compV) {
-        V.col(k).setZero();
-        for (Index l = 1; l < m; ++l) {
-          Index i = perm(l);
-          V(i, k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
-        }
-        V(0, k) = Literal(-1);
-        V.col(k).normalize();
-      }
+  //**** Extract singular values.
+  for (int i = 0; i < diagSize(); i++) {
+    RealScalar a = abs(m_impl.computed().coeff(i, i));
+    m_singularValues.coeffRef(i) = a * scale;
+    if (a < considerZero) {
+      m_nonzeroSingularValues = i;
+      m_singularValues.tail(diagSize() - i - 1).setZero();
+      break;
+    } else if (i == diagSize() - 1) {
+      m_nonzeroSingularValues = i + 1;
+      break;
     }
   }
-  U.col(n) = VectorType::Unit(n + 1, n);
-}
-
-// page 12_13
-// i >= 1, di almost null and zi non null.
-// We use a rotation to zero out zi applied to the left of M, and set di = 0.
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::deflation43(Index firstCol, Index shift, Index i, Index size) {
-  using std::abs;
-  using std::pow;
-  using std::sqrt;
-  Index start = firstCol + shift;
-  RealScalar c = m_computed(start, start);
-  RealScalar s = m_computed(start + i, start);
-  RealScalar r = numext::hypot(c, s);
-  if (numext::is_exactly_zero(r)) {
-    m_computed(start + i, start + i) = Literal(0);
-    return;
-  }
-  m_computed(start, start) = r;
-  m_computed(start + i, start) = Literal(0);
-  m_computed(start + i, start + i) = Literal(0);
-
-  JacobiRotation<RealScalar> J(c / r, -s / r);
-  if (m_compU)
-    m_naiveU.middleRows(firstCol, size + 1).applyOnTheRight(firstCol, firstCol + i, J);
-  else
-    m_naiveU.applyOnTheRight(firstCol, firstCol + i, J);
-}  // end deflation 43
-
-// page 13
-// i,j >= 1, i > j, and |di - dj| < epsilon * norm2(M)
-// We apply two rotations to have zi = 0, and dj = di.
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::deflation44(Index firstColu, Index firstColm, Index firstRowW, Index firstColW,
-                                              Index i, Index j, Index size) {
-  using std::abs;
-  using std::conj;
-  using std::pow;
-  using std::sqrt;
-
-  RealScalar s = m_computed(firstColm + i, firstColm);
-  RealScalar c = m_computed(firstColm + j, firstColm);
-  RealScalar r = numext::hypot(c, s);
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "deflation 4.4: " << i << "," << j << " -> " << c << " " << s << " " << r << " ; "
-            << m_computed(firstColm + i - 1, firstColm) << " " << m_computed(firstColm + i, firstColm) << " "
-            << m_computed(firstColm + i + 1, firstColm) << " " << m_computed(firstColm + i + 2, firstColm) << "\n";
-  std::cout << m_computed(firstColm + i - 1, firstColm + i - 1) << " " << m_computed(firstColm + i, firstColm + i)
-            << " " << m_computed(firstColm + i + 1, firstColm + i + 1) << " "
-            << m_computed(firstColm + i + 2, firstColm + i + 2) << "\n";
-#endif
-  if (numext::is_exactly_zero(r)) {
-    m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
-    return;
-  }
-  c /= r;
-  s /= r;
-  m_computed(firstColm + j, firstColm) = r;
-  m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
-  m_computed(firstColm + i, firstColm) = Literal(0);
-
-  JacobiRotation<RealScalar> J(c, -s);
-  if (m_compU)
-    m_naiveU.middleRows(firstColu, size + 1).applyOnTheRight(firstColu + j, firstColu + i, J);
-  else
-    m_naiveU.applyOnTheRight(firstColu + j, firstColu + i, J);
-  if (m_compV) m_naiveV.middleRows(firstRowW, size).applyOnTheRight(firstColW + j, firstColW + i, J);
-}  // end deflation 44
-
-// acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
-template <typename MatrixType, int Options>
-void BDCSVD<MatrixType, Options>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW,
-                                            Index shift) {
-  using std::abs;
-  using std::sqrt;
-  const Index length = lastCol + 1 - firstCol;
-
-  Block<MatrixXr, Dynamic, 1> col0(m_computed, firstCol + shift, firstCol + shift, length, 1);
-  Diagonal<MatrixXr> fulldiag(m_computed);
-  VectorBlock<Diagonal<MatrixXr>, Dynamic> diag(fulldiag, firstCol + shift, length);
-
-  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
-  RealScalar maxDiag = diag.tail((std::max)(Index(1), length - 1)).cwiseAbs().maxCoeff();
-  RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero, NumTraits<RealScalar>::epsilon() * maxDiag);
-  RealScalar epsilon_coarse =
-      Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(m_naiveU.allFinite());
-  eigen_internal_assert(m_naiveV.allFinite());
-  eigen_internal_assert(m_computed.allFinite());
-#endif
-
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "\ndeflate:" << diag.head(k + 1).transpose() << "  |  "
-            << diag.segment(k + 1, length - k - 1).transpose() << "\n";
-#endif
-
-  // condition 4.1
-  if (diag(0) < epsilon_coarse) {
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-    std::cout << "deflation 4.1, because " << diag(0) << " < " << epsilon_coarse << "\n";
-#endif
-    diag(0) = epsilon_coarse;
-  }
-
-  // condition 4.2
-  for (Index i = 1; i < length; ++i)
-    if (abs(col0(i)) < epsilon_strict) {
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict
-                << "  (diag(" << i << ")=" << diag(i) << ")\n";
-#endif
-      col0(i) = Literal(0);
-    }
-
-  // condition 4.3
-  for (Index i = 1; i < length; i++)
-    if (diag(i) < epsilon_coarse) {
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      std::cout << "deflation 4.3, cancel z(" << i << ")=" << col0(i) << " because diag(" << i << ")=" << diag(i)
-                << " < " << epsilon_coarse << "\n";
-#endif
-      deflation43(firstCol, shift, i, length);
-    }
 
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(m_naiveU.allFinite());
-  eigen_internal_assert(m_naiveV.allFinite());
-  eigen_internal_assert(m_computed.allFinite());
-#endif
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "to be sorted: " << diag.transpose() << "\n\n";
-  std::cout << "            : " << col0.transpose() << "\n\n";
-#endif
-  {
-    // Check for total deflation:
-    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting.
-    const bool total_deflation = (col0.tail(length - 1).array().abs() < considerZero).all();
-
-    // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
-    // First, compute the respective permutation.
-    Index* permutation = m_workspaceI.data();
-    {
-      permutation[0] = 0;
-      Index p = 1;
-
-      // Move deflated diagonal entries at the end.
-      for (Index i = 1; i < length; ++i)
-        if (diag(i) < considerZero) permutation[p++] = i;
-
-      Index i = 1, j = k + 1;
-      for (; p < length; ++p) {
-        if (i > k)
-          permutation[p] = j++;
-        else if (j >= length)
-          permutation[p] = i++;
-        else if (diag(i) < diag(j))
-          permutation[p] = j++;
-        else
-          permutation[p] = i++;
-      }
-    }
-
-    // If we have a total deflation, then we have to insert diag(0) at the right place
-    if (total_deflation) {
-      for (Index i = 1; i < length; ++i) {
-        Index pi = permutation[i];
-        if (diag(pi) < considerZero || diag(0) < diag(pi))
-          permutation[i - 1] = permutation[i];
-        else {
-          permutation[i - 1] = 0;
-          break;
-        }
-      }
-    }
-
-    // Current index of each col, and current column of each index
-    Index* realInd = m_workspaceI.data() + length;
-    Index* realCol = m_workspaceI.data() + 2 * length;
-
-    for (int pos = 0; pos < length; pos++) {
-      realCol[pos] = pos;
-      realInd[pos] = pos;
-    }
-
-    for (Index i = total_deflation ? 0 : 1; i < length; i++) {
-      const Index pi = permutation[length - (total_deflation ? i + 1 : i)];
-      const Index J = realCol[pi];
-
-      using std::swap;
-      // swap diagonal and first column entries:
-      swap(diag(i), diag(J));
-      if (i != 0 && J != 0) swap(col0(i), col0(J));
-
-      // change columns
-      if (m_compU)
-        m_naiveU.col(firstCol + i)
-            .segment(firstCol, length + 1)
-            .swap(m_naiveU.col(firstCol + J).segment(firstCol, length + 1));
-      else
-        m_naiveU.col(firstCol + i).segment(0, 2).swap(m_naiveU.col(firstCol + J).segment(0, 2));
-      if (m_compV)
-        m_naiveV.col(firstColW + i)
-            .segment(firstRowW, length)
-            .swap(m_naiveV.col(firstColW + J).segment(firstRowW, length));
-
-      // update real pos
-      const Index realI = realInd[i];
-      realCol[realI] = J;
-      realCol[pi] = i;
-      realInd[J] = realI;
-      realInd[i] = pi;
-    }
+  //**** Copy U and V directly (no Householder to apply).
+  // D&C computes B^T = naiveU * S * naiveV^T, so B = naiveV * S * naiveU^T.
+  // Thus U_of_B = naiveV, V_of_B = naiveU.
+  if (computeU()) {
+    Index Ucols = m_computeThinU ? diagSize() : rows();
+    m_matrixU = MatrixX::Identity(rows(), Ucols);
+    m_matrixU.topLeftCorner(diagSize(), diagSize()) =
+        m_impl.naiveV().template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
   }
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-  std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";
-  std::cout << "      : " << col0.transpose() << "\n\n";
-#endif
-
-  // condition 4.4
-  {
-    Index i = length - 1;
-    // Find last non-deflated entry.
-    while (i > 0 && (diag(i) < considerZero || abs(col0(i)) < considerZero)) --i;
-
-    for (; i > 1; --i)
-      if ((diag(i) - diag(i - 1)) < epsilon_strict) {
-#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-        std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i - 1)
-                  << " == " << (diag(i) - diag(i - 1)) << " < " << epsilon_strict << "\n";
-#endif
-        eigen_internal_assert(abs(diag(i) - diag(i - 1)) < epsilon_coarse &&
-                              " diagonal entries are not properly sorted");
-        deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i, i - 1, length);
-      }
+  if (computeV()) {
+    Index Vcols = m_computeThinV ? diagSize() : cols();
+    m_matrixV = MatrixX::Identity(cols(), Vcols);
+    m_matrixV.topLeftCorner(diagSize(), diagSize()) =
+        m_impl.naiveU().template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
   }
 
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  for (Index j = 2; j < length; ++j) eigen_internal_assert(diag(j - 1) <= diag(j) || abs(diag(j)) < considerZero);
-#endif
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  eigen_internal_assert(m_naiveU.allFinite());
-  eigen_internal_assert(m_naiveV.allFinite());
-  eigen_internal_assert(m_computed.allFinite());
-#endif
-}  // end deflation
+  m_isInitialized = true;
+  return *this;
+}
 
 /** \svd_module
  *
diff --git a/Eigen/src/SVD/BDCSVDImpl.h b/Eigen/src/SVD/BDCSVDImpl.h
new file mode 100644
index 00000000000..19444e1786c
--- /dev/null
+++ b/Eigen/src/SVD/BDCSVDImpl.h
@@ -0,0 +1,821 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
+// research report written by Ming Gu and Stanley C.Eisenstat
+// The code variable names correspond to the names they used in their
+// report
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+// Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2014-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BDCSVD_IMPL_H
+#define EIGEN_BDCSVD_IMPL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * Implementation of the divide-and-conquer phase of BDCSVD.
+ *
+ * Templated only on RealScalar so that all BDCSVD instantiations sharing the same
+ * RealScalar (e.g. BDCSVD<MatrixXd, ComputeThinU|ComputeThinV> and
+ * BDCSVD<MatrixXd, ComputeFullU|ComputeFullV>, or BDCSVD<MatrixXcd> and
+ * BDCSVD<MatrixXd>) share a single copy of the ~950 lines of D&C code.
+ */
+template <typename RealScalar_>
+class bdcsvd_impl {
+ public:
+  typedef RealScalar_ RealScalar;
+  typedef typename NumTraits<RealScalar>::Literal Literal;
+  typedef Matrix<RealScalar, Dynamic, Dynamic, ColMajor> MatrixXr;
+  typedef Matrix<RealScalar, Dynamic, 1> VectorType;
+  typedef Array<RealScalar, Dynamic, 1> ArrayXr;
+  typedef Array<Index, 1, Dynamic> ArrayXi;
+  typedef Ref<ArrayXr> ArrayRef;
+  typedef Ref<ArrayXi> IndicesRef;
+
+  bdcsvd_impl() : m_algoswap(16), m_compU(false), m_compV(false), m_numIters(0), m_info(Success) {}
+
+  void allocate(Index diagSize, bool compU, bool compV);
+
+  /** Entry point for the divide-and-conquer phase. */
+  void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
+
+  MatrixXr& naiveU() { return m_naiveU; }
+  const MatrixXr& naiveU() const { return m_naiveU; }
+  MatrixXr& naiveV() { return m_naiveV; }
+  const MatrixXr& naiveV() const { return m_naiveV; }
+  MatrixXr& computed() { return m_computed; }
+  const MatrixXr& computed() const { return m_computed; }
+  ComputationInfo info() const { return m_info; }
+  int numIters() const { return m_numIters; }
+  int algoSwap() const { return m_algoswap; }
+  void setAlgoSwap(int s) { m_algoswap = s; }
+
+ private:
+  void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
+  void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals,
+                       ArrayRef shifts, ArrayRef mus);
+  void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals,
+                   const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat);
+  void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals,
+                       const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V);
+  void deflation43(Index firstCol, Index shift, Index i, Index size);
+  void deflation44(Index firstColu, Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
+  void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
+  void structured_update(Block<MatrixXr, Dynamic, Dynamic> A, const MatrixXr& B, Index n1);
+  static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
+                              const ArrayRef& diagShifted, RealScalar shift);
+  template <typename SVDType>
+  void computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW, Index firstColW, Index shift);
+
+  MatrixXr m_naiveU, m_naiveV;
+  MatrixXr m_computed;
+  ArrayXr m_workspace;
+  ArrayXi m_workspaceI;
+  // Reused base-case JacobiSVDs (one per option set) so that recursive divide()
+  // calls don't reallocate JacobiSVD's internal U/V/sigma buffers each time.
+  JacobiSVD<MatrixXr, ComputeFullU> m_baseSvdU;
+  JacobiSVD<MatrixXr, ComputeFullU | ComputeFullV> m_baseSvdUV;
+  int m_algoswap;
+  bool m_compU, m_compV;
+  int m_numIters;
+  ComputationInfo m_info;
+};
+
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::allocate(Index diagSize, bool compU, bool compV) {
+  m_compU = compU;
+  m_compV = compV;
+  m_numIters = 0;
+  m_info = Success;
+
+  m_computed = MatrixXr::Zero(diagSize + 1, diagSize);
+
+  if (m_compU)
+    m_naiveU = MatrixXr::Zero(diagSize + 1, diagSize + 1);
+  else
+    m_naiveU = MatrixXr::Zero(2, diagSize + 1);
+
+  if (m_compV) m_naiveV = MatrixXr::Zero(diagSize, diagSize);
+
+  m_workspace.resize((diagSize + 1) * (diagSize + 1) * 3);
+  m_workspaceI.resize(3 * diagSize);
+}
+
+/** \internal
+ * Performs A = A * B exploiting the special structure of the matrix A. Splitting A as:
+ *  A = [A1]
+ *      [A2]
+ * such that A1.rows()==n1, then we assume that at least half of the columns of A1 and A2 are zeros.
+ * We can thus pack them prior to the matrix product. However, this is only worth the effort if the matrix is large
+ * enough.
+ */
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::structured_update(Block<MatrixXr, Dynamic, Dynamic> A, const MatrixXr& B, Index n1) {
+  Index n = A.rows();
+  if (n > 100) {
+    // If the matrices are large enough, let's exploit the sparse structure of A by
+    // splitting it in half (wrt n1), and packing the non-zero columns.
+    Index n2 = n - n1;
+    Map<MatrixXr> A1(m_workspace.data(), n1, n);
+    Map<MatrixXr> A2(m_workspace.data() + n1 * n, n2, n);
+    Map<MatrixXr> B1(m_workspace.data() + n * n, n, n);
+    Map<MatrixXr> B2(m_workspace.data() + 2 * n * n, n, n);
+    Index k1 = 0, k2 = 0;
+    for (Index j = 0; j < n; ++j) {
+      if ((A.col(j).head(n1).array() != Literal(0)).any()) {
+        A1.col(k1) = A.col(j).head(n1);
+        B1.row(k1) = B.row(j);
+        ++k1;
+      }
+      if ((A.col(j).tail(n2).array() != Literal(0)).any()) {
+        A2.col(k2) = A.col(j).tail(n2);
+        B2.row(k2) = B.row(j);
+        ++k2;
+      }
+    }
+
+    A.topRows(n1).noalias() = A1.leftCols(k1) * B1.topRows(k1);
+    A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
+  } else {
+    Map<MatrixXr, Aligned> tmp(m_workspace.data(), n, n);
+    tmp.noalias() = A * B;
+    A = tmp;
+  }
+}
+
+template <typename RealScalar_>
+template <typename SVDType>
+void bdcsvd_impl<RealScalar_>::computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW, Index firstColW,
+                                               Index shift) {
+  svd.compute(m_computed.block(firstCol, firstCol, n + 1, n));
+  m_info = svd.info();
+  if (m_info != Success && m_info != NoConvergence) return;
+  if (m_compU)
+    m_naiveU.block(firstCol, firstCol, n + 1, n + 1) = svd.matrixU();
+  else {
+    m_naiveU.row(0).segment(firstCol, n + 1) = svd.matrixU().row(0);
+    m_naiveU.row(1).segment(firstCol, n + 1) = svd.matrixU().row(n);
+  }
+  if (m_compV) m_naiveV.block(firstRowW, firstColW, n, n) = svd.matrixV();
+  m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero();
+  m_computed.diagonal().segment(firstCol + shift, n) = svd.singularValues().head(n);
+}
+
+// The divide algorithm is done "in place": we are always working on subsets of the same matrix. The divide method
+// takes as arguments the location of the submatrix we are currently working on.
+
+//@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
+//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU;
+// lastCol + 1 - firstCol is the size of the submatrix.
+//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section
+// 1 for more information on W)
+//@param firstColW : Same as firstRowW with the column.
+//@param shift : Each time one takes the left submatrix, one must add 1 to the shift, because we actually want the
+// last column of the U submatrix
+// to become the first column (*coeff) and to shift all the other columns to the right. There are more details in the
+// reference paper.
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) {
+  // requires rows = cols + 1;
+  using std::abs;
+  using std::sqrt;
+  const Index n = lastCol - firstCol + 1;
+  const Index k = n / 2;
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar alphaK;
+  RealScalar betaK;
+  RealScalar r0;
+  RealScalar lambda, phi, c0, s0;
+  VectorType l, f;
+  // We use the other algorithm which is more efficient for small
+  // matrices.
+  if (n < m_algoswap) {
+    if (m_compV) {
+      computeBaseCase(m_baseSvdUV, n, firstCol, firstRowW, firstColW, shift);
+    } else {
+      computeBaseCase(m_baseSvdU, n, firstCol, firstRowW, firstColW, shift);
+    }
+    return;
+  }
+  // We use the divide and conquer algorithm
+  alphaK = m_computed(firstCol + k, firstCol + k);
+  betaK = m_computed(firstCol + k + 1, firstCol + k);
+  // The two divides must be done in this order in order to have good results: divide() changes the data inside the
+  // submatrices, and the divide of the right submatrix reads one column of the left submatrix. That's why we need to
+  // treat the right submatrix before the left one.
+  divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
+  if (m_info != Success && m_info != NoConvergence) return;
+  divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
+  if (m_info != Success && m_info != NoConvergence) return;
+
+  if (m_compU) {
+    lambda = m_naiveU(firstCol + k, firstCol + k);
+    phi = m_naiveU(firstCol + k + 1, lastCol + 1);
+  } else {
+    lambda = m_naiveU(1, firstCol + k);
+    phi = m_naiveU(0, lastCol + 1);
+  }
+  r0 = sqrt((abs(alphaK * lambda) * abs(alphaK * lambda)) + abs(betaK * phi) * abs(betaK * phi));
+  if (m_compU) {
+    l = m_naiveU.row(firstCol + k).segment(firstCol, k);
+    f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
+  } else {
+    l = m_naiveU.row(1).segment(firstCol, k);
+    f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
+  }
+  if (m_compV) m_naiveV(firstRowW + k, firstColW) = Literal(1);
+  if (r0 < considerZero) {
+    c0 = Literal(1);
+    s0 = Literal(0);
+  } else {
+    c0 = alphaK * lambda / r0;
+    s0 = betaK * phi / r0;
+  }
+
+  if (m_compU) {
+    MatrixXr q1(m_naiveU.col(firstCol + k).segment(firstCol, k + 1));
+    // we shift Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--)
+      m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1);
+    // we shift q1 at the left with a factor c0
+    m_naiveU.col(firstCol).segment(firstCol, k + 1) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * (-s0));
+    // first column = q2 * s0
+    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) =
+        m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0;
+    // q2 *= c0
+    m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0;
+  } else {
+    RealScalar q1 = m_naiveU(0, firstCol + k);
+    // we shift Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) m_naiveU(0, i + 1) = m_naiveU(0, i);
+    // we shift q1 at the left with a factor c0
+    m_naiveU(0, firstCol) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU(0, lastCol + 1) = (q1 * (-s0));
+    // first column = q2 * s0
+    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) * s0;
+    // q2 *= c0
+    m_naiveU(1, lastCol + 1) *= c0;
+    m_naiveU.row(1).segment(firstCol + 1, k).setZero();
+    m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
+  }
+
+  m_computed(firstCol + shift, firstCol + shift) = r0;
+  m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose();
+  m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose();
+
+  // Second part: try to deflate singular values in combined matrix
+  deflation(firstCol, lastCol, k, firstRowW, firstColW, shift);
+
+  // Third part: compute SVD of combined matrix
+  MatrixXr UofSVD, VofSVD;
+  VectorType singVals;
+  computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD);
+
+  if (m_compU)
+    structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n + 2) / 2);
+  else {
+    Map<Matrix<RealScalar, 2, Dynamic>, Aligned> tmp(m_workspace.data(), 2, n + 1);
+    tmp.noalias() = m_naiveU.middleCols(firstCol, n + 1) * UofSVD;
+    m_naiveU.middleCols(firstCol, n + 1) = tmp;
+  }
+
+  if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n + 1) / 2);
+
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero();
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals;
+}  // end divide
+
+// Compute SVD of m_computed.block(firstCol, firstCol, n + 1, n); this block only has non-zeros in
+// the first column and on the diagonal and has undergone deflation, so diagonal is in increasing
+// order except for possibly the (0,0) entry. The computed SVD is stored in U, singVals and V, except
+// that if m_compV is false, then V is not computed. Singular values are sorted in decreasing order.
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) {
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  using std::abs;
+  ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
+  m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
+  ArrayRef diag = m_workspace.head(n);
+  diag(0) = Literal(0);
+
+  // Allocate space for singular values and vectors
+  singVals.resize(n);
+  U.resize(n + 1, n + 1);
+  if (m_compV) V.resize(n, n);
+
+  // Many singular values might have been deflated, the zero ones have been moved to the end,
+  // but others are interleaved and we must ignore them at this stage.
+  // To this end, let's compute a permutation skipping them:
+  Index actual_n = n;
+  while (actual_n > 1 && numext::is_exactly_zero(diag(actual_n - 1))) {
+    --actual_n;
+    eigen_internal_assert(numext::is_exactly_zero(col0(actual_n)));
+  }
+  Index m = 0;  // size of the deflated problem
+  for (Index k = 0; k < actual_n; ++k)
+    if (abs(col0(k)) > considerZero) m_workspaceI(m++) = k;
+  Map<ArrayXi> perm(m_workspaceI.data(), m);
+
+  Map<ArrayXr> shifts(m_workspace.data() + 1 * n, n);
+  Map<ArrayXr> mus(m_workspace.data() + 2 * n, n);
+  Map<ArrayXr> zhat(m_workspace.data() + 3 * n, n);
+
+  // Compute singVals, shifts, and mus
+  computeSingVals(col0, diag, perm, singVals, shifts, mus);
+
+  // Compute zhat
+  perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat);
+
+  computeSingVecs(zhat, diag, perm, singVals, shifts, mus, U, V);
+
+  // Because of deflation, the singular values might not be completely sorted.
+  // Fortunately, reordering them is a O(n) problem
+  for (Index i = 0; i < actual_n - 1; ++i) {
+    if (singVals(i) > singVals(i + 1)) {
+      using std::swap;
+      swap(singVals(i), singVals(i + 1));
+      U.col(i).swap(U.col(i + 1));
+      if (m_compV) V.col(i).swap(V.col(i + 1));
+    }
+  }
+
+  // Reverse the order so that the singular values end up in decreasing order.
+  // Because of deflation, the zero singular values are already at the end.
+  singVals.head(actual_n).reverseInPlace();
+  U.leftCols(actual_n).rowwise().reverseInPlace();
+  if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
+}
+
+template <typename RealScalar_>
+typename bdcsvd_impl<RealScalar_>::RealScalar bdcsvd_impl<RealScalar_>::secularEq(RealScalar mu, const ArrayRef& col0,
+                                                                                  const ArrayRef& diag,
+                                                                                  const IndicesRef& perm,
+                                                                                  const ArrayRef& diagShifted,
+                                                                                  RealScalar shift) {
+  Index m = perm.size();
+  RealScalar res = Literal(1);
+  for (Index i = 0; i < m; ++i) {
+    Index j = perm(i);
+    // The following expression could be rewritten to involve only a single division,
+    // but this would make the expression more sensitive to overflow.
+    res += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu));
+  }
+  return res;
+}
+
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
+                                               VectorType& singVals, ArrayRef shifts, ArrayRef mus) {
+  using std::abs;
+  using std::sqrt;
+  using std::swap;
+
+  Index n = col0.size();
+  Index actual_n = n;
+  // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above
+  // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value.
+  while (actual_n > 1 && numext::is_exactly_zero(col0(actual_n - 1))) --actual_n;
+
+  for (Index k = 0; k < n; ++k) {
+    if (numext::is_exactly_zero(col0(k)) || actual_n == 1) {
+      // if col0(k) == 0, then entry is deflated, so singular value is on diagonal
+      // if actual_n==1, then the deflated problem is already diagonalized
+      singVals(k) = k == 0 ? col0(0) : diag(k);
+      mus(k) = Literal(0);
+      shifts(k) = k == 0 ? col0(0) : diag(k);
+      continue;
+    }
+
+    // otherwise, use secular equation to find singular value
+    RealScalar left = diag(k);
+    RealScalar right;  // was: = (k != actual_n-1) ? diag(k+1) : (diag(actual_n-1) + col0.matrix().norm());
+    if (k == actual_n - 1)
+      right = (diag(actual_n - 1) + col0.matrix().stableNorm());
+    else {
+      // Skip deflated singular values,
+      // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside.
+      // This should be equivalent to using perm[]
+      Index l = k + 1;
+      while (numext::is_exactly_zero(col0(l))) {
+        ++l;
+        eigen_internal_assert(l < actual_n);
+      }
+      right = diag(l);
+    }
+
+    // first decide whether it's closer to the left end or the right end
+    RealScalar mid = left + (right - left) / Literal(2);
+    RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
+    RealScalar shift = (k == actual_n - 1 || fMid > Literal(0)) ? left : right;
+
+    // measure everything relative to shift
+    Map<ArrayXr> diagShifted(m_workspace.data() + 4 * n, n);
+    diagShifted = diag - shift;
+
+    if (k != actual_n - 1) {
+      // check that after the shift, f(mid) is still negative:
+      RealScalar midShifted = (right - left) / RealScalar(2);
+      // we can test exact equality here, because shift comes from `... ? left : right`
+      if (numext::equal_strict(shift, right)) midShifted = -midShifted;
+      RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+      if (fMidShifted > 0) {
+        // fMid was erroneous, fix it:
+        shift = fMidShifted > Literal(0) ? left : right;
+        diagShifted = diag - shift;
+      }
+    }
+
+    // initial guess
+    RealScalar muPrev, muCur;
+    // we can test exact equality here, because shift comes from `... ? left : right`
+    if (numext::equal_strict(shift, left)) {
+      muPrev = (right - left) * RealScalar(0.1);
+      if (k == actual_n - 1)
+        muCur = right - left;
+      else
+        muCur = (right - left) * RealScalar(0.5);
+    } else {
+      muPrev = -(right - left) * RealScalar(0.1);
+      muCur = -(right - left) * RealScalar(0.5);
+    }
+
+    RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift);
+    RealScalar fCur = secularEq(muCur, col0, diag, perm, diagShifted, shift);
+    if (abs(fPrev) < abs(fCur)) {
+      swap(fPrev, fCur);
+      swap(muPrev, muCur);
+    }
+
+    // rational interpolation: fit a function of the form a / mu + b through the two previous
+    // iterates and use its zero to compute the next iterate
+    bool useBisection = fPrev * fCur > Literal(0);
+    while (!numext::is_exactly_zero(fCur) &&
+           abs(muCur - muPrev) >
+               Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) &&
+           abs(fCur - fPrev) > NumTraits<RealScalar>::epsilon() && !useBisection) {
+      ++m_numIters;
+
+      // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
+      RealScalar a = (fCur - fPrev) / (Literal(1) / muCur - Literal(1) / muPrev);
+      RealScalar b = fCur - a / muCur;
+      // And find mu such that f(mu)==0:
+      RealScalar muZero = -a / b;
+      RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
+
+      muPrev = muCur;
+      fPrev = fCur;
+      muCur = muZero;
+      fCur = fZero;
+
+      // we can test exact equality here, because shift comes from `... ? left : right`
+      if (numext::equal_strict(shift, left) && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
+      if (numext::equal_strict(shift, right) && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
+      if (abs(fCur) > abs(fPrev)) useBisection = true;
+    }
+
+    // fall back on bisection method if rational interpolation did not work
+    if (useBisection) {
+      RealScalar leftShifted, rightShifted;
+      // we can test exact equality here, because shift comes from `... ? left : right`
+      if (numext::equal_strict(shift, left)) {
+        // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)),
+        // the factor 2 is to be more conservative
+        leftShifted =
+            numext::maxi<RealScalar>((std::numeric_limits<RealScalar>::min)(),
+                                     Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits<RealScalar>::max)()));
+
+        // check that we did it right:
+        eigen_internal_assert(
+            (numext::isfinite)((col0(k) / leftShifted) * (col0(k) / (diag(k) + shift + leftShifted))));
+        rightShifted = (k == actual_n - 1)
+                           ? right
+                           : ((right - left) * RealScalar(0.51));  // theoretically we can take 0.5, but let's be safe
+      } else {
+        leftShifted = -(right - left) * RealScalar(0.51);
+        if (k + 1 < n)
+          rightShifted = -numext::maxi<RealScalar>((std::numeric_limits<RealScalar>::min)(),
+                                                   abs(col0(k + 1)) / sqrt((std::numeric_limits<RealScalar>::max)()));
+        else
+          rightShifted = -(std::numeric_limits<RealScalar>::min)();
+      }
+      RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
+      eigen_internal_assert(fLeft < Literal(0));
+
+      if (fLeft < Literal(0)) {
+        while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() *
+                                                numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted))) {
+          RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
+          fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+          eigen_internal_assert((numext::isfinite)(fMid));
+
+          if (fLeft * fMid < Literal(0)) {
+            rightShifted = midShifted;
+          } else {
+            leftShifted = midShifted;
+            fLeft = fMid;
+          }
+        }
+        muCur = (leftShifted + rightShifted) / Literal(2);
+      } else {
+        // We have a problem, as shifting on the left or on the right gives either a positive or a negative value
+        // at the middle of [left,right]...
+        // Instead of aborting or entering an infinite loop,
+        // let's just use the middle as the estimated zero-crossing:
+        muCur = (right - left) * RealScalar(0.5);
+        // we can test exact equality here, because shift comes from `... ? left : right`
+        if (numext::equal_strict(shift, right)) muCur = -muCur;
+      }
+    }
+
+    singVals[k] = shift + muCur;
+    shifts[k] = shift;
+    mus[k] = muCur;
+  }
+}
+
+// zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
+                                           const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus,
+                                           ArrayRef zhat) {
+  using std::sqrt;
+  Index n = col0.size();
+  Index m = perm.size();
+  if (m == 0) {
+    zhat.setZero();
+    return;
+  }
+  Index lastIdx = perm(m - 1);
+  // Deflated entries (col0(k) == 0) get zhat(k) = 0; all other entries are computed from equation (3.6).
+  for (Index k = 0; k < n; ++k) {
+    if (numext::is_exactly_zero(col0(k)))  // deflated
+      zhat(k) = Literal(0);
+    else {
+      // see equation (3.6)
+      RealScalar dk = diag(k);
+      RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk));
+
+      for (Index l = 0; l < m; ++l) {
+        Index i = perm(l);
+        if (i != k) {
+          // There is no valid predecessor when the first active index is already on the
+          // right of k. Treat this as a numerical issue and zero the product.
+          if (i >= k && l == 0) {
+            m_info = NumericalIssue;
+            prod = Literal(0);
+            break;
+          }
+          Index j = i < k ? i : perm(l - 1);
+          prod *= ((singVals(j) + dk) / ((diag(i) + dk))) * ((mus(j) + (shifts(j) - dk)) / ((diag(i) - dk)));
+        }
+      }
+      RealScalar tmp = sqrt(prod);
+      zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
+    }
+  }
+}
+
+// compute singular vectors
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm,
+                                               const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus,
+                                               MatrixXr& U, MatrixXr& V) {
+  Index n = zhat.size();
+  Index m = perm.size();
+
+  for (Index k = 0; k < n; ++k) {
+    if (numext::is_exactly_zero(zhat(k))) {
+      U.col(k) = VectorType::Unit(n + 1, k);
+      if (m_compV) V.col(k) = VectorType::Unit(n, k);
+    } else {
+      U.col(k).setZero();
+      for (Index l = 0; l < m; ++l) {
+        Index i = perm(l);
+        U(i, k) = zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
+      }
+      U(n, k) = Literal(0);
+      U.col(k).normalize();
+
+      if (m_compV) {
+        V.col(k).setZero();
+        for (Index l = 1; l < m; ++l) {
+          Index i = perm(l);
+          V(i, k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
+        }
+        V(0, k) = Literal(-1);
+        V.col(k).normalize();
+      }
+    }
+  }
+  U.col(n) = VectorType::Unit(n + 1, n);
+}
+
+// page 12_13
+// i >= 1, di almost null and zi non null.
+// We use a rotation to zero out zi applied to the left of M, and set di = 0.
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::deflation43(Index firstCol, Index shift, Index i, Index size) {
+  using std::abs;
+  using std::sqrt;
+  Index start = firstCol + shift;
+  RealScalar c = m_computed(start, start);
+  RealScalar s = m_computed(start + i, start);
+  RealScalar r = numext::hypot(c, s);
+  if (numext::is_exactly_zero(r)) {
+    m_computed(start + i, start + i) = Literal(0);
+    return;
+  }
+  m_computed(start, start) = r;
+  m_computed(start + i, start) = Literal(0);
+  m_computed(start + i, start + i) = Literal(0);
+
+  JacobiRotation<RealScalar> J(c / r, -s / r);
+  if (m_compU)
+    m_naiveU.middleRows(firstCol, size + 1).applyOnTheRight(firstCol, firstCol + i, J);
+  else
+    m_naiveU.applyOnTheRight(firstCol, firstCol + i, J);
+}  // end deflation 43
+
+// page 13
+// i,j >= 1, i > j, and |di - dj| < epsilon * norm2(M)
+// We apply two rotations to have zi = 0, and dj = di.
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::deflation44(Index firstColu, Index firstColm, Index firstRowW, Index firstColW, Index i,
+                                           Index j, Index size) {
+  using std::abs;
+  using std::sqrt;
+
+  RealScalar s = m_computed(firstColm + i, firstColm);
+  RealScalar c = m_computed(firstColm + j, firstColm);
+  RealScalar r = numext::hypot(c, s);
+  if (numext::is_exactly_zero(r)) {
+    m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
+    return;
+  }
+  c /= r;
+  s /= r;
+  m_computed(firstColm + j, firstColm) = r;
+  m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
+  m_computed(firstColm + i, firstColm) = Literal(0);
+
+  JacobiRotation<RealScalar> J(c, -s);
+  if (m_compU)
+    m_naiveU.middleRows(firstColu, size + 1).applyOnTheRight(firstColu + j, firstColu + i, J);
+  else
+    m_naiveU.applyOnTheRight(firstColu + j, firstColu + i, J);
+  if (m_compV) m_naiveV.middleRows(firstRowW, size).applyOnTheRight(firstColW + j, firstColW + i, J);
+}  // end deflation 44
+
+// acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
+template <typename RealScalar_>
+void bdcsvd_impl<RealScalar_>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW,
+                                         Index shift) {
+  using std::abs;
+  using std::sqrt;
+  const Index length = lastCol + 1 - firstCol;
+
+  Block<MatrixXr, Dynamic, 1> col0(m_computed, firstCol + shift, firstCol + shift, length, 1);
+  Diagonal<MatrixXr> fulldiag(m_computed);
+  VectorBlock<Diagonal<MatrixXr>, Dynamic> diag(fulldiag, firstCol + shift, length);
+
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar maxDiag = diag.tail((std::max)(Index(1), length - 1)).cwiseAbs().maxCoeff();
+  RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero, NumTraits<RealScalar>::epsilon() * maxDiag);
+  RealScalar epsilon_coarse =
+      Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
+
+  // condition 4.1
+  if (diag(0) < epsilon_coarse) {
+    diag(0) = epsilon_coarse;
+  }
+
+  // condition 4.2
+  for (Index i = 1; i < length; ++i)
+    if (abs(col0(i)) < epsilon_strict) {
+      col0(i) = Literal(0);
+    }
+
+  // condition 4.3
+  for (Index i = 1; i < length; i++)
+    if (diag(i) < epsilon_coarse) {
+      deflation43(firstCol, shift, i, length);
+    }
+
+  {
+    // Check for total deflation:
+    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting.
+    const bool total_deflation = (col0.tail(length - 1).array().abs() < considerZero).all();
+
+    // Sort the diagonal entries: diag(1:k) and diag(k+1:length-1) are already sorted, so let's do a sorted merge.
+    // First, compute the respective permutation.
+    Index* permutation = m_workspaceI.data();
+    {
+      permutation[0] = 0;
+      Index p = 1;
+
+      // Move deflated diagonal entries to the end.
+      for (Index i = 1; i < length; ++i)
+        if (diag(i) < considerZero) permutation[p++] = i;
+
+      Index i = 1, j = k + 1;
+      for (; p < length; ++p) {
+        if (i > k)
+          permutation[p] = j++;
+        else if (j >= length)
+          permutation[p] = i++;
+        else if (diag(i) < diag(j))
+          permutation[p] = j++;
+        else
+          permutation[p] = i++;
+      }
+    }
+
+    // If we have a total deflation, then we have to insert diag(0) at the right place
+    if (total_deflation) {
+      for (Index i = 1; i < length; ++i) {
+        Index pi = permutation[i];
+        if (diag(pi) < considerZero || diag(0) < diag(pi))
+          permutation[i - 1] = permutation[i];
+        else {
+          permutation[i - 1] = 0;
+          break;
+        }
+      }
+    }
+
+    // Current index of each col, and current column of each index
+    Index* realInd = m_workspaceI.data() + length;
+    Index* realCol = m_workspaceI.data() + 2 * length;
+
+    for (int pos = 0; pos < length; pos++) {
+      realCol[pos] = pos;
+      realInd[pos] = pos;
+    }
+
+    for (Index i = total_deflation ? 0 : 1; i < length; i++) {
+      const Index pi = permutation[length - (total_deflation ? i + 1 : i)];
+      const Index J = realCol[pi];
+
+      using std::swap;
+      // swap diagonal and first column entries:
+      swap(diag(i), diag(J));
+      if (i != 0 && J != 0) swap(col0(i), col0(J));
+
+      // change columns
+      if (m_compU)
+        m_naiveU.col(firstCol + i)
+            .segment(firstCol, length + 1)
+            .swap(m_naiveU.col(firstCol + J).segment(firstCol, length + 1));
+      else
+        m_naiveU.col(firstCol + i).segment(0, 2).swap(m_naiveU.col(firstCol + J).segment(0, 2));
+      if (m_compV)
+        m_naiveV.col(firstColW + i)
+            .segment(firstRowW, length)
+            .swap(m_naiveV.col(firstColW + J).segment(firstRowW, length));
+
+      // update real pos
+      const Index realI = realInd[i];
+      realCol[realI] = J;
+      realCol[pi] = i;
+      realInd[J] = realI;
+      realInd[i] = pi;
+    }
+  }
+
+  // condition 4.4
+  {
+    Index i = length - 1;
+    // Find last non-deflated entry.
+    while (i > 0 && (diag(i) < considerZero || abs(col0(i)) < considerZero)) --i;
+
+    for (; i > 1; --i)
+      if ((diag(i) - diag(i - 1)) < epsilon_coarse) {
+        deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i, i - 1, length);
+      }
+  }
+
+}  // end deflation
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BDCSVD_IMPL_H
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1abde17fd90..fd0275c0f07 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -372,8 +372,8 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, Options, true> {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) {
-    using std::abs;
-    using std::sqrt;
+    using numext::abs;
+    using numext::sqrt;
     Scalar z;
     JacobiRotation<Scalar> rot;
     RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p, p)) + numext::abs2(work_matrix.coeff(q, p)));
@@ -425,7 +425,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, Options, true> {
 };
 
 template <typename MatrixType_, int Options>
-struct traits<JacobiSVD<MatrixType_, Options> > : svd_traits<MatrixType_, Options> {
+struct traits<JacobiSVD<MatrixType_, Options>> : svd_traits<MatrixType_, Options> {
   typedef MatrixType_ MatrixType;
 };
 
@@ -497,7 +497,7 @@ struct traits<JacobiSVD<MatrixType_, Options> > : svd_traits<MatrixType_, Option
  * \sa MatrixBase::jacobiSvd()
  */
 template <typename MatrixType_, int Options_>
-class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
+class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_>> {
   typedef SVDBase<JacobiSVD> Base;
 
  public:
@@ -555,7 +555,8 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
    * \deprecated Will be removed in the next major Eigen version. Options should
    * be specified in the \a Options template parameter.
    */
-  EIGEN_DEPRECATED JacobiSVD(Index rows, Index cols, unsigned int computationOptions) {
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  JacobiSVD(Index rows, Index cols, unsigned int computationOptions) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, rows, cols);
     allocate(rows, cols, computationOptions);
   }
@@ -570,6 +571,11 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
     compute_impl(matrix, internal::get_computation_options(Options));
   }
 
+  template <typename Derived>
+  explicit JacobiSVD(const TriangularBase<Derived>& matrix) {  // decompose a triangular expression
+    compute_impl(matrix, internal::get_computation_options(Options));  // options come from the Options template arg
+  }
+
   /** \brief Constructor performing the decomposition of given matrix using specified options
    *         for computing unitaries.
    *
@@ -600,6 +606,11 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
     return compute_impl(matrix, m_computationOptions);
   }
 
+  template <typename Derived>
+  JacobiSVD& compute(const TriangularBase<Derived>& matrix) {  // triangular input, uses stored m_computationOptions
+    return compute_impl(matrix, m_computationOptions);  // forwards to the TriangularBase compute_impl overload
+  }
+
   /** \brief Method performing the decomposition of given matrix, as specified by
    *         the `computationOptions` parameter.
    *
@@ -610,7 +621,8 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
    * be specified in the \a Options template parameter.
    */
   template <typename Derived>
-  EIGEN_DEPRECATED JacobiSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  JacobiSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
     internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(m_computationOptions, matrix.rows(),
                                                                          matrix.cols());
     return compute_impl(matrix, computationOptions);
@@ -636,9 +648,17 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
   }
 
  private:
+  template <typename Derived>
+  JacobiSVD& compute_impl(const TriangularBase<Derived>& matrix, unsigned int computationOptions);
   template <typename Derived>
   JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
 
+  // Blocked sweep for the Jacobi SVD (works for both real and complex scalars).
+  // Extracted into a separate EIGEN_DONT_INLINE method to prevent the blocking
+  // code from interfering with the compiler's optimization of the non-blocking
+  // scalar sweep.
+  EIGEN_DONT_INLINE bool blocked_sweep(RealScalar considerAsZero, RealScalar precision, RealScalar& maxDiagEntry);
+
  protected:
   using Base::m_computationOptions;
   using Base::m_computeFullU;
@@ -672,8 +692,25 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
   internal::qr_preconditioner_impl<MatrixType, Options, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols>
       m_qr_precond_morerows;
   WorkMatrixType m_workMatrix;
+
+  // Blocking parameters for the Jacobi SVD sweep.
+#ifdef EIGEN_JACOBI_SVD_BLOCK_SIZE
+  static constexpr Index kDefaultBlockSize = EIGEN_JACOBI_SVD_BLOCK_SIZE;
+#else
+  static constexpr Index kDefaultBlockSize = 32;
+#endif
+
+  // Use the lower of the default block size and static maximum matrix dimensions.
+  static constexpr Index kBlockSize = internal::min_size_prefer_fixed(kDefaultBlockSize, MaxDiagSizeAtCompileTime);
 };
 
+template <typename MatrixType, int Options>
+template <typename Derived>
+JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const TriangularBase<Derived>& matrix,
+                                                                             unsigned int computationOptions) {
+  return compute_impl(matrix.toDenseMatrix(), computationOptions);  // densify, then reuse the dense overload
+}
+
 template <typename MatrixType, int Options>
 template <typename Derived>
 JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
@@ -682,7 +719,7 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
   EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
                       Input matrix must have the same Scalar type as the BDCSVD object.);
 
-  using std::abs;
+  using numext::abs;
 
   allocate(matrix.rows(), matrix.cols(), computationOptions);
 
@@ -699,6 +736,7 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
     m_isInitialized = true;
     m_info = InvalidInput;
     m_nonzeroSingularValues = 0;
+    m_singularValues.setZero();
     return *this;
   }
   if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
@@ -724,33 +762,72 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
   while (!finished) {
     finished = true;
 
-    // do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
-
-    for (Index p = 1; p < diagSize(); ++p) {
-      for (Index q = 0; q < p; ++q) {
-        // if this 2x2 sub-matrix is not diagonal already...
-        // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
-        // keep us iterating forever. Similarly, small denormal numbers are considered zero.
+    {
+      // Sweep with optional blocking for large matrices.
+      // Use blocking when the matrix is large enough that individual left rotations
+      // (strided row operations on column-major data) cause significant cache misses.
+      // The threshold is derived from the L2 cache size: blocking becomes worthwhile
+      // when n exceeds sqrt(L2 / 4). We divide by sizeof(float) rather than sizeof(RealScalar)
+      // because the cache miss pattern depends on the number of columns accessed (one cache
+      // line per column), not the scalar size. This also makes the threshold appropriately
+      // more conservative for larger types where GEMM overhead is higher.
+      const Index n = diagSize();
+#ifdef EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD
+      const Index blockingThreshold = EIGEN_JACOBI_SVD_BLOCKING_THRESHOLD;
+#else
+      const Index blockingThreshold = static_cast<Index>(std::sqrt(static_cast<double>(l2CacheSize() / sizeof(float))));
+#endif
+
+      if (n >= blockingThreshold) {
+        // The blocked sweep is in a separate EIGEN_DONT_INLINE method to prevent
+        // the blocking code from interfering with the compiler's optimization of
+        // the non-blocking scalar sweep below.
+        finished = !blocked_sweep(considerAsZero, precision, maxDiagEntry);
+      }
+      // Non-blocking paths: apply rotations individually. The real and complex
+      // paths are kept separate to avoid any codegen impact from the complex
+      // preconditioner on GCC's optimization of the real inner loop.
+      else
+        EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+          // Complex non-blocking sweep: condition each 2x2 block to be real before diagonalizing.
+          for (Index p = 1; p < n; ++p) {
+            for (Index q = 0; q < p; ++q) {
+              RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+              if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+                finished = false;
+                if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p, q,
+                                                                                              maxDiagEntry)) {
+                  JacobiRotation<RealScalar> j_left, j_right;
+                  internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+                  m_workMatrix.applyOnTheLeft(p, q, j_left);
+                  if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+                  m_workMatrix.applyOnTheRight(p, q, j_right);
+                  if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+                  maxDiagEntry = numext::maxi<RealScalar>(
+                      maxDiagEntry,
+                      numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+                }
+              }
+            }
+          }
+        }
+      else {
+        // Real non-blocking sweep: diagonalize each 2x2 block directly.
         RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
-        if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
-          finished = false;
-          // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
-          if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p, q,
-                                                                                        maxDiagEntry)) {
-            JacobiRotation<RealScalar> j_left, j_right;
-            internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
-            // accumulate resulting Jacobi rotations
-            m_workMatrix.applyOnTheLeft(p, q, j_left);
-            if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
-
-            m_workMatrix.applyOnTheRight(p, q, j_right);
-            if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
-
-            // keep track of the largest diagonal coefficient
-            maxDiagEntry = numext::maxi<RealScalar>(
-                maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+        for (Index p = 1; p < n; ++p) {
+          for (Index q = 0; q < p; ++q) {
+            if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+              finished = false;
+              JacobiRotation<RealScalar> j_left, j_right;
+              internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+              m_workMatrix.applyOnTheLeft(p, q, j_left);
+              if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+              m_workMatrix.applyOnTheRight(p, q, j_right);
+              if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+              maxDiagEntry = numext::maxi<RealScalar>(
+                  maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+              threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+            }
           }
         }
       }
@@ -800,6 +877,216 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
   return *this;
 }
 
+// One blocked Jacobi SVD sweep over all pairs (p, q) with q < p, for real and complex scalars.
+// For large n, applying left rotations (row operations on column-major data) causes cache
+// misses due to strided access. To mitigate this, up to kBlockSize left rotations are
+// accumulated into a small dense matrix and applied with a single product to the
+// contiguous row block q..q+kBlockSize-1 and the (possibly distant) row p.
+// Right rotations and column scalings act on columns (contiguous in column-major)
+// and are applied immediately, one at a time. Pairs left over once fewer than kBlockSize
+// columns remain before p are handled by a plain scalar loop.
+//
+// For complex types, the 2x2 preconditioning (making the block real) involves
+// complex left rotations and row scalings, which are also accumulated into the
+// block matrix. Column scalings from preconditioning are applied directly.
+// The top-left kBlockSize x kBlockSize corner of the accumulated matrix is lower
+// triangular, which is exploited with triangularView when row p is not adjacent.
+//
+// considerAsZero and precision define the threshold max(considerAsZero, precision * maxDiagEntry);
+// maxDiagEntry is updated in place. Returns true if any off-diagonal entry exceeded the threshold.
+template <typename MatrixType, int Options>
+EIGEN_DONT_INLINE bool JacobiSVD<MatrixType, Options>::blocked_sweep(RealScalar considerAsZero, RealScalar precision,
+                                                                     RealScalar& maxDiagEntry) {
+  using numext::abs;
+  using numext::sqrt;
+  const Index n = diagSize();
+  RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+  bool notFinished = false;  // becomes true as soon as one pair exceeds the threshold
+  static constexpr Index kBlockBufferSize = (kBlockSize + 1) * (kBlockSize + 1);  // extra row/column holds index p
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockBufferPtr, kBlockBufferSize, 0);
+  Map<Matrix<Scalar, kBlockSize + 1, kBlockSize + 1, MatrixOptions>, AlignedMax> blockBuffer(
+      blockBufferPtr, kBlockSize + 1, kBlockSize + 1);
+
+  ei_declare_aligned_stack_constructed_variable(Scalar, accumPtr, kBlockBufferSize, 0);
+  Map<Matrix<Scalar, kBlockSize + 1, kBlockSize + 1, MatrixOptions>, AlignedMax> accum(accumPtr, kBlockSize + 1,
+                                                                                       kBlockSize + 1);
+
+  for (Index p = 1; p < n; ++p) {
+    Index q = 0;
+
+    // Blocked loop: process kBlockSize pairs (p,q+qq) for qq=0..kBlockSize-1.
+    // We extract the relevant (kBlockSize+1) x (kBlockSize+1) submatrix of W
+    // into a small buffer, compute all rotations on the buffer, accumulate the
+    // left transformations into `accum`, and apply them in one GEMM at the end.
+    for (; q + kBlockSize <= p; q += kBlockSize) {
+      // Buffer = [ W(q:q+k, q:q+k)  W(q:q+k, p) ]
+      //          [ W(p, q:q+k)       W(p, p)      ]
+      blockBuffer.template topLeftCorner<kBlockSize, kBlockSize>() =
+          m_workMatrix.template block<kBlockSize, kBlockSize>(q, q);
+      blockBuffer.col(kBlockSize).template head<kBlockSize>() = m_workMatrix.col(p).template segment<kBlockSize>(q);
+      blockBuffer.row(kBlockSize).template head<kBlockSize>() = m_workMatrix.row(p).template segment<kBlockSize>(q);
+      blockBuffer(kBlockSize, kBlockSize) = m_workMatrix(p, p);
+
+      // Accumulator for left transformations: W <- accum * W.
+      // After processing qq pairs, accum's top-left kBlockSize x kBlockSize
+      // block is lower-triangular (each rotation only mixes row qq with row
+      // kBlockSize, so rows 0..qq-1 are unchanged).
+      accum.setIdentity(kBlockSize + 1, kBlockSize + 1);
+      bool blockDirty = false;  // set when any pair in this block exceeded the threshold
+
+      for (Index qq = 0; qq < kBlockSize; ++qq) {
+        if (abs(blockBuffer.coeff(kBlockSize, qq)) > threshold || abs(blockBuffer.coeff(qq, kBlockSize)) > threshold) {
+          notFinished = true;
+          blockDirty = true;
+
+          // Complex preconditioning: transform the 2x2 block
+          //   [w_pp  w_pq] = [buffer(kBlockSize, kBlockSize)  buffer(kBlockSize, qq)]
+          //   [w_qp  w_qq]   [buffer(qq, kBlockSize)          buffer(qq, qq)        ]
+          // to have real entries via unitary row/column operations, so
+          // real_2x2_jacobi_svd can be applied.
+          //
+          // Left operations (complex rotation, row scaling by e^{i*theta}) are
+          // accumulated into `accum` for deferred GEMM application.
+          // Right operations (column scaling) are applied directly since column
+          // ops are contiguous in column-major layout.
+          bool doRealSvd = true;  // always true for real scalars; recomputed after complex preconditioning
+          EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+            Scalar z;
+            // nn = ||(w_pp, w_qp)||_2, the norm of the first column of the 2x2 block.
+            RealScalar nn = sqrt(numext::abs2(blockBuffer.coeff(kBlockSize, kBlockSize)) +
+                                 numext::abs2(blockBuffer.coeff(qq, kBlockSize)));
+
+            if (numext::is_exactly_zero(nn)) {
+              // First column is zero => block is already upper triangular.
+              blockBuffer.coeffRef(kBlockSize, kBlockSize) = Scalar(0);
+              blockBuffer.coeffRef(qq, kBlockSize) = Scalar(0);
+
+              // Scale rows by z = e^{-i*arg(w)} to make remaining entries real.
+              if (abs(numext::imag(blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
+                z = abs(blockBuffer.coeff(kBlockSize, qq)) / blockBuffer.coeff(kBlockSize, qq);
+                blockBuffer.row(kBlockSize) *= z;
+                accum.row(kBlockSize) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(p) *= numext::conj(z);
+              }
+              if (abs(numext::imag(blockBuffer.coeff(qq, qq))) > considerAsZero) {
+                z = abs(blockBuffer.coeff(qq, qq)) / blockBuffer.coeff(qq, qq);
+                blockBuffer.row(qq) *= z;
+                accum.row(qq) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
+              }
+            } else {
+              // Apply complex Givens rotation to zero out w_qp:
+              //   [c  s] [w_pp]   [nn]      conj(w_pp)         w_qp
+              //   [-s c] [w_qp] = [0 ]  c = ----------,  s = ------
+              //                                 nn              nn
+              JacobiRotation<Scalar> rot;
+              rot.c() = numext::conj(blockBuffer.coeff(kBlockSize, kBlockSize)) / nn;
+              rot.s() = blockBuffer.coeff(qq, kBlockSize) / nn;
+              blockBuffer.applyOnTheLeft(kBlockSize, qq, rot);
+              accum.applyOnTheLeft(kBlockSize, qq, rot);  // accumulate left op
+              if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, rot.adjoint());
+
+              // Scale column qq by z = e^{-i*arg(w_pq)} to make w_pq real.
+              if (abs(numext::imag(blockBuffer.coeff(kBlockSize, qq))) > considerAsZero) {
+                z = abs(blockBuffer.coeff(kBlockSize, qq)) / blockBuffer.coeff(kBlockSize, qq);
+                blockBuffer.col(qq) *= z;
+                m_workMatrix.col(q + qq) *= z;  // right op: apply directly
+                if (computeV()) m_matrixV.col(q + qq) *= z;
+              }
+              // Scale row qq by z = e^{-i*arg(w_qq)} to make w_qq real.
+              if (abs(numext::imag(blockBuffer.coeff(qq, qq))) > considerAsZero) {
+                z = abs(blockBuffer.coeff(qq, qq)) / blockBuffer.coeff(qq, qq);
+                blockBuffer.row(qq) *= z;
+                accum.row(qq) *= z;  // accumulate left op
+                if (computeU()) m_matrixU.col(q + qq) *= numext::conj(z);
+              }
+            }
+            // Update maxDiagEntry from preconditioning.
+            maxDiagEntry = numext::maxi<RealScalar>(
+                maxDiagEntry, numext::maxi<RealScalar>(abs(blockBuffer.coeff(kBlockSize, kBlockSize)),
+                                                       abs(blockBuffer.coeff(qq, qq))));
+            // Check if 2x2 block still needs diagonalizing.
+            RealScalar precondThreshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+            doRealSvd = abs(blockBuffer.coeff(kBlockSize, qq)) > precondThreshold ||
+                        abs(blockBuffer.coeff(qq, kBlockSize)) > precondThreshold;
+          }
+
+          if (doRealSvd) {
+            // Compute the left/right rotations for the (now real) 2x2 block and apply them to the buffer.
+            JacobiRotation<RealScalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(blockBuffer, kBlockSize, qq, &j_left, &j_right);
+            blockBuffer.applyOnTheLeft(kBlockSize, qq, j_left);
+            blockBuffer.applyOnTheRight(kBlockSize, qq, j_right);
+
+            // Accumulate left rotation for deferred GEMM.
+            accum.applyOnTheLeft(kBlockSize, qq, j_left);
+
+            // Right rotation is a column op (contiguous): apply directly.
+            m_workMatrix.applyOnTheRight(p, q + qq, j_right);
+            if (computeU()) m_matrixU.applyOnTheRight(p, q + qq, j_left.transpose());
+            if (computeV()) m_matrixV.applyOnTheRight(p, q + qq, j_right);
+
+            maxDiagEntry = numext::maxi<RealScalar>(
+                maxDiagEntry, numext::maxi<RealScalar>(abs(blockBuffer.coeff(kBlockSize, kBlockSize)),
+                                                       abs(blockBuffer.coeff(qq, qq))));
+          }
+          threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+        }
+      }
+
+      // Apply accumulated left rotations: W <- accum * W, via GEMM.
+      // When p == q + kBlockSize, all kBlockSize+1 rows are contiguous.
+      // Otherwise, rows q..q+k-1 and row p are non-adjacent; we split:
+      //   [Mq]      [L11  l12] [Mq]
+      //   [Mp] <-   [l21  l22] [Mp]
+      // L11 is lower-triangular (exploited via triangularView).
+      if (blockDirty) {
+        if (p == q + kBlockSize) {
+          m_workMatrix.template middleRows<kBlockSize + 1>(q) =
+              accum * m_workMatrix.template middleRows<kBlockSize + 1>(q);
+        } else {
+          const auto L11 = accum.template topLeftCorner<kBlockSize, kBlockSize>();
+          const auto l12 = accum.col(kBlockSize).template head<kBlockSize>();
+          const auto l21 = accum.row(kBlockSize).template head<kBlockSize>();
+          const Scalar l22 = accum(kBlockSize, kBlockSize);
+          auto Mq = m_workMatrix.template middleRows<kBlockSize>(q);
+          auto Mp = m_workMatrix.row(p);
+          Matrix<Scalar, 1, Dynamic> Mp_save = Mp;  // copy of row p: Mp is overwritten before Mq is updated
+          Mp.noalias() = l21 * Mq + l22 * Mp_save;
+          Mq = L11.template triangularView<Lower>() * Mq + l12 * Mp_save;
+        }
+      }
+    }
+
+    // Scalar loop for remaining pairs after blocked processing.
+    for (; q < p; ++q) {
+      if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
+        notFinished = true;
+
+        bool doRealSvd = true;
+        EIGEN_IF_CONSTEXPR(NumTraits<Scalar>::IsComplex) {
+          doRealSvd = internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p,
+                                                                                                q, maxDiagEntry);
+        }
+
+        if (doRealSvd) {
+          JacobiRotation<RealScalar> j_left, j_right;
+          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+          m_workMatrix.applyOnTheLeft(p, q, j_left);
+          if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+          m_workMatrix.applyOnTheRight(p, q, j_right);
+          if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+          maxDiagEntry = numext::maxi<RealScalar>(
+              maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+        }
+        threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+      }
+    }
+  }
+
+  return notFinished;
+}
+
 /** \svd_module
  *
  * \return the singular value decomposition of \c *this computed by two-sided
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index dcb4dba2054..c34bf0d4402 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -290,7 +290,7 @@ class SVDBase : public SolverBase<SVDBase<Derived> > {
    * A x - b \Vert \f$.
    */
   template <typename Rhs>
-  inline const Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const;
+  inline Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const;
 #endif
 
   /** \brief Reports whether previous computation was successful.
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 6df6318c94a..12ecc05c35d 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -17,8 +17,8 @@
 namespace Eigen {
 
 namespace internal {
-// UpperBidiagonalization will probably be replaced by a Bidiagonalization class, don't want to make it stable API.
-// At the same time, it's useful to keep for now as it's about the only thing that is testing the BandMatrix class.
+// UpperBidiagonalization may be replaced by a Bidiagonalization class; not part of stable API.
+// Kept for now as it is one of the few tests exercising the BandMatrix class.
 
 template <typename MatrixType_>
 class UpperBidiagonalization {
@@ -171,8 +171,10 @@ void upperbidiagonalization_blocked_helper(
 
     // 1 - update the k-th column of A
     SubColumnType v_k = A.col(k).tail(remainingRows);
-    v_k -= V_k1 * Y.row(k).head(k).adjoint();
-    if (k) v_k.noalias() -= X_k1 * A.col(k).head(k);
+    if (k) {
+      v_k.noalias() -= V_k1 * Y.row(k).head(k).adjoint();
+      v_k.noalias() -= X_k1 * A.col(k).head(k);
+    }
 
     // 2 - construct left Householder transform in-place
     v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
@@ -204,7 +206,7 @@ void upperbidiagonalization_blocked_helper(
       u_k = u_k.conjugate();
       {
         u_k.noalias() -= Y_k * A.row(k).head(k + 1).adjoint();
-        if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
+        if (k) u_k.noalias() -= U_k1.adjoint() * X.row(k).head(k).adjoint();
       }
 
       // 5 - construct right Householder transform in-place
@@ -263,7 +265,7 @@ void upperbidiagonalization_blocked_helper(
  * Bidiagonal Form. by Jaeyoung Choi, Jack J. Dongarra, David W. Walker. (1995) section 3.3
  */
 template <typename MatrixType, typename BidiagType>
-void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal, Index maxBlockSize = 32,
+void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal, Index maxBlockSize = 16,
                                             typename MatrixType::Scalar* /*tempData*/ = 0) {
   typedef typename MatrixType::Scalar Scalar;
   typedef Block<MatrixType, Dynamic, Dynamic> BlockType;
@@ -311,7 +313,7 @@ void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagona
     typename MatrixType::RealScalar* upper_diagonal_ptr =
         upper_diagonal.size() > 0 ? &upper_diagonal.coeffRef(k) : nullptr;
 
-    if (k + bs == cols || bcols < 48)  // somewhat arbitrary threshold
+    if (k + bs == cols || bcols < 2 * blockSize)  // fall back to unblocked for small trailing submatrices
     {
       upperbidiagonalization_inplace_unblocked(B, &(bidiagonal.template diagonal<0>().coeffRef(k)), upper_diagonal_ptr,
                                                X.data());
@@ -330,7 +332,7 @@ UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::comput
   Index cols = matrix.cols();
   EIGEN_ONLY_USED_FOR_DEBUG(cols);
 
-  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for matrices satisfying rows>=cols.");
 
   m_householder = matrix;
 
@@ -350,7 +352,7 @@ UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::comput
   EIGEN_ONLY_USED_FOR_DEBUG(rows);
   EIGEN_ONLY_USED_FOR_DEBUG(cols);
 
-  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for matrices satisfying rows>=cols.");
 
   m_householder = matrix;
   upperbidiagonalization_inplace_blocked(m_householder, m_bidiagonal);
@@ -359,19 +361,6 @@ UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::comput
   return *this;
 }
 
-#if 0
-/** \return the Householder QR decomposition of \c *this.
-  *
-  * \sa class Bidiagonalization
-  */
-template<typename Derived>
-const UpperBidiagonalization<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::bidiagonalization() const
-{
-  return UpperBidiagonalization<PlainObject>(eval());
-}
-#endif
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h
index 3ccbb037e68..997bc2205c3 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -406,7 +406,7 @@ class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<MatrixType_, U
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -416,7 +416,8 @@ class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<MatrixType_, U
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
@@ -494,7 +495,7 @@ class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<MatrixType_,
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -504,7 +505,8 @@ class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<MatrixType_,
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
@@ -575,7 +577,7 @@ class SimplicialNonHermitianLLT
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -585,7 +587,8 @@ class SimplicialNonHermitianLLT
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
@@ -664,7 +667,7 @@ class SimplicialNonHermitianLDLT
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -674,7 +677,8 @@ class SimplicialNonHermitianLDLT
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
@@ -742,7 +746,7 @@ class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<Matr
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -757,7 +761,8 @@ class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<Matr
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 3c65541b84b..cd61f65bf9e 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -89,7 +89,7 @@ struct simpl_chol_helper {
         m_set[u] = v;
         u = next;
       }
-    };
+    }
   };
 
   // Computes the higher adjacency pattern by transposing the input lower adjacency matrix.
@@ -274,9 +274,12 @@ struct simpl_chol_helper {
   }
 };
 
-// Symbol is ODR-used, so we need a definition.
+// Out-of-class definition: required before C++17 because kEmpty is ODR-used; redundant and
+// deprecated from C++17 on (constexpr static data members are implicitly inline).
+#if EIGEN_COMP_CXXVER < 17
 template <typename Scalar, typename StorageIndex>
 constexpr StorageIndex simpl_chol_helper<Scalar, StorageIndex>::kEmpty;
+#endif
 
 }  // namespace internal
 
diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h
index 9f265f05604..99c896ab89b 100644
--- a/Eigen/src/SparseCore/AmbiVector.h
+++ b/Eigen/src/SparseCore/AmbiVector.h
@@ -173,7 +173,7 @@ Scalar_& AmbiVector<Scalar_, StorageIndex_>::coeffRef(Index i) {
     return m_buffer[i];
   else {
     ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_buffer);
-    // TODO factorize the following code to reduce code generation
+    // TODO: factor out the following code to reduce code generation
     eigen_assert(m_mode == IsSparse);
     if (m_llSize == 0) {
       // this is the first element
@@ -238,8 +238,8 @@ Scalar_& AmbiVector<Scalar_, StorageIndex_>::coeff(Index i) {
       Index elid = m_llStart;
       while (elid >= 0 && llElements[elid].index < i) elid = llElements[elid].next;
 
-      if (llElements[elid].index == i)
-        return llElements[m_llCurrent].value;
+      if (elid >= 0 && llElements[elid].index == i)
+        return llElements[elid].value;
       else
         return m_zero;
     }
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 8f8a6963a98..30bd8ad2f12 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -72,10 +72,9 @@ class CompressedStorage {
   void resize(Index size, double reserveSizeFactor = 0) {
     if (m_allocatedSize < size) {
       // Avoid underflow on the std::min<Index> call by choosing the smaller index type.
-      using SmallerIndexType =
-          typename std::conditional<static_cast<size_t>((std::numeric_limits<Index>::max)()) <
-                                        static_cast<size_t>((std::numeric_limits<StorageIndex>::max)()),
-                                    Index, StorageIndex>::type;
+      using SmallerIndexType = std::conditional_t<static_cast<size_t>((std::numeric_limits<Index>::max)()) <
+                                                      static_cast<size_t>((std::numeric_limits<StorageIndex>::max)()),
+                                                  Index, StorageIndex>;
       Index realloc_size =
           (std::min<Index>)(NumTraits<SmallerIndexType>::highest(), size + Index(reserveSizeFactor * double(size)));
       if (realloc_size < size) internal::throw_std_bad_alloc();
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 3c6e797bd5f..1753c4a5a2b 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -33,7 +33,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
   ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0);
   ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
 
-  std::memset(mask, 0, sizeof(bool) * rows);
+  std::fill_n(mask, rows, false);
 
   evaluator<Lhs> lhsEval(lhs);
   evaluator<Rhs> rhsEval(rhs);
@@ -79,8 +79,6 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
       const Index t200 = rows / 11;  // 11 == (log2(200)*1.39)
       const Index t = (rows * 100) / 139;
 
-      // FIXME reserve nnz non zeros
-      // FIXME implement faster sorting algorithms for very small nnz
       // if the result is sparse enough => use a quick sort
       // otherwise => loop through the entire vector
       // In order to avoid to perform an expensive log2 when the
@@ -106,10 +104,6 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
   res.finalize();
 }
 
-}  // end namespace internal
-
-namespace internal {
-
 // Helper template to generate new sparse matrix types
 template <class Source, int Order>
 using WithStorageOrder = SparseMatrix<typename Source::Scalar, Order, typename Source::StorageIndex>;
@@ -131,7 +125,8 @@ struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, ColMajo
 
     // If the result is tall and thin (in the extreme case a column vector)
     // then it is faster to sort the coefficients inplace instead of transposing twice.
-    // FIXME, the following heuristic is probably not very good.
+    // The dimension-only test here ignores nnz / per-column density; a proper
+    // cost model using estimated_nnz_prod would pick the right path more often.
     if (lhs.rows() > rhs.cols()) {
       using ColMajorMatrix = typename sparse_eval<ColMajorMatrixAux, ResultType::RowsAtCompileTime,
                                                   ResultType::ColsAtCompileTime, ColMajorMatrixAux::Flags>::type;
@@ -232,10 +227,6 @@ struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, RowMajo
   }
 };
 
-}  // end namespace internal
-
-namespace internal {
-
 template <typename Lhs, typename Rhs, typename ResultType>
 static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
   typedef typename remove_all_t<Lhs>::Scalar LhsScalar;
diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
index f2da5193475..b794263b41e 100644
--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -25,7 +25,7 @@ Derived &SparseMatrixBase<Derived>::operator=(const EigenBase<OtherDerived> &oth
 template <typename Derived>
 template <typename OtherDerived>
 Derived &SparseMatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived> &other) {
-  // TODO use the evaluator mechanism
+  // TODO: use the evaluator mechanism
   other.evalTo(derived());
   return derived();
 }
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 1342f4e7bd7..707bf7c1c46 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -18,7 +18,7 @@ namespace Eigen {
 // Subset of columns or rows
 template <typename XprType, int BlockRows, int BlockCols>
 class BlockImpl<XprType, BlockRows, BlockCols, true, Sparse>
-    : public SparseMatrixBase<Block<XprType, BlockRows, BlockCols, true> > {
+    : public SparseCompressedBase<Block<XprType, BlockRows, BlockCols, true> > {
   typedef internal::remove_all_t<typename XprType::Nested> MatrixTypeNested_;
   typedef Block<XprType, BlockRows, BlockCols, true> BlockType;
 
@@ -27,7 +27,7 @@ class BlockImpl<XprType, BlockRows, BlockCols, true, Sparse>
 
  protected:
   enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
-  typedef SparseMatrixBase<BlockType> Base;
+  typedef SparseCompressedBase<BlockType> Base;
   using Base::convert_index;
 
  public:
@@ -68,6 +68,32 @@ class BlockImpl<XprType, BlockRows, BlockCols, true, Sparse>
   Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
   Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
 
+  inline const Scalar* valuePtr() const { return m_matrix.valuePtr(); }  // nested expression's array, not offset
+  inline Scalar* valuePtr() { return m_matrix.valuePtr(); }
+
+  inline const StorageIndex* innerIndexPtr() const { return m_matrix.innerIndexPtr(); }  // nested array, not offset
+  inline StorageIndex* innerIndexPtr() { return m_matrix.innerIndexPtr(); }
+
+  inline const StorageIndex* outerIndexPtr() const {
+    const StorageIndex* p = m_matrix.outerIndexPtr();
+    return p ? p + m_outerStart : 0;  // shifted by m_outerStart; a null nested pointer stays null
+  }
+  inline StorageIndex* outerIndexPtr() {
+    StorageIndex* p = m_matrix.outerIndexPtr();
+    return p ? p + m_outerStart : 0;  // same shift, mutable access
+  }
+
+  inline const StorageIndex* innerNonZeroPtr() const {
+    const StorageIndex* p = m_matrix.innerNonZeroPtr();
+    return p ? p + m_outerStart : 0;  // shifted by m_outerStart; a null nested pointer stays null
+  }
+  inline StorageIndex* innerNonZeroPtr() {
+    StorageIndex* p = m_matrix.innerNonZeroPtr();
+    return p ? p + m_outerStart : 0;  // same shift, mutable access
+  }
+
+  bool isCompressed() const { return m_matrix.innerNonZeroPtr() == 0; }  // true iff the nested pointer is null
+
  protected:
   typename internal::ref_selector<XprType>::non_const_type m_matrix;
   Index m_outerStart;
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index c1682833d0e..37a23065ae6 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -113,6 +113,13 @@ class SparseCompressedBase : public SparseMatrixBase<Derived> {
   /** \returns whether \c *this is in compressed form. */
   inline bool isCompressed() const { return innerNonZeroPtr() == 0; }
 
+ protected:
+  Index coeffsStart() const {  // offset coeffs() adds to valuePtr(): outer[0], or 0 if there is none
+    const StorageIndex* outer = outerIndexPtr();
+    return (outer && derived().outerSize() > 0) ? internal::convert_index<Index>(outer[0]) : 0;
+  }
+
+ public:
   /** \returns a read-only view of the stored coefficients as a 1D array expression.
    *
    * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
@@ -120,7 +127,9 @@ class SparseCompressedBase : public SparseMatrixBase<Derived> {
    * \sa valuePtr(), isCompressed() */
   const Map<const Array<Scalar, Dynamic, 1>> coeffs() const {
     eigen_assert(isCompressed());
-    return Array<Scalar, Dynamic, 1>::Map(valuePtr(), nonZeros());
+    const Index start = coeffsStart();
+    const Scalar* values = valuePtr() + start;
+    return Array<Scalar, Dynamic, 1>::Map(values, nonZeros());
   }
 
   /** \returns a read-write view of the stored coefficients as a 1D array expression
@@ -135,7 +144,9 @@ class SparseCompressedBase : public SparseMatrixBase<Derived> {
    * \sa valuePtr(), isCompressed() */
   Map<Array<Scalar, Dynamic, 1>> coeffs() {
     eigen_assert(isCompressed());
-    return Array<Scalar, Dynamic, 1>::Map(valuePtr(), nonZeros());
+    const Index start = coeffsStart();
+    Scalar* values = valuePtr() + start;
+    return Array<Scalar, Dynamic, 1>::Map(values, nonZeros());
   }
 
   /** sorts the inner vectors in the range [begin,end) with respect to `Comp`
@@ -305,8 +316,7 @@ class SparseCompressedBase<Derived>::ReverseInnerIterator {
     }
   }
 
-  explicit ReverseInnerIterator(const SparseCompressedBase& mat)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_start(0), m_id(mat.nonZeros()) {
+  explicit ReverseInnerIterator(const SparseCompressedBase& mat) : ReverseInnerIterator(mat, Index(0)) {
     EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
   }
 
@@ -480,6 +490,7 @@ class CompressedStorageIterator {
     return *this;
   }
   inline reference operator*() const { return reference(m_data.keyPtr() + m_index, m_data.valuePtr() + m_index); }
+  inline reference operator[](Index index) { return *(*this + index); }  // Index, not int: no offset narrowing
 
 #define MAKE_COMP(OP) \
   inline bool operator OP(const CompressedStorageIterator& other) const { return m_index OP other.m_index; }
@@ -528,15 +539,17 @@ struct inner_sort_impl<Derived, Comp, true> {
   typedef typename Derived::Scalar Scalar;
   typedef typename Derived::StorageIndex StorageIndex;
   static inline void run(SparseCompressedBase<Derived>& obj, Index, Index) {
-    Index begin_offset = 0;
-    Index end_offset = obj.nonZeros();
+    const StorageIndex* outer = obj.outerIndexPtr();
+    Index begin_offset = (outer && obj.outerSize() > 0) ? internal::convert_index<Index>(outer[0]) : 0;
+    Index end_offset = begin_offset + obj.nonZeros();
     CompressedStorageIterator<Scalar, StorageIndex> begin_it(begin_offset, obj.innerIndexPtr(), obj.valuePtr());
     CompressedStorageIterator<Scalar, StorageIndex> end_it(end_offset, obj.innerIndexPtr(), obj.valuePtr());
     std::sort(begin_it, end_it, Comp());
   }
   static inline Index check(const SparseCompressedBase<Derived>& obj, Index, Index) {
-    Index begin_offset = 0;
-    Index end_offset = obj.nonZeros();
+    const StorageIndex* outer = obj.outerIndexPtr();
+    Index begin_offset = (outer && obj.outerSize() > 0) ? internal::convert_index<Index>(outer[0]) : 0;
+    Index end_offset = begin_offset + obj.nonZeros();
     const StorageIndex* begin_it = obj.innerIndexPtr() + begin_offset;
     const StorageIndex* end_it = obj.innerIndexPtr() + end_offset;
     return std::is_sorted(begin_it, end_it, Comp()) ? 1 : 0;
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index 7fcf2c219dc..b93194116eb 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -32,7 +32,7 @@ namespace Eigen {
 //  4 - dense op dense     product      dense
 //                         generic      dense
 //
-// TODO to ease compiler job, we could specialize product/quotient with a scalar
+// TODO: to ease compiler job, we could specialize product/quotient with a scalar
 //      and fallback to cwise-unary evaluator using bind1st_op and bind2nd_op.
 
 template <typename BinaryOp, typename Lhs, typename Rhs>
@@ -858,7 +858,7 @@ Derived& SparseMatrixBase<Derived>::operator+=(const EigenBase<OtherDerived>& ot
 template <typename Derived>
 template <typename OtherDerived>
 Derived& SparseMatrixBase<Derived>::operator-=(const EigenBase<OtherDerived>& other) {
-  call_assignment(derived(), other.derived(), internal::assign_op<Scalar, typename OtherDerived::Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 17ce596a5af..75127937c77 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -26,11 +26,18 @@ struct product_promote_storage_type<Dense, Sparse, OuterProduct> {
   typedef Sparse ret;
 };
 
+// Type trait to detect if a sparse type supports direct compressed storage access
+// (i.e., has valuePtr(), innerIndexPtr(), outerIndexPtr(), isCompressed()).
+// True exactly when T derives from its own CRTP base SparseCompressedBase<T>, which provides these methods.
+template <typename T>
+struct has_compressed_storage : std::is_base_of<SparseCompressedBase<T>, T> {};
+
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType,
           int LhsStorageOrder = ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
           bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
 struct sparse_time_dense_product_impl;
 
+// RowMajor, single column (ColPerCol=true): CSR SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       RowMajor, true> {
@@ -39,36 +46,123 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
   typedef internal::remove_all_t<DenseResType> Res;
   typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   typedef evaluator<Lhs> LhsEval;
+  typedef typename Res::Scalar ResScalar;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
     LhsEval lhsEval(lhs);
-
     Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
-    Index threads = Eigen::nbThreads();
-#endif
 
     for (Index c = 0; c < rhs.cols(); ++c) {
+      runCol(lhsEval, lhs, rhs, res, alpha, n, c, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+    }
+  }
+
+  // Direct pointer path (compressed or not): picks runColImpl by whether the rhs type sets DirectAccessBit.
+  static void runCol(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::true_type /* has_compressed_storage */) {
+    runColImpl(lhs, rhs, res, alpha, n, c, std::integral_constant<bool, bool(DenseRhsType::Flags & DirectAccessBit)>());
+  }
+
+  template <typename RhsT>
+  static void runColImpl(const SparseLhsType& lhs, const RhsT& rhs, DenseResType& res, const ResScalar& alpha, Index n,
+                         Index c, std::true_type) {
+    const Lhs& mat = lhs;
+    const auto* vals = mat.valuePtr();
+    const auto* inds = mat.innerIndexPtr();
+    // Sparse vectors don't store outer indices.
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast rhs pointer path requires unit inner stride (common case: VectorXd, contiguous matrix column).
+    if (rhs.innerStride() == 1) {
+      const auto* x = rhs.data() + c * rhs.outerStride();
 #ifdef EIGEN_HAS_OPENMP
-      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
-      // It basically represents the minimal amount of work to be done to be worth it.
-      if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+      Index threads = Eigen::nbThreads();
+      if (threads > 1 && mat.nonZeros() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer ? outer[i] : 0;
+          const Index end = innerNnz ? (outer ? outer[i] : 0) + innerNnz[i]
+                                     : (outer ? outer[i + 1] : mat.nonZeros());
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
       } else
 #endif
       {
-        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+        for (Index i = 0; i < n; ++i) {
+          Index k = outer ? outer[i] : 0;
+          const Index end = innerNnz ? (outer ? outer[i] : 0) + innerNnz[i]
+                                     : (outer ? outer[i + 1] : mat.nonZeros());
+          // Two independent accumulators to break the dependency chain
+          ResScalar sum0(0), sum1(0);
+          for (; k < end; ++k) {
+            sum0 += vals[k] * x[inds[k]];
+            ++k;
+            if (k < end) {
+              sum1 += vals[k] * x[inds[k]];
+            }
+          }
+          res.coeffRef(i, c) += alpha * (sum0 + sum1);
+        }
       }
+    } else {
+      runColImpl(lhs, rhs, res, alpha, n, c, std::false_type());
+    }
+  }
+
+  // Fallback when the rhs lacks unit-stride direct access: raw pointers for the sparse side, coeff() for the rhs.
+  template <typename RhsT>
+  static void runColImpl(const SparseLhsType& lhs, const RhsT& rhs, DenseResType& res, const ResScalar& alpha, Index n,
+                         Index c, std::false_type) {
+    const Lhs& mat = lhs;
+    const auto* vals = mat.valuePtr();
+    const auto* inds = mat.innerIndexPtr();
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // NOTE(review): unlike the unit-stride overload, this loop has no OpenMP branch; confirm that is intended.
+    for (Index i = 0; i < n; ++i) {
+      Index k = outer ? outer[i] : 0;
+      const Index end = innerNnz ? (outer ? outer[i] : 0) + innerNnz[i]
+                                 : (outer ? outer[i + 1] : mat.nonZeros());
+      ResScalar sum0(0), sum1(0);  // consecutive entries alternate between two accumulators
+      for (; k < end; ++k) {
+        sum0 += vals[k] * rhs.coeff(inds[k], c);
+        ++k;
+        if (k < end) {
+          sum1 += vals[k] * rhs.coeff(inds[k], c);
+        }
+      }
+      res.coeffRef(i, c) += alpha * (sum0 + sum1);
+    }
+  }
+
+  // Iterator fallback path
+  static void runCol(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, DenseResType& res,
+                     const ResScalar& alpha, Index n, Index c, std::false_type /* has_compressed_storage */) {
+#ifdef EIGEN_HAS_OPENMP
+    Index threads = Eigen::nbThreads();
+    if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+    } else
+#endif
+    {
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
     }
   }
 
-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
-                         const typename Res::Scalar& alpha, Index i, Index col) {
-    // Two accumulators, which breaks the dependency chain on the accumulator
-    // and allows more instruction-level parallelism in the following loop
-    typename Res::Scalar tmp_a(0);
-    typename Res::Scalar tmp_b(0);
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const ResScalar& alpha,
+                         Index i, Index col) {
+    ResScalar tmp_a(0);
+    ResScalar tmp_b(0);
     for (LhsInnerIterator it(lhsEval, i); it; ++it) {
       tmp_a += it.value() * rhs.coeff(it.index(), col);
       ++it;
@@ -80,17 +174,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
   }
 };
 
-// FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
-// -> let's disable it for now as it is conflicting with generic scalar*matrix and matrix*scalar operators
-// template<typename T1, typename T2/*, int Options_, typename StrideType_*/>
-// struct ScalarBinaryOpTraits<T1, Ref<T2/*, Options_, StrideType_*/> >
-// {
-//   enum {
-//     Defined = 1
-//   };
-//   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
-// };
-
+// ColMajor, single column (ColPerCol=true): CSC SpMV
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
   typedef internal::remove_all_t<SparseLhsType> Lhs;
@@ -98,11 +182,61 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
   typedef internal::remove_all_t<DenseResType> Res;
   typedef evaluator<Lhs> LhsEval;
   typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path (compressed or not): adds lhs * (alpha * rhs) to res, one lhs outer vector at a time.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    // Sparse vectors don't store outer indices: a null outer is handled below by starting at 0.
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    // The fast result pointer path requires contiguous ColMajor result layout.
+    // Transpose<ColMajor> reports innerStride()==1 but is actually RowMajor, so check both.
+    if (!(Res::Flags & RowMajorBit) && res.innerStride() == 1) {
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        typename Res::Scalar* y = res.data() + c * res.outerStride();
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer ? outer[j] : 0;
+          const Index end = innerNnz ? start + innerNnz[j] : (outer ? outer[j + 1] : mat.nonZeros());
+          Index k = start;
+          // 4-way unrolled scatter-add (no SIMD: writes are scattered)
+          for (; k + 3 < end; k += 4) {
+            y[inds[k]] += vals[k] * rhs_j;
+            y[inds[k + 1]] += vals[k + 1] * rhs_j;
+            y[inds[k + 2]] += vals[k + 2] * rhs_j;
+            y[inds[k + 3]] += vals[k + 3] * rhs_j;
+          }
+          for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j;  // remainder (fewer than 4 entries left)
+        }
+      }
+    } else {
+      // Row-major or non-unit-stride result: use coeffRef() for result access
+      for (Index c = 0; c < rhs.cols(); ++c) {
+        for (Index j = 0; j < lhs.outerSize(); ++j) {
+          typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+          const Index start = outer ? outer[j] : 0;
+          const Index end = innerNnz ? start + innerNnz[j] : (outer ? outer[j + 1] : mat.nonZeros());
+          for (Index k = start; k < end; ++k) res.coeffRef(inds[k], c) += vals[k] * rhs_j;
+        }
+      }
+    }
+  }
+
+  // Iterator-based fallback
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha,
+                      std::false_type /* has_compressed_storage */) {
     LhsEval lhsEval(lhs);
     for (Index c = 0; c < rhs.cols(); ++c) {
       for (Index j = 0; j < lhs.outerSize(); ++j) {
-        //        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
         typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
         for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
       }
@@ -110,6 +244,7 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
   }
 };
 
+// RowMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       RowMajor, false> {
@@ -118,6 +253,9 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
   typedef internal::remove_all_t<DenseResType> Res;
   typedef evaluator<Lhs> LhsEval;
   typedef typename LhsEval::InnerIterator LhsInnerIterator;
+
+  static constexpr bool IsCompressedLhs = has_compressed_storage<Lhs>::value;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
     Index n = lhs.rows();
@@ -129,21 +267,43 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
     // It basically represents the minimal amount of work to be done to be worth it.
     if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
 #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
     } else
 #endif
     {
-      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+      for (Index i = 0; i < n; ++i)
+        processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>());
     }
   }
 
-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
-                         Index i) {
+  // Direct pointer path (compressed or not): res.row(i) += sum over stored entries k of (alpha * vals[k]) * rhs.row.
+  static void processRow(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    // Sparse vectors don't store outer indices: a null outer pointer means the entries start at 0.
+    const Index start = mat.outerIndexPtr() ? mat.outerIndexPtr()[i] : 0;
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    const Index end = innerNnz
+                          ? start + innerNnz[i]  // innerNnz set (not compressed): explicit count
+                          : (mat.outerIndexPtr() ? mat.outerIndexPtr()[i + 1]  // compressed: next outer index
+                                                 : mat.nonZeros());  // no outer indices: all stored entries
+    typename Res::RowXpr res_i(res.row(i));
+    for (Index k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]);
+  }
+
+  static void processRow(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, Res& res,
+                         const typename Res::Scalar& alpha, Index i, std::false_type /* has_compressed_storage */) {
     typename Res::RowXpr res_i(res.row(i));
     for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
   }
 };
 
+// ColMajor, multiple columns (ColPerCol=false): sparse * dense_matrix
 template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
 struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                       ColMajor, false> {
@@ -151,8 +311,33 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
   typedef internal::remove_all_t<DenseRhsType> Rhs;
   typedef internal::remove_all_t<DenseResType> Res;
   typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                   const typename Res::Scalar& alpha) {
+    runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>());
+  }
+
+  // Direct pointer path: reads valuePtr()/innerIndexPtr() directly, for compressed and non-compressed storage.
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::true_type /* has_compressed_storage */) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Lhs::StorageIndex StorageIndex;
+    const Lhs& mat = lhs;
+    const LhsScalar* vals = mat.valuePtr();
+    const StorageIndex* inds = mat.innerIndexPtr();
+    // Sparse vectors don't store outer indices: `outer` may be null, in which case ranges start at 0.
+    const auto* outer = mat.outerIndexPtr();
+    const auto* innerNnz = mat.innerNonZeroPtr();
+    for (Index j = 0; j < lhs.outerSize(); ++j) {
+      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
+      const Index start = outer ? outer[j] : 0;
+      const Index end = innerNnz ? start + innerNnz[j] : (outer ? outer[j + 1] : mat.nonZeros());
+      for (Index k = start; k < end; ++k) res.row(inds[k]) += (alpha * vals[k]) * rhs_j;
+    }
+  }
+
+  static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                      const typename Res::Scalar& alpha, std::false_type /* has_compressed_storage */) {
     evaluator<Lhs> lhsEval(lhs);
     for (Index j = 0; j < lhs.outerSize(); ++j) {
       typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
diff --git a/Eigen/src/SparseCore/SparseDiagonalProduct.h b/Eigen/src/SparseCore/SparseDiagonalProduct.h
index 1f72a6b3ce3..e02fdaa7c3e 100644
--- a/Eigen/src/SparseCore/SparseDiagonalProduct.h
+++ b/Eigen/src/SparseCore/SparseDiagonalProduct.h
@@ -39,7 +39,11 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, Diagonal
     : public sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType,
                                                Rhs::Flags & RowMajorBit ? SDP_AsScalarProduct : SDP_AsCwiseProduct> {
   typedef Product<Lhs, Rhs, DefaultProduct> XprType;
-  enum { CoeffReadCost = HugeCost, Flags = Rhs::Flags & RowMajorBit, Alignment = 0 };  // FIXME CoeffReadCost & Flags
+  enum {
+    CoeffReadCost = HugeCost,
+    Flags = Rhs::Flags & RowMajorBit,
+    Alignment = 0
+  };  // FIXME: compute proper CoeffReadCost and propagate Flags.
 
   typedef sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType,
                                             Rhs::Flags & RowMajorBit ? SDP_AsScalarProduct : SDP_AsCwiseProduct>
@@ -52,7 +56,11 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, SparseSh
     : public sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>,
                                                Lhs::Flags & RowMajorBit ? SDP_AsCwiseProduct : SDP_AsScalarProduct> {
   typedef Product<Lhs, Rhs, DefaultProduct> XprType;
-  enum { CoeffReadCost = HugeCost, Flags = Lhs::Flags & RowMajorBit, Alignment = 0 };  // FIXME CoeffReadCost & Flags
+  enum {
+    CoeffReadCost = HugeCost,
+    Flags = Lhs::Flags & RowMajorBit,
+    Alignment = 0
+  };  // FIXME: compute proper CoeffReadCost and propagate Flags.
 
   typedef sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>,
                                             Lhs::Flags & RowMajorBit ? SDP_AsCwiseProduct : SDP_AsScalarProduct>
diff --git a/Eigen/src/SparseCore/SparseDot.h b/Eigen/src/SparseCore/SparseDot.h
index 76a4f6cb795..485605fd4bf 100644
--- a/Eigen/src/SparseCore/SparseDot.h
+++ b/Eigen/src/SparseCore/SparseDot.h
@@ -36,10 +36,10 @@ inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot
   Scalar res1(0);
   Scalar res2(0);
   for (; i; ++i) {
-    res1 = numext::fma(numext::conj(i.value()), other.coeff(i.index()), res1);
+    res1 = numext::madd<Scalar>(numext::conj(i.value()), other.coeff(i.index()), res1);
     ++i;
     if (i) {
-      res2 = numext::fma(numext::conj(i.value()), other.coeff(i.index()), res2);
+      res2 = numext::madd<Scalar>(numext::conj(i.value()), other.coeff(i.index()), res2);
     }
   }
   return res1 + res2;
@@ -67,7 +67,7 @@ inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot
   Scalar res(0);
   while (i && j) {
     if (i.index() == j.index()) {
-      res += numext::conj(i.value()) * j.value();
+      res = numext::madd<Scalar>(numext::conj(i.value()), j.value(), res);
       ++i;
       ++j;
     } else if (i.index() < j.index())
diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h
index 73e29c7b175..79b464c5b79 100644
--- a/Eigen/src/SparseCore/SparseMap.h
+++ b/Eigen/src/SparseCore/SparseMap.h
@@ -131,11 +131,8 @@ class SparseMapBase<Derived, ReadOnlyAccessors> : public SparseCompressedBase<De
         m_values(valuePtr),
         m_innerNonZeros(0) {}
 
-  /** Empty destructor */
-  inline ~SparseMapBase() {}
-
  protected:
-  inline SparseMapBase() {}
+  inline SparseMapBase() = default;
 };
 
 /** \ingroup SparseCore_Module
@@ -194,11 +191,8 @@ class SparseMapBase<Derived, WriteAccessors> : public SparseMapBase<Derived, Rea
   inline SparseMapBase(Index size, Index nnz, StorageIndex* innerIndexPtr, Scalar* valuePtr)
       : Base(size, nnz, innerIndexPtr, valuePtr) {}
 
-  /** Empty destructor */
-  inline ~SparseMapBase() {}
-
  protected:
-  inline SparseMapBase() {}
+  inline SparseMapBase() = default;
 };
 
 /** \ingroup SparseCore_Module
@@ -238,8 +232,6 @@ class Map<SparseMatrixType> : public SparseMapBase<Derived, WriteAccessors>
              Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
       : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) {}
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  /** Empty destructor */
-  inline ~Map() {}
 };
 
 template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
@@ -260,9 +252,6 @@ class Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideTy
   inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr, const StorageIndex* innerIndexPtr,
              const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
       : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) {}
-
-  /** Empty destructor */
-  inline ~Map() {}
 };
 
 namespace internal {
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 7ddb6fab901..2248def4f6b 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -1008,7 +1008,7 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
 
     const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar, Scalar>>::value;
     if (overwrite) {
-      if ((m_outerSize != n) || (m_innerSize != n)) resize(n, n);
+      if ((m_outerSize != n) || (m_innerSize != n) || (n == 0)) resize(n, n);
     }
 
     if (m_data.size() == 0 || overwrite) {
@@ -1130,7 +1130,11 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   using TransposedSparseMatrix =
       SparseMatrix<typename SparseMatrixType::Scalar, IsRowMajor ? ColMajor : RowMajor, StorageIndex>;
 
-  if (begin == end) return;
+  if (begin == end) {
+    // Clear out existing data (if any).
+    mat.setZero();
+    return;
+  }
 
   // There are two strategies to consider for constructing a matrix from unordered triplets:
   // A) construct the 'mat' in its native storage order and sort in-place (less memory); or,
@@ -1273,7 +1277,8 @@ void insert_from_triplets_sorted(const InputIterator& begin, const InputIterator
   using SrcXprType =
       CwiseBinaryOp<scalar_disjunction_op<DupFunctor, Scalar>, const SparseMatrixType, const SparseMatrixType>;
 
-  // TODO: process triplets without making a copy
+  // Avoiding the temporary `trips` matrix would require merging the triplets directly
+  // into mat while collapsing duplicates on the fly (non-trivial).
   SparseMatrixType trips(mat.rows(), mat.cols());
   set_from_triplets_sorted(begin, end, trips, dup_func);
 
@@ -1333,7 +1338,7 @@ void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromTriplets(const InputI
  * \code
  * value = dup_func(OldValue, NewValue)
  * \endcode
- * Here is a C++11 example keeping the latest entry only:
+ * Here is an example keeping the latest entry only:
  * \code
  * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
  * \endcode
@@ -1362,7 +1367,7 @@ void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromSortedTriplets(const
  * \code
  * value = dup_func(OldValue, NewValue)
  * \endcode
- * Here is a C++11 example keeping the latest entry only:
+ * Here is an example keeping the latest entry only:
  * \code
  * mat.setFromSortedTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
  * \endcode
@@ -1426,7 +1431,7 @@ void SparseMatrix<Scalar, Options_, StorageIndex_>::insertFromTriplets(const Inp
  * \code
  * value = dup_func(OldValue, NewValue)
  * \endcode
- * Here is a C++11 example keeping the latest entry only:
+ * Here is an example keeping the latest entry only:
  * \code
  * mat.insertFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
  * \endcode
@@ -1455,7 +1460,7 @@ void SparseMatrix<Scalar, Options_, StorageIndex_>::insertFromSortedTriplets(con
  * \code
  * value = dup_func(OldValue, NewValue)
  * \endcode
- * Here is a C++11 example keeping the latest entry only:
+ * Here is an example keeping the latest entry only:
  * \code
  * mat.insertFromSortedTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
  * \endcode
@@ -1544,7 +1549,7 @@ SparseMatrix<Scalar, Options_, StorageIndex_>::operator=(const SparseMatrixBase<
     Eigen::Map<IndexVector>(dest.m_outerIndex, dest.outerSize()).setZero();
 
     // pass 1
-    // FIXME the above copy could be merged with that pass
+    // FIXME: merge the above copy into this pass to avoid iterating twice.
     for (Index j = 0; j < otherCopy.outerSize(); ++j)
       for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it) ++dest.m_outerIndex[it.index()];
 
@@ -1842,26 +1847,34 @@ class Serializer<SparseMatrix<Scalar, Options, StorageIndex>, void> {
       // Inner non-zero counts.
       std::size_t data_bytes = sizeof(StorageIndex) * header.outer_size;
       if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
-      memcpy(value.innerNonZeroPtr(), src, data_bytes);
+      if (data_bytes != 0) {
+        memcpy(value.innerNonZeroPtr(), src, data_bytes);
+      }
       src += data_bytes;
     }
 
     // Outer indices.
     std::size_t data_bytes = sizeof(StorageIndex) * (header.outer_size + 1);
     if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
-    memcpy(value.outerIndexPtr(), src, data_bytes);
+    if (data_bytes != 0) {
+      memcpy(value.outerIndexPtr(), src, data_bytes);
+    }
     src += data_bytes;
 
     // Inner indices.
     data_bytes = sizeof(StorageIndex) * header.inner_buffer_size;
     if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
-    memcpy(value.innerIndexPtr(), src, data_bytes);
+    if (data_bytes != 0) {
+      memcpy(value.innerIndexPtr(), src, data_bytes);
+    }
     src += data_bytes;
 
     // Values.
     data_bytes = sizeof(Scalar) * header.inner_buffer_size;
     if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
-    memcpy(value.valuePtr(), src, data_bytes);
+    if (data_bytes != 0) {
+      memcpy(value.valuePtr(), src, data_bytes);
+    }
     src += data_bytes;
     return src;
   }
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 7ac16f83452..514107ee287 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -115,7 +115,7 @@ class SparseMatrixBase : public EigenBase<Derived> {
   typedef Transpose<Derived> TransposeReturnType;
   typedef Transpose<const Derived> ConstTransposeReturnType;
 
-  // FIXME storage order do not match evaluator storage order
+  // FIXME: storage order may not match evaluator storage order.
   typedef SparseMatrix<Scalar, Flags & RowMajorBit ? RowMajor : ColMajor, StorageIndex> PlainObject;
 
   /** This is the "real scalar" type; if the \a Scalar type is already real numbers
@@ -203,7 +203,7 @@ class SparseMatrixBase : public EigenBase<Derived> {
     return derived();
   }
 
-  SparseMatrixBase() : m_isRValue(false) { /* TODO check flags */
+  SparseMatrixBase() : m_isRValue(false) { /* TODO: validate traits flags. */
   }
 
   template <typename OtherDerived>
@@ -224,33 +224,84 @@ class SparseMatrixBase : public EigenBase<Derived> {
  public:
 #ifndef EIGEN_NO_IO
   friend std::ostream& operator<<(std::ostream& s, const SparseMatrixBase& m) {
-    typedef typename Derived::Nested Nested;
-    typedef internal::remove_all_t<Nested> NestedCleaned;
+    using Nested = typename Derived::Nested;
+    using NestedCleaned = internal::remove_all_t<Nested>;
 
     if (Flags & RowMajorBit) {
       Nested nm(m.derived());
       internal::evaluator<NestedCleaned> thisEval(nm);
+
+      // Compute the widest formatted coefficient (zero included) so that all columns align.
+      std::size_t width = 0;
+      {
+        std::ostringstream ss0;
+        ss0.copyfmt(s);
+        ss0 << Scalar(0);
+        width = ss0.str().size();
+        for (Index row = 0; row < nm.outerSize(); ++row) {
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it) {
+            std::ostringstream ss;
+            ss.copyfmt(s);
+            ss << it.value();
+
+            const std::size_t potential_width = ss.str().size();
+            if (potential_width > width) width = potential_width;
+          }
+        }
+      }
+
       for (Index row = 0; row < nm.outerSize(); ++row) {
         Index col = 0;
         for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it) {
-          for (; col < it.index(); ++col) s << "0 ";
+          for (; col < it.index(); ++col) {
+            s.width(width);
+            s << Scalar(0) << " ";
+          }
+          s.width(width);
           s << it.value() << " ";
           ++col;
         }
-        for (; col < m.cols(); ++col) s << "0 ";
+        for (; col < m.cols(); ++col) {
+          s.width(width);
+          s << Scalar(0) << " ";
+        }
         s << std::endl;
       }
     } else {
       Nested nm(m.derived());
       internal::evaluator<NestedCleaned> thisEval(nm);
       if (m.cols() == 1) {
+        // Compute the widest formatted coefficient (zero included) of the single column.
+        std::size_t width = 0;
+        {
+          std::ostringstream ss0;
+          ss0.copyfmt(s);
+          ss0 << Scalar(0);
+          width = ss0.str().size();
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it) {
+            std::ostringstream ss;
+            ss.copyfmt(s);
+            ss << it.value();
+
+            const std::size_t potential_width = ss.str().size();
+            if (potential_width > width) width = potential_width;
+          }
+        }
+
         Index row = 0;
         for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it) {
-          for (; row < it.index(); ++row) s << "0" << std::endl;
+          for (; row < it.index(); ++row) {
+            s.width(width);
+            s << Scalar(0) << std::endl;
+          }
+          s.width(width);
           s << it.value() << std::endl;
           ++row;
         }
-        for (; row < m.rows(); ++row) s << "0" << std::endl;
+        for (; row < m.rows(); ++row) {
+          s.width(width);
+          s << Scalar(0) << std::endl;
+        }
       } else {
         SparseMatrix<Scalar, RowMajorBit, StorageIndex> trans = m;
         s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, StorageIndex> >&>(trans);
diff --git a/Eigen/src/SparseCore/SparsePermutation.h b/Eigen/src/SparseCore/SparsePermutation.h
index 56f572d3560..3fb370507e9 100644
--- a/Eigen/src/SparseCore/SparsePermutation.h
+++ b/Eigen/src/SparseCore/SparsePermutation.h
@@ -246,4 +246,4 @@ inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>
 
 }  // end namespace Eigen
 
-#endif  // EIGEN_SPARSE_SELFADJOINTVIEW_H
+#endif  // EIGEN_SPARSE_PERMUTATION_H
diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h
index c205e6ddde3..29cdda1a873 100644
--- a/Eigen/src/SparseCore/SparseRef.h
+++ b/Eigen/src/SparseCore/SparseRef.h
@@ -89,11 +89,19 @@ class SparseRefBase : public SparseMapBase<Derived> {
  protected:
   template <typename Expression>
   void construct(Expression& expr) {
-    if (expr.outerIndexPtr() == 0)
+    if (Expression::IsVectorAtCompileTime) {
+      const Index offset = expr.outerIndexPtr() ? expr.outerIndexPtr()[0] : 0;
+      auto inner_index_ptr = expr.innerIndexPtr();
+      auto value_ptr = expr.valuePtr();
+      if (inner_index_ptr) inner_index_ptr += offset;
+      if (value_ptr) value_ptr += offset;
+      internal::construct_at<Base>(this, expr.size(), expr.nonZeros(), inner_index_ptr, value_ptr);
+    } else if (expr.outerIndexPtr() == 0) {
       internal::construct_at<Base>(this, expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr());
-    else
+    } else {
       internal::construct_at<Base>(this, expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(),
                                    expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr());
+    }
   }
 };
 
@@ -117,7 +125,8 @@ class Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>
 #else
 template <typename SparseMatrixType, int Options>
 class Ref<SparseMatrixType, Options>
-    : public SparseMapBase<Derived, WriteAccessors>  // yes, that's weird to use Derived here, but that works!
+    : public SparseMapBase<Derived, WriteAccessors>  // Note: 'Derived' is used here intentionally; it resolves
+                                                     // correctly via CRTP.
 #endif
 {
   typedef SparseMatrix<MatScalar, MatOptions, MatIndex> PlainObjectType;
@@ -267,6 +276,8 @@ class Ref<SparseVectorType> : public SparseMapBase<Derived, WriteAccessors>
   {
     EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
     EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+    EIGEN_STATIC_ASSERT((!std::is_same<Derived, PlainObjectType>::value),
+                        THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
     Base::construct(expr.const_cast_derived());
   }
 };
@@ -322,8 +333,8 @@ class Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideTy
 
 namespace internal {
 
-// FIXME shall we introduce a general evaluatior_ref that we can specialize for any sparse object once, and thus remove
-// this copy-pasta thing...
+// FIXME: consider introducing a general evaluator_ref that we can specialize for any sparse object once, and thus
+// remove the copy-pasted evaluator specializations below.
 
 template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
 struct evaluator<Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 05b3de56e46..32fbcd4fbce 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -64,6 +64,8 @@ class SparseSelfAdjointView : public EigenBase<SparseSelfAdjointView<MatrixType,
   typedef Matrix<StorageIndex, Dynamic, 1> VectorI;
   typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
   typedef internal::remove_all_t<MatrixTypeNested> MatrixTypeNested_;
+  typedef SparseMatrix<Scalar, (MatrixTypeNested_::Flags & RowMajorBit) ? RowMajor : ColMajor, StorageIndex>
+      PlainObject;
 
   explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix) {
     eigen_assert(rows() == cols() && "SelfAdjointView is only for squared matrices");
@@ -114,6 +116,16 @@ class SparseSelfAdjointView : public EigenBase<SparseSelfAdjointView<MatrixType,
     return Product<OtherDerived, SparseSelfAdjointView>(lhs.derived(), rhs);
   }
 
+  // Scalar multiplication intentionally materializes the full matrix, unlike dense SelfAdjointView's lazy wrapper,
+  // matching the existing SparseSelfAdjointView products.
+  PlainObject operator*(const Scalar& s) const { return s * *this; }
+
+  friend PlainObject operator*(const Scalar& s, const SparseSelfAdjointView& mat) {
+    PlainObject res(mat);
+    res *= s;
+    return res;
+  }
+
   /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
    * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
    *
@@ -126,7 +138,7 @@ class SparseSelfAdjointView : public EigenBase<SparseSelfAdjointView<MatrixType,
   SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
 
   /** \returns an expression of P H P^-1 */
-  // TODO implement twists in a more evaluator friendly fashion
+  // TODO: implement twists in a more evaluator-friendly fashion.
   SparseSymmetricPermutationProduct<MatrixTypeNested_, Mode> twistedBy(
       const PermutationMatrix<Dynamic, Dynamic, StorageIndex>& perm) const {
     return SparseSymmetricPermutationProduct<MatrixTypeNested_, Mode>(m_matrix, perm);
@@ -161,8 +173,7 @@ class SparseSelfAdjointView : public EigenBase<SparseSelfAdjointView<MatrixType,
 
  protected:
   MatrixTypeNested m_matrix;
-  // mutable VectorI m_countPerRow;
-  // mutable VectorI m_countPerCol;
+
  private:
   template <typename Dest>
   void evalTo(Dest&) const;
@@ -205,7 +216,7 @@ SparseSelfAdjointView<MatrixType, Mode>& SparseSelfAdjointView<MatrixType, Mode>
 
 namespace internal {
 
-// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+// TODO: currently a selfadjoint expression has the form SelfAdjointView<.,.>
 //      in the future selfadjoint-ness should be defined by the expression traits
 //      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to
 //      make it work)
diff --git a/Eigen/src/SparseCore/SparseSolverBase.h b/Eigen/src/SparseCore/SparseSolverBase.h
index d67a6773be9..a1ed4ba1102 100644
--- a/Eigen/src/SparseCore/SparseSolverBase.h
+++ b/Eigen/src/SparseCore/SparseSolverBase.h
@@ -64,14 +64,17 @@ std::enable_if_t<Rhs::ColsAtCompileTime == 1 || Dest::ColsAtCompileTime == 1> so
  *
  */
 template <typename Derived>
-class SparseSolverBase : internal::noncopyable {
+class SparseSolverBase {
  public:
   /** Default constructor */
   SparseSolverBase() : m_isInitialized(false) {}
 
-  SparseSolverBase(SparseSolverBase&& other) : internal::noncopyable{}, m_isInitialized{other.m_isInitialized} {}
+  SparseSolverBase(const SparseSolverBase&) = delete;
+  SparseSolverBase& operator=(const SparseSolverBase&) = delete;
 
-  ~SparseSolverBase() {}
+  SparseSolverBase(SparseSolverBase&& other) : m_isInitialized{other.m_isInitialized} {}
+
+  ~SparseSolverBase() = default;
 
   Derived& derived() { return *static_cast<Derived*>(this); }
   const Derived& derived() const { return *static_cast<const Derived*>(this); }
@@ -81,7 +84,7 @@ class SparseSolverBase : internal::noncopyable {
    * \sa compute()
    */
   template <typename Rhs>
-  inline const Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const {
+  inline Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const {
     eigen_assert(m_isInitialized && "Solver is not initialized.");
     eigen_assert(derived().rows() == b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
     return Solve<Derived, Rhs>(derived(), b.derived());
@@ -92,7 +95,7 @@ class SparseSolverBase : internal::noncopyable {
    * \sa compute()
    */
   template <typename Rhs>
-  inline const Solve<Derived, Rhs> solve(const SparseMatrixBase<Rhs>& b) const {
+  inline Solve<Derived, Rhs> solve(const SparseMatrixBase<Rhs>& b) const {
     eigen_assert(m_isInitialized && "Solver is not initialized.");
     eigen_assert(derived().rows() == b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
     return Solve<Derived, Rhs>(derived(), b.derived());
diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index 6e1c9cf5074..41e5261aec6 100644
--- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -56,14 +56,11 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   res.reserve(estimated_nnz_prod);
   double ratioColRes = double(estimated_nnz_prod) / (double(lhs.rows()) * double(rhs.cols()));
   for (Index j = 0; j < cols; ++j) {
-    // FIXME:
-    // double ratioColRes = (double(rhs.innerVector(j).nonZeros()) +
-    // double(lhs.nonZeros())/double(lhs.cols()))/double(lhs.rows());
-    // let's do a more accurate determination of the nnz ratio for the current column j of res
+    // FIXME: compute a more accurate per-column nnz ratio for res.
     tempVector.init(ratioColRes);
     tempVector.setZero();
     for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) {
-      // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
+      // FIXME: rewrite as tmp += rhsIt.value() * lhs.col(rhsIt.index()).
       tempVector.restart();
       RhsScalar x = rhsIt.value();
       for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt) {
@@ -87,7 +84,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, ColMajo
   typedef typename ResultType::RealScalar RealScalar;
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
-    remove_all_t<ResultType> res_(res.rows(), res.cols());
+    remove_all_t<ResultType> res_{res.rows(), res.cols()};
     internal::sparse_sparse_product_with_pruning_impl<Lhs, Rhs, ResultType>(lhs, rhs, res_, tolerance);
     res.swap(res_);
   }
@@ -99,7 +96,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, ColMajo
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
     // we need a col-major matrix to hold the result
     typedef SparseMatrix<typename ResultType::Scalar, ColMajor, typename ResultType::StorageIndex> SparseTemporaryType;
-    SparseTemporaryType res_(res.rows(), res.cols());
+    SparseTemporaryType res_{res.rows(), res.cols()};
     internal::sparse_sparse_product_with_pruning_impl<Lhs, Rhs, SparseTemporaryType>(lhs, rhs, res_, tolerance);
     res = res_;
   }
@@ -110,7 +107,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, RowMajo
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
     // let's transpose the product to get a column x column product
-    remove_all_t<ResultType> res_(res.rows(), res.cols());
+    remove_all_t<ResultType> res_{res.rows(), res.cols()};
     internal::sparse_sparse_product_with_pruning_impl<Rhs, Lhs, ResultType>(rhs, lhs, res_, tolerance);
     res.swap(res_);
   }
diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h
index 33cedaf3eb3..8eedd54c1e5 100644
--- a/Eigen/src/SparseCore/SparseUtil.h
+++ b/Eigen/src/SparseCore/SparseUtil.h
@@ -105,7 +105,7 @@ struct sparse_eval<T, Rows, 1, Flags> {
   typedef SparseVector<Scalar_, ColMajor, StorageIndex_> type;
 };
 
-// TODO this seems almost identical to plain_matrix_type<T, Sparse>
+// TODO: consider unifying with plain_matrix_type<T, Sparse>.
 template <typename T, int Rows, int Cols, int Flags>
 struct sparse_eval {
   typedef typename traits<T>::Scalar Scalar_;
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 3f72a34dabd..a1b89afdf7b 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -140,7 +140,7 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
     return insertBack(inner);
   }
   inline Scalar& insertBack(Index i) {
-    m_data.append(0, i);
+    m_data.append(Scalar(0), i);
     return m_data.value(m_data.size() - 1);
   }
 
@@ -150,7 +150,7 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
     return insertBackUnordered(inner);
   }
   inline Scalar& insertBackUnordered(Index i) {
-    m_data.append(0, i);
+    m_data.append(Scalar(0), i);
     return m_data.value(m_data.size() - 1);
   }
 
@@ -168,7 +168,7 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
 
     Index startId = 0;
     Index p = Index(m_data.size()) - 1;
-    // TODO smart realloc
+    // TODO: implement smart reallocation.
     m_data.resize(p + 2, 1);
 
     while ((p >= startId) && (m_data.index(p) > i)) {
@@ -177,7 +177,7 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
       --p;
     }
     m_data.index(p + 1) = convert_index(i);
-    m_data.value(p + 1) = 0;
+    m_data.value(p + 1) = Scalar(0);
     return m_data.value(p + 1);
   }
 
@@ -354,40 +354,40 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
 
  public:
   /** \internal \deprecated use setZero() and reserve() */
-  EIGEN_DEPRECATED void startFill(Index reserve) {
+  EIGEN_DEPRECATED_WITH_REASON("Use .setZero() and .reserve() instead.") void startFill(Index reserve) {
     setZero();
     m_data.reserve(reserve);
   }
 
   /** \internal \deprecated use insertBack(Index,Index) */
-  EIGEN_DEPRECATED Scalar& fill(Index r, Index c) {
+  EIGEN_DEPRECATED_WITH_REASON("Use .insertBack() instead.") Scalar& fill(Index r, Index c) {
     eigen_assert(r == 0 || c == 0);
     return fill(IsColVector ? r : c);
   }
 
   /** \internal \deprecated use insertBack(Index) */
-  EIGEN_DEPRECATED Scalar& fill(Index i) {
-    m_data.append(0, i);
+  EIGEN_DEPRECATED_WITH_REASON("Use .insertBack() instead.") Scalar& fill(Index i) {
+    m_data.append(Scalar(0), i);
     return m_data.value(m_data.size() - 1);
   }
 
   /** \internal \deprecated use insert(Index,Index) */
-  EIGEN_DEPRECATED Scalar& fillrand(Index r, Index c) {
+  EIGEN_DEPRECATED_WITH_REASON("Use .insert() instead.") Scalar& fillrand(Index r, Index c) {
     eigen_assert(r == 0 || c == 0);
     return fillrand(IsColVector ? r : c);
   }
 
   /** \internal \deprecated use insert(Index) */
-  EIGEN_DEPRECATED Scalar& fillrand(Index i) { return insert(i); }
+  EIGEN_DEPRECATED_WITH_REASON("Use .insert() instead.") Scalar& fillrand(Index i) { return insert(i); }
 
   /** \internal \deprecated use finalize() */
-  EIGEN_DEPRECATED void endFill() {}
+  EIGEN_DEPRECATED_WITH_REASON("Use .finalize() instead.") void endFill() {}
 
   // These two functions were here in the 3.1 release, so let's keep them in case some code rely on them.
   /** \internal \deprecated use data() */
-  EIGEN_DEPRECATED Storage& _data() { return m_data; }
+  EIGEN_DEPRECATED_WITH_REASON("Use .data() instead.") Storage& _data() { return m_data; }
   /** \internal \deprecated use data() */
-  EIGEN_DEPRECATED const Storage& _data() const { return m_data; }
+  EIGEN_DEPRECATED_WITH_REASON("Use .data() instead.") const Storage& _data() const { return m_data; }
 
 #ifdef EIGEN_SPARSEVECTOR_PLUGIN
 #include EIGEN_SPARSEVECTOR_PLUGIN
@@ -487,12 +487,16 @@ class Serializer<SparseVector<Scalar, Options, StorageIndex>, void> {
 
     // Inner indices.
     std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros;
-    memcpy(dest, value.innerIndexPtr(), data_bytes);
+    if (data_bytes != 0) {
+      memcpy(dest, value.innerIndexPtr(), data_bytes);
+    }
     dest += data_bytes;
 
     // Values.
     data_bytes = sizeof(Scalar) * header.num_non_zeros;
-    memcpy(dest, value.valuePtr(), data_bytes);
+    if (data_bytes != 0) {
+      memcpy(dest, value.valuePtr(), data_bytes);
+    }
     dest += data_bytes;
 
     return dest;
@@ -515,13 +519,17 @@ class Serializer<SparseVector<Scalar, Options, StorageIndex>, void> {
     // Inner indices.
     std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros;
     if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
-    memcpy(value.innerIndexPtr(), src, data_bytes);
+    if (data_bytes != 0) {
+      memcpy(value.innerIndexPtr(), src, data_bytes);
+    }
     src += data_bytes;
 
     // Values.
     data_bytes = sizeof(Scalar) * header.num_non_zeros;
     if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
-    memcpy(value.valuePtr(), src, data_bytes);
+    if (data_bytes != 0) {
+      memcpy(value.valuePtr(), src, data_bytes);
+    }
     src += data_bytes;
     return src;
   }
diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h
index 7220beea4f0..d5668980c03 100644
--- a/Eigen/src/SparseCore/SparseView.h
+++ b/Eigen/src/SparseCore/SparseView.h
@@ -75,7 +75,7 @@ class SparseView : public SparseMatrixBase<SparseView<MatrixType> > {
 
 namespace internal {
 
-// TODO find a way to unify the two following variants
+// TODO: find a way to unify the two following variants
 // This is tricky because implementing an inner iterator on top of an IndexBased evaluator is
 // not easy because the evaluators do not expose the sizes of the underlying expression.
 
diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h
index 7753a246aef..f43f0c853f3 100644
--- a/Eigen/src/SparseCore/TriangularSolver.h
+++ b/Eigen/src/SparseCore/TriangularSolver.h
@@ -41,7 +41,7 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Lower, RowMajor> {
           lastVal = it.value();
           lastIndex = it.index();
           if (lastIndex == i) break;
-          tmp -= lastVal * other.coeff(lastIndex, col);
+          tmp = numext::madd<Scalar>(-lastVal, other.coeff(lastIndex, col), tmp);
         }
         if (Mode & UnitDiag)
           other.coeffRef(i, col) = tmp;
@@ -75,7 +75,7 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Upper, RowMajor> {
         } else if (it && it.index() == i)
           ++it;
         for (; it; ++it) {
-          tmp -= it.value() * other.coeff(it.index(), col);
+          tmp = numext::madd<Scalar>(-it.value(), other.coeff(it.index(), col), tmp);
         }
 
         if (Mode & UnitDiag)
@@ -107,7 +107,9 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Lower, ColMajor> {
             tmp /= it.value();
           }
           if (it && it.index() == i) ++it;
-          for (; it; ++it) other.coeffRef(it.index(), col) -= tmp * it.value();
+          for (; it; ++it) {
+            other.coeffRef(it.index(), col) = numext::madd<Scalar>(-tmp, it.value(), other.coeffRef(it.index(), col));
+          }
         }
       }
     }
@@ -128,14 +130,17 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Upper, ColMajor> {
         if (!numext::is_exactly_zero(tmp))  // optimization when other is actually sparse
         {
           if (!(Mode & UnitDiag)) {
-            // TODO replace this by a binary search. make sure the binary search is safe for partially sorted elements
+            // TODO: replace this with a binary search. Make sure the binary search is safe for partially sorted
+            // elements
             LhsIterator it(lhsEval, i);
             while (it && it.index() != i) ++it;
             eigen_assert(it && it.index() == i);
             other.coeffRef(i, col) /= it.value();
           }
           LhsIterator it(lhsEval, i);
-          for (; it && it.index() < i; ++it) other.coeffRef(it.index(), col) -= tmp * it.value();
+          for (; it && it.index() < i; ++it) {
+            other.coeffRef(it.index(), col) = numext::madd<Scalar>(-tmp, it.value(), other.coeffRef(it.index(), col));
+          }
         }
       }
     }
@@ -191,7 +196,7 @@ struct sparse_solve_triangular_sparse_selector<Lhs, Rhs, Mode, UpLo, ColMajor> {
     res.reserve(other.nonZeros());
 
     for (Index col = 0; col < other.cols(); ++col) {
-      // FIXME estimate number of non zeros
+      // FIXME: estimate the number of non-zeros per column for better allocation.
       tempVector.init(.99 /*float(other.col(col).nonZeros())/float(other.rows())*/);
       tempVector.setZero();
       tempVector.restart();
@@ -215,23 +220,22 @@ struct sparse_solve_triangular_sparse_selector<Lhs, Rhs, Mode, UpLo, ColMajor> {
           tempVector.restart();
           if (IsLower) {
             if (it.index() == i) ++it;
-            for (; it; ++it) tempVector.coeffRef(it.index()) -= ci * it.value();
+            for (; it; ++it) {
+              tempVector.coeffRef(it.index()) = numext::madd<Scalar>(-ci, it.value(), tempVector.coeffRef(it.index()));
+            }
           } else {
-            for (; it && it.index() < i; ++it) tempVector.coeffRef(it.index()) -= ci * it.value();
+            for (; it && it.index() < i; ++it) {
+              tempVector.coeffRef(it.index()) = numext::madd<Scalar>(-ci, it.value(), tempVector.coeffRef(it.index()));
+            }
           }
         }
       }
 
-      //       Index count = 0;
-      // FIXME compute a reference value to filter zeros
+      // FIXME: compute a reference value to filter zeros.
       for (typename AmbiVector<Scalar, StorageIndex>::Iterator it(tempVector /*,1e-12*/); it; ++it) {
-        //         ++ count;
-        //         std::cerr << "fill " << it.index() << ", " << col << "\n";
-        //         std::cout << it.value() << "  ";
-        // FIXME use insertBack
+        // FIXME: use insertBack for better performance.
         res.insert(it.index(), col) = it.value();
       }
-      //       std::cout << "tempVector.nonZeros() == " << int(count) << " / " << (other.rows()) << "\n";
     }
     res.finalize();
     other = res.markAsRValue();
@@ -247,17 +251,8 @@ void TriangularViewImpl<ExpressionType, Mode, Sparse>::solveInPlace(SparseMatrix
   eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper | Lower)));
 
-  //   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
-
-  //   typedef std::conditional_t<copy,
-  //     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&> OtherCopy;
-  //   OtherCopy otherCopy(other.derived());
-
   internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(
       derived().nestedExpression(), other.derived());
-
-  //   if (copy)
-  //     other = otherCopy;
 }
 #endif
 
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index cc69a42de0a..c9fcaa0ac83 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -51,7 +51,7 @@ class SparseLUTransposeView : public SparseSolverBase<SparseLUTransposeView<Conj
     eigen_assert(m_sparseLU->info() == Success && "The matrix should be factorized first");
     EIGEN_STATIC_ASSERT((Dest::Flags & RowMajorBit) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
 
-    // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+    // const_cast_derived() is needed to enable aliasing detection when applying the permutations.
     for (Index j = 0; j < B.cols(); ++j) {
       X.col(j) = m_sparseLU->colsPermutation() * B.const_cast_derived().col(j);
     }
@@ -310,7 +310,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
    * \sa compute()
    */
   template <typename Rhs>
-  inline const Solve<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const;
+  inline Solve<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const;
 #endif  // EIGEN_PARSED_BY_DOXYGEN
 
   /** \brief Reports whether previous computation was successful.
@@ -344,7 +344,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
     // on return, X is overwritten by the computed solution
     X.resize(B.rows(), B.cols());
 
-    // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+    // const_cast_derived() is needed to enable aliasing detection when applying the permutations.
     for (Index j = 0; j < B.cols(); ++j) X.col(j) = rowsPermutation() * B.const_cast_derived().col(j);
 
     // Forward substitution with L
@@ -360,7 +360,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
   /** \brief Give the absolute value of the determinant.
    *
    * \returns the absolute value of the determinant of the matrix of which
-   * *this is the QR decomposition.
+   * *this is the LU factorization.
    *
    * \warning a determinant can be very big or small, so for matrices
    * of large enough dimension, there is a risk of overflow/underflow.
@@ -368,7 +368,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
    *
    * \sa logAbsDeterminant(), signDeterminant()
    */
-  Scalar absDeterminant() {
+  Scalar absDeterminant() const {
     using std::abs;
     eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
     // Initialize with the determinant of the row matrix
@@ -389,7 +389,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
   /** \brief Give the natural log of the absolute determinant.
    *
    * \returns the natural log of the absolute value of the determinant of the matrix
-   * of which **this is the QR decomposition
+   * of which *this is the LU factorization
    *
    * \note This method is useful to work around the risk of overflow/underflow that's
    * inherent to the determinant computation.
@@ -420,7 +420,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
    *
    * \sa absDeterminant(), logAbsDeterminant()
    */
-  Scalar signDeterminant() {
+  Scalar signDeterminant() const {
     eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
     // Initialize with the determinant of the row matrix
     Index det = 1;
@@ -446,7 +446,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
    *
    * \sa absDeterminant(), logAbsDeterminant()
    */
-  Scalar determinant() {
+  Scalar determinant() const {
     eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
     // Initialize with the determinant of the row matrix
     Scalar det = Scalar(1.);
@@ -507,7 +507,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
   SparseLU(const SparseLU&);
 };  // End class SparseLU
 
-// Functions needed by the anaysis phase
+// Functions needed by the analysis phase
 /** \brief Compute the column permutation.
  *
  * Compute the column permutation to minimize the fill-in
@@ -535,19 +535,21 @@ void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
   OrderingType ord;
   ord(m_mat, m_perm_c);
 
-  // Apply the permutation to the column of the input  matrix
+  // Apply the permutation to the columns of the input matrix
   if (m_perm_c.size()) {
-    m_mat.uncompress();  // NOTE: The effect of this command is only to create the InnerNonzeros pointers. FIXME : This
-                         // vector is filled but not subsequently used.
-    // Then, permute only the column pointers
+    // Switch to uncompressed mode so innerNonZeroPtr() exists and can be
+    // permuted consistently with outerIndexPtr().
+    // Downstream sparse traversals may also rely on these per-column counts
+    // while m_mat remains uncompressed.
+    m_mat.uncompress();
+    // A compressed column-major input already exposes valid column pointers.
+    // Otherwise snapshot the internal column-major structure before permuting in place.
+    const bool useInputOuterIndex = !MatrixType::IsRowMajor && mat.isCompressed();
     ei_declare_aligned_stack_constructed_variable(
-        StorageIndex, outerIndexPtr, mat.cols() + 1,
-        mat.isCompressed() ? const_cast<StorageIndex*>(mat.outerIndexPtr()) : 0);
-
-    // If the input matrix 'mat' is uncompressed, then the outer-indices do not match the ones of m_mat, and a copy is
-    // thus needed.
-    if (!mat.isCompressed())
-      IndexVector::Map(outerIndexPtr, mat.cols() + 1) = IndexVector::Map(m_mat.outerIndexPtr(), mat.cols() + 1);
+        StorageIndex, outerIndexPtr, m_mat.cols() + 1,
+        useInputOuterIndex ? const_cast<StorageIndex*>(mat.outerIndexPtr()) : 0);
+    if (!useInputOuterIndex)
+      IndexVector::Map(outerIndexPtr, m_mat.cols() + 1) = IndexVector::Map(m_mat.outerIndexPtr(), m_mat.cols() + 1);
 
     // Apply the permutation and compute the nnz per column.
     for (Index i = 0; i < mat.cols(); i++) {
@@ -603,7 +605,7 @@ void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
  * > A->ncol: number of bytes allocated when memory allocation failure occurred, plus A->ncol.
  * If lwork = -1, it is the estimated amount of space needed, plus A->ncol.
  *
- * It seems that A was the name of the matrix in the past.
+ * Note: 'A' in the above description refers to the factored matrix (historical naming from SuperLU).
  *
  * \sa analyzePattern(), compute(), SparseLU(), info(), lastErrorMessage()
  */
@@ -616,24 +618,21 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix) {
   m_isInitialized = true;
 
   // Apply the column permutation computed in analyzepattern()
-  //   m_mat = matrix * m_perm_c.inverse();
   m_mat = matrix;
   if (m_perm_c.size()) {
-    m_mat.uncompress();  // NOTE: The effect of this command is only to create the InnerNonzeros pointers.
-    // Then, permute only the column pointers
-    const StorageIndex* outerIndexPtr;
-    if (matrix.isCompressed())
-      outerIndexPtr = matrix.outerIndexPtr();
-    else {
-      StorageIndex* outerIndexPtr_t = new StorageIndex[matrix.cols() + 1];
-      for (Index i = 0; i <= matrix.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
-      outerIndexPtr = outerIndexPtr_t;
-    }
+    // Switch to uncompressed mode so innerNonZeroPtr() exists and can be
+    // permuted consistently with outerIndexPtr().
+    m_mat.uncompress();
+    const bool useInputOuterIndex = !MatrixType::IsRowMajor && matrix.isCompressed();
+    ei_declare_aligned_stack_constructed_variable(
+        StorageIndex, outerIndexPtr, m_mat.cols() + 1,
+        useInputOuterIndex ? const_cast<StorageIndex*>(matrix.outerIndexPtr()) : 0);
+    if (!useInputOuterIndex)
+      IndexVector::Map(outerIndexPtr, m_mat.cols() + 1) = IndexVector::Map(m_mat.outerIndexPtr(), m_mat.cols() + 1);
     for (Index i = 0; i < matrix.cols(); i++) {
       m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
       m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i + 1] - outerIndexPtr[i];
     }
-    if (!matrix.isCompressed()) delete[] outerIndexPtr;
   } else {  // FIXME This should not be needed if the empty permutation is handled transparently
     m_perm_c.resize(matrix.cols());
     for (StorageIndex i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
@@ -778,11 +777,6 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix) {
         return;
       }
 
-      // Update the determinant of the row permutation matrix
-      // FIXME: the following test is not correct, we should probably take iperm_c into account and pivrow is not
-      // directly the row pivot.
-      if (pivrow != jj) m_detPermR = -m_detPermR;
-
       // Prune columns (0:jj-1) using column jj
       Base::pruneL(jj, m_perm_r.indices(), pivrow, nseg, segrep, repfnz_k, xprune, m_glu);
 
diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h
index 22affd2213c..7acfa5c8fff 100644
--- a/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -128,7 +128,7 @@ Index SparseLUImpl<Scalar, StorageIndex>::expand(VectorType& vec, Index& length,
  * \param n number of columns
  * \param annz number of initial nonzeros in the matrix
  * \param lwork  if lwork=-1, this routine returns an estimated size of the required memory
- * \param glu persistent data to facilitate multiple factors : will be deleted later ??
+ * \param glu persistent data to facilitate multiple factors (may be deleted later).
  * \param fillratio estimated ratio of fill in the factors
  * \param panel_size Size of a panel
  * \return an estimated size of the required memory if lwork = -1; otherwise, return the size of actually allocated
diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index eb1590916cd..98b7348c76c 100644
--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -27,11 +27,7 @@ namespace internal {
  * NOTE : This class corresponds to the SCformat structure in SuperLU
  *
  */
-/* TODO
- * InnerIterator as for sparsematrix
- * SuperInnerIterator to iterate through all supernodes
- * Function for triangular solve
- */
+// TODO: add InnerIterator, SuperInnerIterator, and triangular solve support.
 template <typename Scalar_, typename StorageIndex_>
 class MappedSuperNodalMatrix {
  public:
diff --git a/Eigen/src/SparseLU/SparseLU_column_bmod.h b/Eigen/src/SparseLU/SparseLU_column_bmod.h
index 8435b56228d..014ffc28651 100644
--- a/Eigen/src/SparseLU/SparseLU_column_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_column_bmod.h
@@ -45,7 +45,7 @@ namespace internal {
  * \param dense Store the full representation of the column
  * \param tempv working array
  * \param segrep segment representative ...
- * \param repfnz ??? First nonzero column in each row ???  ...
+ * \param repfnz first nonzero column in each row ...
  * \param fpanelc First column in the current panel
  * \param glu Global LU data.
  * \return 0 - successful return
diff --git a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
index 8df830b0844..86080e4486f 100644
--- a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
+++ b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
@@ -52,7 +52,7 @@ void SparseLUImpl<Scalar, StorageIndex>::heap_relax_snode(const Index n, IndexVe
   IndexVector post;
   internal::treePostorder(StorageIndex(n), et, post);  // Post order etree
   IndexVector inv_post(n + 1);
-  for (StorageIndex i = 0; i < n + 1; ++i) inv_post(post(i)) = i;  // inv_post = post.inverse()???
+  for (StorageIndex i = 0; i < n + 1; ++i) inv_post(post(i)) = i;  // Compute the inverse postorder permutation.
 
   // Renumber etree in postorder
   IndexVector iwork(n);
diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index 505d9829755..397ad8c4dc8 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -118,7 +118,7 @@ void SparseLUImpl<Scalar, StorageIndex>::panel_bmod(const Index m, const Index w
 
         Index isub = lptr + no_zeros;
         Index off = u_rows - segsize;
-        for (Index i = 0; i < off; i++) U(i, u_col) = 0;
+        for (Index i = 0; i < off; i++) U(i, u_col) = Scalar(0);
         for (Index i = 0; i < segsize; i++) {
           Index irow = glu.lsub(isub);
           U(i + off, u_col) = dense_col(irow);
@@ -163,14 +163,14 @@ void SparseLUImpl<Scalar, StorageIndex>::panel_bmod(const Index m, const Index w
         for (Index i = 0; i < segsize; i++) {
           Index irow = glu.lsub(isub++);
           dense_col(irow) = U.coeff(i + off, u_col);
-          U.coeffRef(i + off, u_col) = 0;
+          U.coeffRef(i + off, u_col) = Scalar(0);
         }
 
         // Scatter l into SPA dense[]
         for (Index i = 0; i < nrow; i++) {
           Index irow = glu.lsub(isub++);
           dense_col(irow) -= L.coeff(i, u_col);
-          L.coeffRef(i, u_col) = 0;
+          L.coeffRef(i, u_col) = Scalar(0);
         }
         u_col++;
       }
diff --git a/Eigen/src/SparseLU/SparseLU_panel_dfs.h b/Eigen/src/SparseLU/SparseLU_panel_dfs.h
index df3154845ac..44123968184 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_dfs.h
@@ -136,12 +136,9 @@ void SparseLUImpl<Scalar, StorageIndex>::dfs_kernel(const StorageIndex jj, Index
         //    segment is seen for the first time. (Note that
         //    "repfnz(krep)" may change later.)
         //    Baktrack dfs to its parent
-        if (traits.update_segrep(krep, jj))
-        // if (marker1(krep) < jcol )
-        {
+        if (traits.update_segrep(krep, jj)) {
           segrep(nseg) = krep;
           ++nseg;
-          // marker1(krep) = jj;
         }
 
         kpar = parent(krep);            // Pop recursion, mimic recursion
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 4dc7aa9f8ca..a8665952059 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -241,14 +241,14 @@ class SparseQR : public SparseSolverBase<SparseQR<MatrixType_, OrderingType_> >
    * \sa compute()
    */
   template <typename Rhs>
-  inline const Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const {
+  inline Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const {
     eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
     eigen_assert(this->rows() == B.rows() &&
                  "SparseQR::solve() : invalid number of rows in the right hand side matrix");
     return Solve<SparseQR, Rhs>(*this, B.derived());
   }
   template <typename Rhs>
-  inline const Solve<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const {
+  inline Solve<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const {
     eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
     eigen_assert(this->rows() == B.rows() &&
                  "SparseQR::solve() : invalid number of rows in the right hand side matrix");
@@ -338,8 +338,7 @@ void SparseQR<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
   m_Q.resize(m, diagSize);
 
   // Allocate space for nonzero elements: rough estimation
-  m_R.reserve(2 * mat.nonZeros());  // FIXME Get a more accurate estimation through symbolic factorization with the
-                                    // etree
+  m_R.reserve(2 * mat.nonZeros());  // FIXME: get a tighter bound via symbolic factorization using the etree.
   m_Q.reserve(2 * mat.nonZeros());
   m_hcoeffs.resize(diagSize);
   m_analysisIsok = true;
@@ -375,16 +374,18 @@ void SparseQR<MatrixType, OrderingType>::factorize(const MatrixType& mat) {
     m_isEtreeOk = true;
   }
 
-  m_pmat.uncompress();  // To have the innerNonZeroPtr allocated
+  // Switch to uncompressed mode so innerNonZeroPtr() exists and can be
+  // permuted consistently with outerIndexPtr().
+  m_pmat.uncompress();
 
   // Apply the fill-in reducing permutation lazily:
   {
-    // If the input is row major, copy the original column indices,
-    // otherwise directly use the input matrix
-    //
+    // A compressed column-major input already exposes valid column pointers.
+    // Otherwise snapshot the internal column-major structure before permuting in place.
     IndexVector originalOuterIndicesCpy;
-    const StorageIndex* originalOuterIndices = mat.outerIndexPtr();
-    if (MatrixType::IsRowMajor) {
+    const bool useInputOuterIndices = !MatrixType::IsRowMajor && mat.isCompressed();
+    const StorageIndex* originalOuterIndices = useInputOuterIndices ? mat.outerIndexPtr() : nullptr;
+    if (!useInputOuterIndices) {
       originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(), n + 1);
       originalOuterIndices = originalOuterIndicesCpy.data();
     }
@@ -402,10 +403,10 @@ void SparseQR<MatrixType, OrderingType>::factorize(const MatrixType& mat) {
    */
   RealScalar pivotThreshold;
   if (m_useDefaultThreshold) {
-    RealScalar max2Norm = 0.0;
+    RealScalar max2Norm = RealScalar(0.0);
     for (int j = 0; j < n; j++) max2Norm = numext::maxi(max2Norm, m_pmat.col(j).norm());
     if (max2Norm == RealScalar(0)) max2Norm = RealScalar(1);
-    pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
+    pivotThreshold = RealScalar(20 * (m + n)) * max2Norm * NumTraits<RealScalar>::epsilon();
   } else {
     pivotThreshold = m_threshold;
   }
@@ -498,24 +499,24 @@ void SparseQR<MatrixType, OrderingType>::factorize(const MatrixType& mat) {
     }  // End update current column
 
     Scalar tau = RealScalar(0);
-    RealScalar beta = 0;
+    RealScalar beta = RealScalar(0);
 
     if (nonzeroCol < diagSize) {
       // Compute the Householder reflection that eliminate the current column
-      // FIXME this step should call the Householder module.
+      // FIXME: refactor to use the Householder module's reflector computation.
       Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
 
       // First, the squared norm of Q((col+1):m, col)
-      RealScalar sqrNorm = 0.;
+      RealScalar sqrNorm = RealScalar(0.);
       for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
       if (sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0)) {
         beta = numext::real(c0);
-        tval(Qidx(0)) = 1;
+        tval(Qidx(0)) = Scalar(1);
       } else {
         using std::sqrt;
         beta = sqrt(numext::abs2(c0) + sqrNorm);
         if (numext::real(c0) >= RealScalar(0)) beta = -beta;
-        tval(Qidx(0)) = 1;
+        tval(Qidx(0)) = Scalar(1);
         for (Index itq = 1; itq < nzcolQ; ++itq) tval(Qidx(itq)) /= (c0 - beta);
         tau = numext::conj((beta - c0) / beta);
       }
@@ -627,7 +628,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
 
   const SparseQRType& m_qr;
   const Derived& m_other;
-  bool m_transpose;  // TODO this actually means adjoint
+  bool m_transpose;  // TODO: rename to m_adjoint; this flag controls adjoint application.
 };
 
 template <typename SparseQRType>
@@ -646,14 +647,14 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
   }
   inline Index rows() const { return m_qr.rows(); }
   inline Index cols() const { return m_qr.rows(); }
-  // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment
+  // To use for operations with the transpose of Q. FIXME: currently identical to adjoint(); specialize for complex.
   SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
   const SparseQRType& m_qr;
 };
 
-// TODO this actually represents the adjoint of Q
+// TODO: rename to SparseQRMatrixQAdjointReturnType; this represents the adjoint of Q.
 template <typename SparseQRType>
 struct SparseQRMatrixQTransposeReturnType {
   explicit SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 0c101494ae5..bed414bd18e 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -65,6 +65,24 @@ DECL_GSSVX(z, double, std::complex<double>)
 #ifdef EIGEN_SUPERLU_HAS_ILU
 
 // similarly for the incomplete factorization using gsisx
+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
+#define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
+  extern "C" {                                                                                                         \
+  extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
+                            SuperMatrix *, SuperMatrix *, void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *,      \
+                            FLOATTYPE *, GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *);                         \
+  }                                                                                                                    \
+  inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree,         \
+                             char *equed, FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L, SuperMatrix *U, void *work,      \
+                             int lwork, SuperMatrix *B, SuperMatrix *X, FLOATTYPE *recip_pivot_growth,                 \
+                             FLOATTYPE *rcond, SuperLUStat_t *stats, int *info, KEYTYPE) {                             \
+    mem_usage_t mem_usage;                                                                                             \
+    GlobalLU_t gLU;                                                                                                    \
+    PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L, U, work, lwork, B, X, recip_pivot_growth, rcond,  \
+                  &gLU, &mem_usage, stats, info);                                                                      \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
+  }
+#else  // version < 5.0
 #define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
   extern "C" {                                                                                                         \
   extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
@@ -80,6 +98,7 @@ DECL_GSSVX(z, double, std::complex<double>)
                   &mem_usage, stats, info);                                                                            \
     return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
   }
+#endif
 
 DECL_GSISX(s, float, float)
 DECL_GSISX(c, float, std::complex<float>)
@@ -189,7 +208,7 @@ struct SluMatrix : SuperMatrix {
 
     res.setScalarType<typename MatrixType::Scalar>();
 
-    // FIXME the following is not very accurate
+    // FIXME: the following type mapping is approximate.
     if (int(MatrixType::Flags) & int(Upper)) res.Mtype = SLU_TRU;
     if (int(MatrixType::Flags) & int(Lower)) res.Mtype = SLU_TRL;
 
@@ -240,7 +259,7 @@ struct SluMatrixMapHelper<SparseMatrixBase<Derived> > {
 
     res.setScalarType<typename MatrixType::Scalar>();
 
-    // FIXME the following is not very accurate
+    // FIXME: the following type mapping is approximate.
     if (MatrixType::Flags & Upper) res.Mtype = SLU_TRU;
     if (MatrixType::Flags & Lower) res.Mtype = SLU_TRL;
 
@@ -320,7 +339,7 @@ class SuperLUBase : public SparseSolverBase<Derived> {
     derived().factorize(matrix);
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -454,7 +473,7 @@ class SuperLU : public SuperLUBase<MatrixType_, SuperLU<MatrixType_> > {
 
   ~SuperLU() {}
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -468,7 +487,8 @@ class SuperLU : public SuperLUBase<MatrixType_, SuperLU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
@@ -563,7 +583,7 @@ void SuperLU<MatrixType>::factorize(const MatrixType &a) {
 
   m_extractedDataAreDirty = true;
 
-  // FIXME how to better check for errors ???
+  // FIXME: implement more detailed error checking based on SuperLU info codes.
   m_info = info == 0 ? Success : NumericalIssue;
   m_factorizationIsOk = true;
 }
@@ -762,7 +782,7 @@ class SuperILU : public SuperLUBase<MatrixType_, SuperILU<MatrixType_> > {
 
   ~SuperILU() {}
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -772,7 +792,8 @@ class SuperILU : public SuperLUBase<MatrixType_, SuperILU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been
+   * performed.
    *
    * \sa analyzePattern()
    */
@@ -851,7 +872,7 @@ void SuperILU<MatrixType>::factorize(const MatrixType &a) {
                 &info, Scalar());
   StatFree(&m_sluStat);
 
-  // FIXME how to better check for errors ???
+  // FIXME: implement more detailed error checking based on SuperLU info codes.
   m_info = info == 0 ? Success : NumericalIssue;
   m_factorizationIsOk = true;
 }
diff --git a/Eigen/src/ThreadPool/Barrier.h b/Eigen/src/ThreadPool/Barrier.h
index 8b2f8da7167..be530d0c8b1 100644
--- a/Eigen/src/ThreadPool/Barrier.h
+++ b/Eigen/src/ThreadPool/Barrier.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
diff --git a/Eigen/src/ThreadPool/InternalHeaderCheck.h b/Eigen/src/ThreadPool/InternalHeaderCheck.h
index 5b27ef436ca..4384cc66a2d 100644
--- a/Eigen/src/ThreadPool/InternalHeaderCheck.h
+++ b/Eigen/src/ThreadPool/InternalHeaderCheck.h
@@ -1,4 +1,3 @@
 #ifndef EIGEN_THREADPOOL_MODULE_H
-#error \
-    "Please include unsupported/Eigen/CXX11/ThreadPool instead of including headers inside the src directory directly."
+#error "Please include Eigen/ThreadPool instead of including headers inside the src directory directly."
 #endif
diff --git a/Eigen/src/ThreadPool/NonBlockingThreadPool.h b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
index 44d4b243279..a9f0bebf827 100644
--- a/Eigen/src/ThreadPool/NonBlockingThreadPool.h
+++ b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@@ -433,7 +433,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // If we are shutting down and all worker threads blocked without work,
     // that's we are done.
     blocked_++;
-    // TODO is blocked_ required to be unsigned?
+    // TODO: is blocked_ required to be unsigned?
     if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
       ec_.CancelWait();
       // Almost done, but need to re-check queues.
diff --git a/Eigen/src/ThreadPool/RunQueue.h b/Eigen/src/ThreadPool/RunQueue.h
index 9046b18018a..e419ae7dac9 100644
--- a/Eigen/src/ThreadPool/RunQueue.h
+++ b/Eigen/src/ThreadPool/RunQueue.h
@@ -57,7 +57,7 @@ class RunQueue {
     Elem* e = &array_[front & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
     if (s != kEmpty || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return w;
-    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+    front_.store(front + 1 + (kSize << 1), std::memory_order_release);
     e->w = std::move(w);
     e->state.store(kReady, std::memory_order_release);
     return Work();
@@ -73,7 +73,7 @@ class RunQueue {
     Work w = std::move(e->w);
     e->state.store(kEmpty, std::memory_order_release);
     front = ((front - 1) & kMask2) | (front & ~kMask2);
-    front_.store(front, std::memory_order_relaxed);
+    front_.store(front, std::memory_order_release);
     return w;
   }
 
@@ -86,7 +86,7 @@ class RunQueue {
     uint8_t s = e->state.load(std::memory_order_relaxed);
     if (s != kEmpty || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return w;
     back = ((back - 1) & kMask2) | (back & ~kMask2);
-    back_.store(back, std::memory_order_relaxed);
+    back_.store(back, std::memory_order_release);
     e->w = std::move(w);
     e->state.store(kReady, std::memory_order_release);
     return Work();
@@ -102,7 +102,7 @@ class RunQueue {
     if (s != kReady || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return Work();
     Work w = std::move(e->w);
     e->state.store(kEmpty, std::memory_order_release);
-    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+    back_.store(back + 1 + (kSize << 1), std::memory_order_release);
     return w;
   }
 
@@ -132,7 +132,7 @@ class RunQueue {
       e->state.store(kEmpty, std::memory_order_release);
       n++;
     }
-    if (n != 0) back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+    if (n != 0) back_.store(start + 1 + (kSize << 1), std::memory_order_release);
     return n;
   }
 
diff --git a/Eigen/src/ThreadPool/ThreadLocal.h b/Eigen/src/ThreadPool/ThreadLocal.h
index aa0bd10833d..7e55b08addf 100644
--- a/Eigen/src/ThreadPool/ThreadLocal.h
+++ b/Eigen/src/ThreadPool/ThreadLocal.h
@@ -29,8 +29,7 @@
 #include <Availability.h>
 #include <TargetConditionals.h>
 #endif
-// Checks whether C++11's `thread_local` storage duration specifier is
-// supported.
+// Checks whether the `thread_local` storage duration specifier is supported.
 #if EIGEN_COMP_CLANGAPPLE && \
     ((EIGEN_COMP_CLANGAPPLE < 8000042) || (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
 // Notes: Xcode's clang did not support `thread_local` until version
@@ -90,7 +89,7 @@ struct ThreadLocalNoOpRelease {
 //
 //   Eigen::ThreadLocal<Counter> counter(10);
 //
-//   // Each thread will have access to it's own counter object.
+//   // Each thread will have access to its own counter object.
 //   Counter& cnt = counter.local();
 //   cnt++;
 //
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 3fdcc1fd441..51e4db17724 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -11,7 +11,7 @@
 #define EIGEN_UMFPACKSUPPORT_H
 
 // for compatibility with super old version of umfpack,
-// not sure this is really needed, but this is harmless.
+// this may not be strictly needed, but it is harmless.
 #ifndef SuiteSparse_long
 #ifdef UF_long
 #define SuiteSparse_long UF_long
@@ -381,7 +381,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<MatrixType_> > {
     factorize_impl();
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -425,7 +425,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the pattern analysis has been performed.
    *
    * \sa analyzePattern(), compute()
    */
diff --git a/Eigen/src/misc/RankRevealingBase.h b/Eigen/src/misc/RankRevealingBase.h
new file mode 100644
index 00000000000..1729c912f22
--- /dev/null
+++ b/Eigen/src/misc/RankRevealingBase.h
@@ -0,0 +1,178 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANK_REVEALING_BASE_H
+#define EIGEN_RANK_REVEALING_BASE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \brief CRTP mixin providing threshold management, rank computation, and rank-derived queries
+ *         for rank-revealing decompositions (FullPivLU, ColPivHouseholderQR, FullPivHouseholderQR).
+ *
+ * \tparam Derived the concrete decomposition class (CRTP parameter)
+ *
+ * The derived class must provide:
+ *   - rows(), cols() (inherited from SolverBase)
+ *   - m_isInitialized (bool member, also used by SolverBase)
+ *   - pivotCoeff(Index i) returning the absolute value of the i-th pivot
+ */
+template <typename Derived>
+class RankRevealingBase {
+ public:
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  RankRevealingBase()
+      : m_usePrescribedThreshold(false),
+        m_prescribedThreshold(RealScalar(0)),
+        m_maxpivot(RealScalar(0)),
+        m_nonzero_pivots(0) {}
+
+  /** Allows prescribing a threshold to be used by certain methods, such as rank(),
+   * which need to determine when pivots are to be considered nonzero. This is not used for the
+   * decomposition itself.
+   *
+   * When it needs to get the threshold value, Eigen calls threshold(). By default, this
+   * uses a formula to automatically determine a reasonable threshold.
+   * Once you have called the present method setThreshold(const RealScalar&),
+   * your value is used instead.
+   *
+   * \param threshold The new value to use as the threshold.
+   *
+   * A pivot will be considered nonzero if its absolute value is strictly greater than
+   *  \f$ threshold \times \vert maxpivot \vert \f$
+   * where maxpivot is the biggest pivot.
+   *
+   * If you want to come back to the default behavior, call setThreshold(Default_t)
+   */
+  Derived& setThreshold(const RealScalar& threshold) {
+    m_usePrescribedThreshold = true;
+    m_prescribedThreshold = threshold;
+    return self();
+  }
+
+  /** Restores the default behavior, letting Eigen use its default formula for
+   * determining the threshold.
+   *
+   * You should pass the special object Eigen::Default as parameter here.
+   * \code dec.setThreshold(Eigen::Default); \endcode
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  Derived& setThreshold(Default_t) {
+    m_usePrescribedThreshold = false;
+    return self();
+  }
+
+  /** Returns the threshold that will be used by certain methods such as rank().
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  RealScalar threshold() const {
+    eigen_assert(self().m_isInitialized || m_usePrescribedThreshold);
+    // Higham's backward error bound: ||ΔA||₂ ≤ c·min(m,n)·u·||A||₂.
+    // The factor of 4 covers the constant c.
+    return m_usePrescribedThreshold
+               ? m_prescribedThreshold
+               : NumTraits<Scalar>::epsilon() * RealScalar(4 * (std::min)(self().rows(), self().cols()));
+  }
+
+  /** \returns the rank of the matrix of which *this is the decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index rank() const {
+    using std::abs;
+    eigen_assert(self().m_isInitialized && "Decomposition is not initialized.");
+    RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
+    Index result = 0;
+    for (Index i = 0; i < m_nonzero_pivots; ++i) result += (self().pivotCoeff(i) > premultiplied_threshold);
+    return result;
+  }
+
+  /** \returns the dimension of the kernel of the matrix of which *this is the decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index dimensionOfKernel() const {
+    eigen_assert(self().m_isInitialized && "Decomposition is not initialized.");
+    return self().cols() - rank();
+  }
+
+  /** \returns true if the matrix of which *this is the decomposition represents an injective
+   *          linear map, i.e. has trivial kernel; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInjective() const {
+    eigen_assert(self().m_isInitialized && "Decomposition is not initialized.");
+    return rank() == self().cols();
+  }
+
+  /** \returns true if the matrix of which *this is the decomposition represents a surjective
+   *          linear map; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isSurjective() const {
+    eigen_assert(self().m_isInitialized && "Decomposition is not initialized.");
+    return rank() == self().rows();
+  }
+
+  /** \returns true if the matrix of which *this is the decomposition is invertible.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInvertible() const {
+    eigen_assert(self().m_isInitialized && "Decomposition is not initialized.");
+    return isInjective() && isSurjective();
+  }
+
+  /** \returns the number of nonzero pivots in the decomposition.
+   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
+   * So that notion isn't really intrinsically interesting, but it is
+   * still useful when implementing algorithms.
+   *
+   * \sa rank()
+   */
+  inline Index nonzeroPivots() const {
+    eigen_assert(self().m_isInitialized && "Decomposition is not initialized.");
+    return m_nonzero_pivots;
+  }
+
+  /** \returns the absolute value of the biggest pivot, i.e. the biggest
+   *          diagonal coefficient of U (or R).
+   */
+  RealScalar maxPivot() const { return m_maxpivot; }
+
+ protected:
+  bool m_usePrescribedThreshold;
+  RealScalar m_prescribedThreshold;
+  RealScalar m_maxpivot;
+  Index m_nonzero_pivots;
+
+ private:
+  Derived& self() { return static_cast<Derived&>(*this); }
+  const Derived& self() const { return static_cast<const Derived&>(*this); }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_RANK_REVEALING_BASE_H
diff --git a/Eigen/src/misc/RealSvd2x2.h b/Eigen/src/misc/RealSvd2x2.h
deleted file mode 100644
index 332a5abbcea..00000000000
--- a/Eigen/src/misc/RealSvd2x2.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2013-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_REALSVD2X2_H
-#define EIGEN_REALSVD2X2_H
-
-// IWYU pragma: private
-#include "./InternalHeaderCheck.h"
-
-namespace Eigen {
-
-namespace internal {
-
-template <typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType &matrix, Index p, Index q, JacobiRotation<RealScalar> *j_left,
-                         JacobiRotation<RealScalar> *j_right) {
-  using std::abs;
-  using std::sqrt;
-  Matrix<RealScalar, 2, 2> m;
-  m << numext::real(matrix.coeff(p, p)), numext::real(matrix.coeff(p, q)), numext::real(matrix.coeff(q, p)),
-      numext::real(matrix.coeff(q, q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0, 0) + m.coeff(1, 1);
-  RealScalar d = m.coeff(1, 0) - m.coeff(0, 1);
-
-  if (abs(d) < (std::numeric_limits<RealScalar>::min)()) {
-    rot1.s() = RealScalar(0);
-    rot1.c() = RealScalar(1);
-  } else {
-    // If d!=0, then t/d cannot overflow because the magnitude of the
-    // entries forming d are not too small compared to the ones forming t.
-    RealScalar u = t / d;
-    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = RealScalar(1) / tmp;
-    rot1.c() = u / tmp;
-  }
-  m.applyOnTheLeft(0, 1, rot1);
-  j_right->makeJacobi(m, 0, 1);
-  *j_left = rot1 * j_right->transpose();
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_REALSVD2X2_H
diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
index 94afd502fe7..8f2810d7755 100644
--- a/Eigen/src/misc/lapacke.h
+++ b/Eigen/src/misc/lapacke.h
@@ -26,9 +26,10 @@
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
   THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
-* Contents: Native C interface to LAPACK
+* Contents: Native C interface to LAPACK (subset used by Eigen)
 * Author: Intel Corporation
 * Generated November, 2011
+* Modified: stripped to the subset of declarations actually used by Eigen.
 *****************************************************************************/
 
 #ifndef _MKL_LAPACKE_H_
@@ -36,9 +37,6 @@
 #ifndef _LAPACKE_H_
 #define _LAPACKE_H_
 
-/*
- *  Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
- */
 #ifdef HAVE_LAPACK_CONFIG_H
 #include "lapacke_config.h"
 #endif
@@ -57,24 +55,8 @@
 #define lapack_logical lapack_int
 #endif
 
-/* Complex types are structures equivalent to the
- * Fortran complex types COMPLEX(4) and COMPLEX(8).
- *
- * One can also redefine the types with his own types
- * for example by including in the code definitions like
- *
- * #define lapack_complex_float std::complex<float>
- * #define lapack_complex_double std::complex<double>
- *
- * or define these types in the command line:
- *
- * -Dlapack_complex_float="std::complex<float>"
- * -Dlapack_complex_double="std::complex<double>"
- */
-
 #ifndef LAPACK_COMPLEX_CUSTOM
 
-/* Complex type (single precision) */
 #ifndef lapack_complex_float
 #define lapack_complex_float std::complex<float>
 #endif
@@ -89,7 +71,6 @@
 
 lapack_complex_float lapack_make_complex_float(float re, float im);
 
-/* Complex type (double precision) */
 #ifndef lapack_complex_double
 #define lapack_complex_double std::complex<double>
 #endif
@@ -126,9 +107,7 @@ extern "C" {
 #define LAPACK_WORK_MEMORY_ERROR -1010
 #define LAPACK_TRANSPOSE_MEMORY_ERROR -1011
 
-/* Callback logical functions of one, two, or three arguments are used
- *  to select eigenvalues to sort to the top left of the Schur form.
- *  The value is selected if function returns TRUE (non-zero). */
+/* Callback logical functions used to select eigenvalues for Schur form. */
 
 typedef lapack_logical (*LAPACK_S_SELECT2)(const float*, const float*);
 typedef lapack_logical (*LAPACK_S_SELECT3)(const float*, const float*, const float*);
@@ -145,243 +124,16 @@ typedef lapack_logical (*LAPACK_Z_SELECT2)(const lapack_complex_double*, const l
 #define LAPACK_lsame LAPACK_GLOBAL(lsame, LSAME)
 lapack_logical LAPACK_lsame(char* ca, char* cb, lapack_int lca, lapack_int lcb);
 
-/* C-LAPACK function prototypes */
-
-lapack_int LAPACKE_sbdsdc(int matrix_order, char uplo, char compq, lapack_int n, float* d, float* e, float* u,
-                          lapack_int ldu, float* vt, lapack_int ldvt, float* q, lapack_int* iq);
-lapack_int LAPACKE_dbdsdc(int matrix_order, char uplo, char compq, lapack_int n, double* d, double* e, double* u,
-                          lapack_int ldu, double* vt, lapack_int ldvt, double* q, lapack_int* iq);
-
-lapack_int LAPACKE_sbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
-                          float* d, float* e, float* vt, lapack_int ldvt, float* u, lapack_int ldu, float* c,
-                          lapack_int ldc);
-lapack_int LAPACKE_dbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
-                          double* d, double* e, double* vt, lapack_int ldvt, double* u, lapack_int ldu, double* c,
-                          lapack_int ldc);
-lapack_int LAPACKE_cbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
-                          float* d, float* e, lapack_complex_float* vt, lapack_int ldvt, lapack_complex_float* u,
-                          lapack_int ldu, lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
-                          double* d, double* e, lapack_complex_double* vt, lapack_int ldvt, lapack_complex_double* u,
-                          lapack_int ldu, lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sdisna(char job, lapack_int m, lapack_int n, const float* d, float* sep);
-lapack_int LAPACKE_ddisna(char job, lapack_int m, lapack_int n, const double* d, double* sep);
-
-lapack_int LAPACKE_sgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                          lapack_int ku, float* ab, lapack_int ldab, float* d, float* e, float* q, lapack_int ldq,
-                          float* pt, lapack_int ldpt, float* c, lapack_int ldc);
-lapack_int LAPACKE_dgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                          lapack_int ku, double* ab, lapack_int ldab, double* d, double* e, double* q, lapack_int ldq,
-                          double* pt, lapack_int ldpt, double* c, lapack_int ldc);
-lapack_int LAPACKE_cgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                          lapack_int ku, lapack_complex_float* ab, lapack_int ldab, float* d, float* e,
-                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* pt, lapack_int ldpt,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                          lapack_int ku, lapack_complex_double* ab, lapack_int ldab, double* d, double* e,
-                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* pt, lapack_int ldpt,
-                          lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
-                          lapack_int ldab, const lapack_int* ipiv, float anorm, float* rcond);
-lapack_int LAPACKE_dgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku, const double* ab,
-                          lapack_int ldab, const lapack_int* ipiv, double anorm, double* rcond);
-lapack_int LAPACKE_cgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
-                          const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv, float anorm,
-                          float* rcond);
-lapack_int LAPACKE_zgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
-                          const lapack_complex_double* ab, lapack_int ldab, const lapack_int* ipiv, double anorm,
-                          double* rcond);
-
-lapack_int LAPACKE_sgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
-                          lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_dgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* ab,
-                          lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
-lapack_int LAPACKE_cgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                          const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
-                          float* colcnd, float* amax);
-lapack_int LAPACKE_zgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                          const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
-                          double* colcnd, double* amax);
-
-lapack_int LAPACKE_sgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
-                           lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_dgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* ab,
-                           lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
-lapack_int LAPACKE_cgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                           const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
-                           float* colcnd, float* amax);
-lapack_int LAPACKE_zgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                           const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
-                           double* colcnd, double* amax);
-
-lapack_int LAPACKE_sgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb, const lapack_int* ipiv,
-                          const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_dgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
-                          const lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-lapack_int LAPACKE_cgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* afb,
-                          lapack_int ldafb, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_zgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* afb,
-                          lapack_int ldafb, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_sgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb,
-                           const lapack_int* ipiv, const float* r, const float* c, const float* b, lapack_int ldb,
-                           float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
-                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
-lapack_int LAPACKE_dgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
-                           const lapack_int* ipiv, const double* r, const double* c, const double* b, lapack_int ldb,
-                           double* x, lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
-                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
-lapack_int LAPACKE_cgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
-                           const lapack_complex_float* afb, lapack_int ldafb, const lapack_int* ipiv, const float* r,
-                           const float* c, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                           lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
-                           float* err_bnds_comp, lapack_int nparams, float* params);
-lapack_int LAPACKE_zgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
-                           const lapack_complex_double* afb, lapack_int ldafb, const lapack_int* ipiv, const double* r,
-                           const double* c, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                           lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
-                           double* err_bnds_comp, lapack_int nparams, double* params);
-
-lapack_int LAPACKE_sgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, float* ab,
-                         lapack_int ldab, lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, double* ab,
-                         lapack_int ldab, lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_cgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                         lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_float* b,
-                         lapack_int ldb);
-lapack_int LAPACKE_zgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                         lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_double* b,
-                         lapack_int ldb);
-
-lapack_int LAPACKE_sgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                          lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb, lapack_int* ipiv,
-                          char* equed, float* r, float* c, float* b, lapack_int ldb, float* x, lapack_int ldx,
-                          float* rcond, float* ferr, float* berr, float* rpivot);
-lapack_int LAPACKE_dgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                          lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb, lapack_int* ipiv,
-                          char* equed, double* r, double* c, double* b, lapack_int ldb, double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr, double* rpivot);
-lapack_int LAPACKE_cgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                          lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
-                          lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
-                          lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr,
-                          float* berr, float* rpivot);
-lapack_int LAPACKE_zgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                          lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
-                          lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
-                          lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr, double* rpivot);
-
-lapack_int LAPACKE_sgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb, lapack_int* ipiv,
-                           char* equed, float* r, float* c, float* b, lapack_int ldb, float* x, lapack_int ldx,
-                           float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
-                           float* err_bnds_comp, lapack_int nparams, float* params);
-lapack_int LAPACKE_dgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb,
-                           lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int ldb, double* x,
-                           lapack_int ldx, double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds,
-                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
-lapack_int LAPACKE_cgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
-                           lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
-                           lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw,
-                           float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                           lapack_int nparams, float* params);
-lapack_int LAPACKE_zgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                           lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
-                           lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
-                           lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                           double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
-                           double* err_bnds_comp, lapack_int nparams, double* params);
-
-lapack_int LAPACKE_sgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, float* ab,
-                          lapack_int ldab, lapack_int* ipiv);
-lapack_int LAPACKE_dgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, double* ab,
-                          lapack_int ldab, lapack_int* ipiv);
-lapack_int LAPACKE_cgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                          lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv);
-lapack_int LAPACKE_zgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                          lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv);
-
-lapack_int LAPACKE_sgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const float* ab, lapack_int ldab, const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const double* ab, lapack_int ldab, const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_cgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv,
-                          lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                          const lapack_complex_double* ab, lapack_int ldab, const lapack_int* ipiv,
-                          lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const float* scale, lapack_int m, float* v, lapack_int ldv);
-lapack_int LAPACKE_dgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const double* scale, lapack_int m, double* v, lapack_int ldv);
-lapack_int LAPACKE_cgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const float* scale, lapack_int m, lapack_complex_float* v, lapack_int ldv);
-lapack_int LAPACKE_zgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const double* scale, lapack_int m, lapack_complex_double* v, lapack_int ldv);
-
-lapack_int LAPACKE_sgebal(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, lapack_int* ilo,
-                          lapack_int* ihi, float* scale);
-lapack_int LAPACKE_dgebal(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, lapack_int* ilo,
-                          lapack_int* ihi, double* scale);
-lapack_int LAPACKE_cgebal(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_int* ilo, lapack_int* ihi, float* scale);
-lapack_int LAPACKE_zgebal(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_int* ilo, lapack_int* ihi, double* scale);
-
-lapack_int LAPACKE_sgebrd(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* d, float* e,
-                          float* tauq, float* taup);
-lapack_int LAPACKE_dgebrd(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* d, double* e,
-                          double* tauq, double* taup);
-lapack_int LAPACKE_cgebrd(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          float* d, float* e, lapack_complex_float* tauq, lapack_complex_float* taup);
-lapack_int LAPACKE_zgebrd(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          double* d, double* e, lapack_complex_double* tauq, lapack_complex_double* taup);
-
-lapack_int LAPACKE_sgecon(int matrix_order, char norm, lapack_int n, const float* a, lapack_int lda, float anorm,
-                          float* rcond);
-lapack_int LAPACKE_dgecon(int matrix_order, char norm, lapack_int n, const double* a, lapack_int lda, double anorm,
-                          double* rcond);
-lapack_int LAPACKE_cgecon(int matrix_order, char norm, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                          float anorm, float* rcond);
-lapack_int LAPACKE_zgecon(int matrix_order, char norm, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                          double anorm, double* rcond);
-
-lapack_int LAPACKE_sgeequ(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
-                          float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_dgeequ(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, double* r,
-                          double* c, double* rowcnd, double* colcnd, double* amax);
-lapack_int LAPACKE_cgeequ(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                          float* r, float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_zgeequ(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                          double* r, double* c, double* rowcnd, double* colcnd, double* amax);
-
-lapack_int LAPACKE_sgeequb(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
-                           float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_dgeequb(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, double* r,
-                           double* c, double* rowcnd, double* colcnd, double* amax);
-lapack_int LAPACKE_cgeequb(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                           float* r, float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_zgeequb(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                           double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+/*
+ * LAPACKE function prototypes used by Eigen.
+ *
+ * Only the subset of LAPACKE routines that Eigen actually calls is declared
+ * here.  If you need the full LAPACKE API, include your system's <lapacke.h>
+ * after including Eigen headers, or define EIGEN_LAPACKE_SYSTEM before
+ * including Eigen to use the system header instead of this bundled subset.
+ */
 
+/* Schur decomposition (gees) — used by RealSchur_LAPACKE.h, ComplexSchur_LAPACKE.h */
 lapack_int LAPACKE_sgees(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, lapack_int n, float* a,
                          lapack_int lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int ldvs);
 lapack_int LAPACKE_dgees(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, lapack_int n, double* a,
@@ -393,125 +145,7 @@ lapack_int LAPACKE_zgees(int matrix_order, char jobvs, char sort, LAPACK_Z_SELEC
                          lapack_complex_double* a, lapack_int lda, lapack_int* sdim, lapack_complex_double* w,
                          lapack_complex_double* vs, lapack_int ldvs);
 
-lapack_int LAPACKE_sgeesx(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, char sense, lapack_int n,
-                          float* a, lapack_int lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int ldvs,
-                          float* rconde, float* rcondv);
-lapack_int LAPACKE_dgeesx(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, char sense, lapack_int n,
-                          double* a, lapack_int lda, lapack_int* sdim, double* wr, double* wi, double* vs,
-                          lapack_int ldvs, double* rconde, double* rcondv);
-lapack_int LAPACKE_cgeesx(int matrix_order, char jobvs, char sort, LAPACK_C_SELECT1 select, char sense, lapack_int n,
-                          lapack_complex_float* a, lapack_int lda, lapack_int* sdim, lapack_complex_float* w,
-                          lapack_complex_float* vs, lapack_int ldvs, float* rconde, float* rcondv);
-lapack_int LAPACKE_zgeesx(int matrix_order, char jobvs, char sort, LAPACK_Z_SELECT1 select, char sense, lapack_int n,
-                          lapack_complex_double* a, lapack_int lda, lapack_int* sdim, lapack_complex_double* w,
-                          lapack_complex_double* vs, lapack_int ldvs, double* rconde, double* rcondv);
-
-lapack_int LAPACKE_sgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda, float* wr,
-                         float* wi, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr);
-lapack_int LAPACKE_dgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda, double* wr,
-                         double* wi, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr);
-lapack_int LAPACKE_cgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
-                         lapack_int lda, lapack_complex_float* w, lapack_complex_float* vl, lapack_int ldvl,
-                         lapack_complex_float* vr, lapack_int ldvr);
-lapack_int LAPACKE_zgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
-                         lapack_int lda, lapack_complex_double* w, lapack_complex_double* vl, lapack_int ldvl,
-                         lapack_complex_double* vr, lapack_int ldvr);
-
-lapack_int LAPACKE_sgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, float* a,
-                          lapack_int lda, float* wr, float* wi, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr,
-                          lapack_int* ilo, lapack_int* ihi, float* scale, float* abnrm, float* rconde, float* rcondv);
-lapack_int LAPACKE_dgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, double* a,
-                          lapack_int lda, double* wr, double* wi, double* vl, lapack_int ldvl, double* vr,
-                          lapack_int ldvr, lapack_int* ilo, lapack_int* ihi, double* scale, double* abnrm,
-                          double* rconde, double* rcondv);
-lapack_int LAPACKE_cgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* w, lapack_complex_float* vl,
-                          lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
-                          float* scale, float* abnrm, float* rconde, float* rcondv);
-lapack_int LAPACKE_zgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* w, lapack_complex_double* vl,
-                          lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
-                          double* scale, double* abnrm, double* rconde, double* rcondv);
-
-lapack_int LAPACKE_sgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
-                          float* tau);
-lapack_int LAPACKE_dgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a, lapack_int lda,
-                          double* tau);
-lapack_int LAPACKE_cgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* tau);
-lapack_int LAPACKE_zgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* tau);
-
-lapack_int LAPACKE_sgejsv(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
-                          lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, float* u, lapack_int ldu,
-                          float* v, lapack_int ldv, float* stat, lapack_int* istat);
-lapack_int LAPACKE_dgejsv(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
-                          lapack_int m, lapack_int n, double* a, lapack_int lda, double* sva, double* u, lapack_int ldu,
-                          double* v, lapack_int ldv, double* stat, lapack_int* istat);
-
-lapack_int LAPACKE_sgelq2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
-lapack_int LAPACKE_dgelq2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
-lapack_int LAPACKE_cgelq2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_zgelq2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* tau);
-
-lapack_int LAPACKE_sgelqf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
-lapack_int LAPACKE_dgelqf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
-lapack_int LAPACKE_cgelqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_zgelqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* tau);
-
-lapack_int LAPACKE_sgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, float* a,
-                         lapack_int lda, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, double* a,
-                         lapack_int lda, double* b, lapack_int ldb);
-lapack_int LAPACKE_cgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
-                         lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
-                         lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                          float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank);
-lapack_int LAPACKE_dgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                          double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank);
-lapack_int LAPACKE_cgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
-                          lapack_int* rank);
-lapack_int LAPACKE_zgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
-                          lapack_int* rank);
-
-lapack_int LAPACKE_sgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                          float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank);
-lapack_int LAPACKE_dgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                          double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank);
-lapack_int LAPACKE_cgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
-                          lapack_int* rank);
-lapack_int LAPACKE_zgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
-                          lapack_int* rank);
-
-lapack_int LAPACKE_sgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                          float* b, lapack_int ldb, lapack_int* jpvt, float rcond, lapack_int* rank);
-lapack_int LAPACKE_dgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                          double* b, lapack_int ldb, lapack_int* jpvt, double rcond, lapack_int* rank);
-lapack_int LAPACKE_cgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_int* jpvt, float rcond,
-                          lapack_int* rank);
-lapack_int LAPACKE_zgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_int* jpvt, double rcond,
-                          lapack_int* rank);
-
-lapack_int LAPACKE_sgeqlf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
-lapack_int LAPACKE_dgeqlf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
-lapack_int LAPACKE_cgeqlf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_zgeqlf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* tau);
-
+/* QR with column pivoting (geqp3) — used by ColPivHouseholderQR_LAPACKE.h */
 lapack_int LAPACKE_sgeqp3(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
                           float* tau);
 lapack_int LAPACKE_dgeqp3(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* jpvt,
@@ -521,22 +155,7 @@ lapack_int LAPACKE_cgeqp3(int matrix_order, lapack_int m, lapack_int n, lapack_c
 lapack_int LAPACKE_zgeqp3(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
                           lapack_int* jpvt, lapack_complex_double* tau);
 
-lapack_int LAPACKE_sgeqpf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
-                          float* tau);
-lapack_int LAPACKE_dgeqpf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* jpvt,
-                          double* tau);
-lapack_int LAPACKE_cgeqpf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_int* jpvt, lapack_complex_float* tau);
-lapack_int LAPACKE_zgeqpf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_int* jpvt, lapack_complex_double* tau);
-
-lapack_int LAPACKE_sgeqr2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
-lapack_int LAPACKE_dgeqr2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
-lapack_int LAPACKE_cgeqr2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_zgeqr2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* tau);
-
+/* QR factorization (geqrf) — used by HouseholderQR_LAPACKE.h */
 lapack_int LAPACKE_sgeqrf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
 lapack_int LAPACKE_dgeqrf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
 lapack_int LAPACKE_cgeqrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
@@ -544,58 +163,7 @@ lapack_int LAPACKE_cgeqrf(int matrix_order, lapack_int m, lapack_int n, lapack_c
 lapack_int LAPACKE_zgeqrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
                           lapack_complex_double* tau);
 
-lapack_int LAPACKE_sgeqrfp(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
-lapack_int LAPACKE_dgeqrfp(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
-lapack_int LAPACKE_cgeqrfp(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                           lapack_complex_float* tau);
-lapack_int LAPACKE_zgeqrfp(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                           lapack_complex_double* tau);
-
-lapack_int LAPACKE_sgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
-                          const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b, lapack_int ldb,
-                          float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_dgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
-                          const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* b, lapack_int ldb,
-                          double* x, lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_cgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf, const lapack_int* ipiv,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_zgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf, const lapack_int* ipiv,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-
-lapack_int LAPACKE_sgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs, const float* a,
-                           lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* r,
-                           const float* c, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
-                           float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                           lapack_int nparams, float* params);
-lapack_int LAPACKE_dgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs, const double* a,
-                           lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* r,
-                           const double* c, const double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
-                           double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                           lapack_int nparams, double* params);
-lapack_int LAPACKE_cgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                           lapack_int ldaf, const lapack_int* ipiv, const float* r, const float* c,
-                           const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                           float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                           lapack_int nparams, float* params);
-lapack_int LAPACKE_zgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                           lapack_int ldaf, const lapack_int* ipiv, const double* r, const double* c,
-                           const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                           double* rcond, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
-                           double* err_bnds_comp, lapack_int nparams, double* params);
-
-lapack_int LAPACKE_sgerqf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
-lapack_int LAPACKE_dgerqf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
-lapack_int LAPACKE_cgerqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_zgerqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* tau);
-
+/* SVD via divide-and-conquer (gesdd) — used by BDCSVD_LAPACKE.h */
 lapack_int LAPACKE_sgesdd(int matrix_order, char jobz, lapack_int m, lapack_int n, float* a, lapack_int lda, float* s,
                           float* u, lapack_int ldu, float* vt, lapack_int ldvt);
 lapack_int LAPACKE_dgesdd(int matrix_order, char jobz, lapack_int m, lapack_int n, double* a, lapack_int lda, double* s,
@@ -607,20 +175,7 @@ lapack_int LAPACKE_zgesdd(int matrix_order, char jobz, lapack_int m, lapack_int
                           lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
                           lapack_complex_double* vt, lapack_int ldvt);
 
-lapack_int LAPACKE_sgesv(int matrix_order, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, lapack_int* ipiv,
-                         float* b, lapack_int ldb);
-lapack_int LAPACKE_dgesv(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, lapack_int* ipiv,
-                         double* b, lapack_int ldb);
-lapack_int LAPACKE_cgesv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* a, lapack_int lda,
-                         lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgesv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a, lapack_int lda,
-                         lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-lapack_int LAPACKE_dsgesv(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, lapack_int* ipiv,
-                          double* b, lapack_int ldb, double* x, lapack_int ldx, lapack_int* iter);
-lapack_int LAPACKE_zcgesv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a, lapack_int lda,
-                          lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                          lapack_int ldx, lapack_int* iter);
-
+/* SVD (gesvd) — used by JacobiSVD_LAPACKE.h */
 lapack_int LAPACKE_sgesvd(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, float* a, lapack_int lda,
                           float* s, float* u, lapack_int ldu, float* vt, lapack_int ldvt, float* superb);
 lapack_int LAPACKE_dgesvd(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, double* a,
@@ -633,60 +188,7 @@ lapack_int LAPACKE_zgesvd(int matrix_order, char jobu, char jobvt, lapack_int m,
                           lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
                           lapack_complex_double* vt, lapack_int ldvt, double* superb);
 
-lapack_int LAPACKE_sgesvj(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a,
-                          lapack_int lda, float* sva, lapack_int mv, float* v, lapack_int ldv, float* stat);
-lapack_int LAPACKE_dgesvj(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, double* a,
-                          lapack_int lda, double* sva, lapack_int mv, double* v, lapack_int ldv, double* stat);
-
-lapack_int LAPACKE_sgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
-                          lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r, float* c,
-                          float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
-                          float* rpivot);
-lapack_int LAPACKE_dgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
-                          lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
-                          double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr,
-                          double* berr, double* rpivot);
-lapack_int LAPACKE_cgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                          lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
-                          float* rpivot);
-lapack_int LAPACKE_zgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                          lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
-                          double* rpivot);
-
-lapack_int LAPACKE_sgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
-                           lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r,
-                           float* c, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw,
-                           float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                           lapack_int nparams, float* params);
-lapack_int LAPACKE_dgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
-                           lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
-                           double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
-                           double* rpvgrw, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
-                           double* err_bnds_comp, lapack_int nparams, double* params);
-lapack_int LAPACKE_cgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                           lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int ldb,
-                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
-                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                           float* params);
-lapack_int LAPACKE_zgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                           lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b,
-                           lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw,
-                           double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                           lapack_int nparams, double* params);
-
-lapack_int LAPACKE_sgetf2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv);
-lapack_int LAPACKE_dgetf2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv);
-lapack_int LAPACKE_cgetf2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_int* ipiv);
-lapack_int LAPACKE_zgetf2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_int* ipiv);
-
+/* LU factorization (getrf) — used by PartialPivLU_LAPACKE.h */
 lapack_int LAPACKE_sgetrf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv);
 lapack_int LAPACKE_dgetrf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv);
 lapack_int LAPACKE_cgetrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
@@ -694,9387 +196,19 @@ lapack_int LAPACKE_cgetrf(int matrix_order, lapack_int m, lapack_int n, lapack_c
 lapack_int LAPACKE_zgetrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
                           lapack_int* ipiv);
 
-lapack_int LAPACKE_sgetri(int matrix_order, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv);
-lapack_int LAPACKE_dgetri(int matrix_order, lapack_int n, double* a, lapack_int lda, const lapack_int* ipiv);
-lapack_int LAPACKE_cgetri(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          const lapack_int* ipiv);
-lapack_int LAPACKE_zgetri(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          const lapack_int* ipiv);
-
-lapack_int LAPACKE_sgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
-                          const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
-                          const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_cgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const float* lscale, const float* rscale, lapack_int m, float* v, lapack_int ldv);
-lapack_int LAPACKE_dggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const double* lscale, const double* rscale, lapack_int m, double* v, lapack_int ldv);
-lapack_int LAPACKE_cggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const float* lscale, const float* rscale, lapack_int m, lapack_complex_float* v,
-                          lapack_int ldv);
-lapack_int LAPACKE_zggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          const double* lscale, const double* rscale, lapack_int m, lapack_complex_double* v,
-                          lapack_int ldv);
-
-lapack_int LAPACKE_sggbal(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb,
-                          lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale);
-lapack_int LAPACKE_dggbal(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, double* b,
-                          lapack_int ldb, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale);
-lapack_int LAPACKE_cggbal(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi, float* lscale,
-                          float* rscale);
-lapack_int LAPACKE_zggbal(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi, double* lscale,
-                          double* rscale);
-
-lapack_int LAPACKE_sgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg, lapack_int n,
-                         float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int* sdim, float* alphar,
-                         float* alphai, float* beta, float* vsl, lapack_int ldvsl, float* vsr, lapack_int ldvsr);
-lapack_int LAPACKE_dgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg, lapack_int n,
-                         double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int* sdim, double* alphar,
-                         double* alphai, double* beta, double* vsl, lapack_int ldvsl, double* vsr, lapack_int ldvsr);
-lapack_int LAPACKE_cgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg, lapack_int n,
-                         lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                         lapack_int* sdim, lapack_complex_float* alpha, lapack_complex_float* beta,
-                         lapack_complex_float* vsl, lapack_int ldvsl, lapack_complex_float* vsr, lapack_int ldvsr);
-lapack_int LAPACKE_zgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg, lapack_int n,
-                         lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                         lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
-                         lapack_complex_double* vsl, lapack_int ldvsl, lapack_complex_double* vsr, lapack_int ldvsr);
-
-lapack_int LAPACKE_sggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg, char sense,
-                          lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int* sdim,
-                          float* alphar, float* alphai, float* beta, float* vsl, lapack_int ldvsl, float* vsr,
-                          lapack_int ldvsr, float* rconde, float* rcondv);
-lapack_int LAPACKE_dggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg, char sense,
-                          lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int* sdim,
-                          double* alphar, double* alphai, double* beta, double* vsl, lapack_int ldvsl, double* vsr,
-                          lapack_int ldvsr, double* rconde, double* rcondv);
-lapack_int LAPACKE_cggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg, char sense,
-                          lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
-                          lapack_int ldb, lapack_int* sdim, lapack_complex_float* alpha, lapack_complex_float* beta,
-                          lapack_complex_float* vsl, lapack_int ldvsl, lapack_complex_float* vsr, lapack_int ldvsr,
-                          float* rconde, float* rcondv);
-lapack_int LAPACKE_zggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg, char sense,
-                          lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
-                          lapack_int ldb, lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
-                          lapack_complex_double* vsl, lapack_int ldvsl, lapack_complex_double* vsr, lapack_int ldvsr,
-                          double* rconde, double* rcondv);
-
-lapack_int LAPACKE_sggev(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda, float* b,
-                         lapack_int ldb, float* alphar, float* alphai, float* beta, float* vl, lapack_int ldvl,
-                         float* vr, lapack_int ldvr);
-lapack_int LAPACKE_dggev(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda, double* b,
-                         lapack_int ldb, double* alphar, double* alphai, double* beta, double* vl, lapack_int ldvl,
-                         double* vr, lapack_int ldvr);
-lapack_int LAPACKE_cggev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
-                         lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
-                         lapack_complex_float* beta, lapack_complex_float* vl, lapack_int ldvl,
-                         lapack_complex_float* vr, lapack_int ldvr);
-lapack_int LAPACKE_zggev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
-                         lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
-                         lapack_complex_double* beta, lapack_complex_double* vl, lapack_int ldvl,
-                         lapack_complex_double* vr, lapack_int ldvr);
-
-lapack_int LAPACKE_sggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, float* a,
-                          lapack_int lda, float* b, lapack_int ldb, float* alphar, float* alphai, float* beta,
-                          float* vl, lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
-                          float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde, float* rcondv);
-lapack_int LAPACKE_dggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, double* a,
-                          lapack_int lda, double* b, lapack_int ldb, double* alphar, double* alphai, double* beta,
-                          double* vl, lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
-                          double* lscale, double* rscale, double* abnrm, double* bbnrm, double* rconde, double* rcondv);
-lapack_int LAPACKE_cggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vl,
-                          lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
-                          float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde, float* rcondv);
-lapack_int LAPACKE_zggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* vl,
-                          lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
-                          double* lscale, double* rscale, double* abnrm, double* bbnrm, double* rconde, double* rcondv);
-
-lapack_int LAPACKE_sggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
-                          float* b, lapack_int ldb, float* d, float* x, float* y);
-lapack_int LAPACKE_dggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
-                          double* b, lapack_int ldb, double* d, double* x, double* y);
-lapack_int LAPACKE_cggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* d,
-                          lapack_complex_float* x, lapack_complex_float* y);
-lapack_int LAPACKE_zggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* d,
-                          lapack_complex_double* x, lapack_complex_double* y);
-
-lapack_int LAPACKE_sgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          float* a, lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_dgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          double* a, lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq, double* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_cgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_sgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, float* a, lapack_int lda,
-                          float* b, lapack_int ldb, float* c, float* d, float* x);
-lapack_int LAPACKE_dgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, double* a, lapack_int lda,
-                          double* b, lapack_int ldb, double* c, double* d, double* x);
-lapack_int LAPACKE_cgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* c,
-                          lapack_complex_float* d, lapack_complex_float* x);
-lapack_int LAPACKE_zgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* c,
-                          lapack_complex_double* d, lapack_complex_double* x);
-
-lapack_int LAPACKE_sggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
-                          float* taua, float* b, lapack_int ldb, float* taub);
-lapack_int LAPACKE_dggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
-                          double* taua, double* b, lapack_int ldb, double* taub);
-lapack_int LAPACKE_cggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* taub);
-lapack_int LAPACKE_zggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* taub);
-
-lapack_int LAPACKE_sggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, float* a, lapack_int lda,
-                          float* taua, float* b, lapack_int ldb, float* taub);
-lapack_int LAPACKE_dggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, double* a, lapack_int lda,
-                          double* taua, double* b, lapack_int ldb, double* taub);
-lapack_int LAPACKE_cggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* taub);
-lapack_int LAPACKE_zggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* taub);
-
-lapack_int LAPACKE_sggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
-                          lapack_int* k, lapack_int* l, float* a, lapack_int lda, float* b, lapack_int ldb,
-                          float* alpha, float* beta, float* u, lapack_int ldu, float* v, lapack_int ldv, float* q,
-                          lapack_int ldq, lapack_int* iwork);
-lapack_int LAPACKE_dggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
-                          lapack_int* k, lapack_int* l, double* a, lapack_int lda, double* b, lapack_int ldb,
-                          double* alpha, double* beta, double* u, lapack_int ldu, double* v, lapack_int ldv, double* q,
-                          lapack_int ldq, lapack_int* iwork);
-lapack_int LAPACKE_cggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
-                          lapack_int* k, lapack_int* l, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* b, lapack_int ldb, float* alpha, float* beta, lapack_complex_float* u,
-                          lapack_int ldu, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* q,
-                          lapack_int ldq, lapack_int* iwork);
-lapack_int LAPACKE_zggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
-                          lapack_int* k, lapack_int* l, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* b, lapack_int ldb, double* alpha, double* beta,
-                          lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv,
-                          lapack_complex_double* q, lapack_int ldq, lapack_int* iwork);
-
-lapack_int LAPACKE_sggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          float* a, lapack_int lda, float* b, lapack_int ldb, float tola, float tolb, lapack_int* k,
-                          lapack_int* l, float* u, lapack_int ldu, float* v, lapack_int ldv, float* q, lapack_int ldq);
-lapack_int LAPACKE_dggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          double* a, lapack_int lda, double* b, lapack_int ldb, double tola, double tolb, lapack_int* k,
-                          lapack_int* l, double* u, lapack_int ldu, double* v, lapack_int ldv, double* q,
-                          lapack_int ldq);
-lapack_int LAPACKE_cggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float tola,
-                          float tolb, lapack_int* k, lapack_int* l, lapack_complex_float* u, lapack_int ldu,
-                          lapack_complex_float* v, lapack_int ldv, lapack_complex_float* q, lapack_int ldq);
-lapack_int LAPACKE_zggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                          double tola, double tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u,
-                          lapack_int ldu, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* q,
-                          lapack_int ldq);
-
-lapack_int LAPACKE_sgtcon(char norm, lapack_int n, const float* dl, const float* d, const float* du, const float* du2,
-                          const lapack_int* ipiv, float anorm, float* rcond);
-lapack_int LAPACKE_dgtcon(char norm, lapack_int n, const double* dl, const double* d, const double* du,
-                          const double* du2, const lapack_int* ipiv, double anorm, double* rcond);
-lapack_int LAPACKE_cgtcon(char norm, lapack_int n, const lapack_complex_float* dl, const lapack_complex_float* d,
-                          const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
-                          float anorm, float* rcond);
-lapack_int LAPACKE_zgtcon(char norm, lapack_int n, const lapack_complex_double* dl, const lapack_complex_double* d,
-                          const lapack_complex_double* du, const lapack_complex_double* du2, const lapack_int* ipiv,
-                          double anorm, double* rcond);
-
-lapack_int LAPACKE_sgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl, const float* d,
-                          const float* du, const float* dlf, const float* df, const float* duf, const float* du2,
-                          const lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr,
-                          float* berr);
-lapack_int LAPACKE_dgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
-                          const double* d, const double* du, const double* dlf, const double* df, const double* duf,
-                          const double* du2, const lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
-                          lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_cgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* dl,
-                          const lapack_complex_float* d, const lapack_complex_float* du,
-                          const lapack_complex_float* dlf, const lapack_complex_float* df,
-                          const lapack_complex_float* duf, const lapack_complex_float* du2, const lapack_int* ipiv,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_zgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* dl,
-                          const lapack_complex_double* d, const lapack_complex_double* du,
-                          const lapack_complex_double* dlf, const lapack_complex_double* df,
-                          const lapack_complex_double* duf, const lapack_complex_double* du2, const lapack_int* ipiv,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-
-lapack_int LAPACKE_sgtsv(int matrix_order, lapack_int n, lapack_int nrhs, float* dl, float* d, float* du, float* b,
-                         lapack_int ldb);
-lapack_int LAPACKE_dgtsv(int matrix_order, lapack_int n, lapack_int nrhs, double* dl, double* d, double* du, double* b,
-                         lapack_int ldb);
-lapack_int LAPACKE_cgtsv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* dl,
-                         lapack_complex_float* d, lapack_complex_float* du, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgtsv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* dl,
-                         lapack_complex_double* d, lapack_complex_double* du, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const float* dl,
-                          const float* d, const float* du, float* dlf, float* df, float* duf, float* du2,
-                          lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_dgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const double* dl,
-                          const double* d, const double* du, double* dlf, double* df, double* duf, double* du2,
-                          lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
-                          double* ferr, double* berr);
-lapack_int LAPACKE_cgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* dl, const lapack_complex_float* d, const lapack_complex_float* du,
-                          lapack_complex_float* dlf, lapack_complex_float* df, lapack_complex_float* duf,
-                          lapack_complex_float* du2, lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* dl, const lapack_complex_double* d,
-                          const lapack_complex_double* du, lapack_complex_double* dlf, lapack_complex_double* df,
-                          lapack_complex_double* duf, lapack_complex_double* du2, lapack_int* ipiv,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_sgttrf(lapack_int n, float* dl, float* d, float* du, float* du2, lapack_int* ipiv);
-lapack_int LAPACKE_dgttrf(lapack_int n, double* dl, double* d, double* du, double* du2, lapack_int* ipiv);
-lapack_int LAPACKE_cgttrf(lapack_int n, lapack_complex_float* dl, lapack_complex_float* d, lapack_complex_float* du,
-                          lapack_complex_float* du2, lapack_int* ipiv);
-lapack_int LAPACKE_zgttrf(lapack_int n, lapack_complex_double* dl, lapack_complex_double* d, lapack_complex_double* du,
-                          lapack_complex_double* du2, lapack_int* ipiv);
-
-lapack_int LAPACKE_sgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl, const float* d,
-                          const float* du, const float* du2, const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
-                          const double* d, const double* du, const double* du2, const lapack_int* ipiv, double* b,
-                          lapack_int ldb);
-lapack_int LAPACKE_cgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* dl,
-                          const lapack_complex_float* d, const lapack_complex_float* du,
-                          const lapack_complex_float* du2, const lapack_int* ipiv, lapack_complex_float* b,
-                          lapack_int ldb);
-lapack_int LAPACKE_zgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* dl,
-                          const lapack_complex_double* d, const lapack_complex_double* du,
-                          const lapack_complex_double* du2, const lapack_int* ipiv, lapack_complex_double* b,
-                          lapack_int ldb);
-
-lapack_int LAPACKE_chbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
-                         lapack_int ldab, float* w, lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zhbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, lapack_complex_double* ab,
-                         lapack_int ldab, double* w, lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_chbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
-                          lapack_int ldab, float* w, lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zhbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
-                          lapack_complex_double* ab, lapack_int ldab, double* w, lapack_complex_double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_chbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
-                          lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* q, lapack_int ldq, float vl,
-                          float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                          lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_zhbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
-                          lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* q, lapack_int ldq,
-                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_chbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                          lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* bb, lapack_int ldbb,
-                          lapack_complex_float* x, lapack_int ldx);
-lapack_int LAPACKE_zhbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                          lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* bb, lapack_int ldbb,
-                          lapack_complex_double* x, lapack_int ldx);
-
-lapack_int LAPACKE_chbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                         lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb, float* w,
-                         lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zhbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                         lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
-                         double* w, lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_chbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                          lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb,
-                          float* w, lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zhbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                          lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
-                          double* w, lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_chbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                          lapack_int kb, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb,
-                          lapack_int ldbb, lapack_complex_float* q, lapack_int ldq, float vl, float vu, lapack_int il,
-                          lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int ldz,
-                          lapack_int* ifail);
-lapack_int LAPACKE_zhbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                          lapack_int kb, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb,
-                          lapack_int ldbb, lapack_complex_double* q, lapack_int ldq, double vl, double vu,
-                          lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_chbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
-                          lapack_int ldab, float* d, float* e, lapack_complex_float* q, lapack_int ldq);
-lapack_int LAPACKE_zhbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd,
-                          lapack_complex_double* ab, lapack_int ldab, double* d, double* e, lapack_complex_double* q,
-                          lapack_int ldq);
-
-lapack_int LAPACKE_checon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                          const lapack_int* ipiv, float anorm, float* rcond);
-lapack_int LAPACKE_zhecon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                          const lapack_int* ipiv, double anorm, double* rcond);
-
-lapack_int LAPACKE_cheequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                           float* s, float* scond, float* amax);
-lapack_int LAPACKE_zheequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                           double* s, double* scond, double* amax);
-
-lapack_int LAPACKE_cheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                         float* w);
-lapack_int LAPACKE_zheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                         double* w);
-
-lapack_int LAPACKE_cheevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          float* w);
-lapack_int LAPACKE_zheevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a,
-                          lapack_int lda, double* w);
-
-lapack_int LAPACKE_cheevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_float* a,
-                          lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
-                          float* w, lapack_complex_float* z, lapack_int ldz, lapack_int* isuppz);
-lapack_int LAPACKE_zheevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_double* a,
-                          lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
-                          lapack_int* m, double* w, lapack_complex_double* z, lapack_int ldz, lapack_int* isuppz);
-
-lapack_int LAPACKE_cheevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_float* a,
-                          lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
-                          float* w, lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_zheevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_double* a,
-                          lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
-                          lapack_int* m, double* w, lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_chegst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhegst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chegv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                         lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* w);
-lapack_int LAPACKE_zhegv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                         lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* w);
-
-lapack_int LAPACKE_chegvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* w);
-lapack_int LAPACKE_zhegvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                          double* w);
-
-lapack_int LAPACKE_chegvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float vl,
-                          float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                          lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_zhegvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb, double vl,
-                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_cherfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf, const lapack_int* ipiv,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_zherfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf, const lapack_int* ipiv,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-
-lapack_int LAPACKE_cherfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                           lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
-                           lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
-                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                           float* params);
-lapack_int LAPACKE_zherfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                           lapack_int ldaf, const lapack_int* ipiv, const double* s, const lapack_complex_double* b,
-                           lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
-                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                           double* params);
-
-lapack_int LAPACKE_chesv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                         lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhesv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                         lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chesvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                          lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                          lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zhesvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                          lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                          lapack_int ldx, double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_chesvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                           lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
-                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
-                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                           float* params);
-lapack_int LAPACKE_zhesvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                           lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
-                           lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                           double* params);
-
-lapack_int LAPACKE_chetrd(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda, float* d,
-                          float* e, lapack_complex_float* tau);
-lapack_int LAPACKE_zhetrd(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          double* d, double* e, lapack_complex_double* tau);
-
-lapack_int LAPACKE_chetrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_int* ipiv);
-lapack_int LAPACKE_zhetrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_int* ipiv);
-
-lapack_int LAPACKE_chetri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          const lapack_int* ipiv);
-lapack_int LAPACKE_zhetri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          const lapack_int* ipiv);
-
-lapack_int LAPACKE_chetrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhetrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, float alpha,
-                         const lapack_complex_float* a, lapack_int lda, float beta, lapack_complex_float* c);
-lapack_int LAPACKE_zhfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, double alpha,
-                         const lapack_complex_double* a, lapack_int lda, double beta, lapack_complex_double* c);
-
-lapack_int LAPACKE_shgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, float* h, lapack_int ldh, float* t, lapack_int ldt, float* alphar,
-                          float* alphai, float* beta, float* q, lapack_int ldq, float* z, lapack_int ldz);
-lapack_int LAPACKE_dhgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, double* h, lapack_int ldh, double* t, lapack_int ldt, double* alphar,
-                          double* alphai, double* beta, double* q, lapack_int ldq, double* z, lapack_int ldz);
-lapack_int LAPACKE_chgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, lapack_complex_float* h, lapack_int ldh, lapack_complex_float* t,
-                          lapack_int ldt, lapack_complex_float* alpha, lapack_complex_float* beta,
-                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zhgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, lapack_complex_double* h, lapack_int ldh, lapack_complex_double* t,
-                          lapack_int ldt, lapack_complex_double* alpha, lapack_complex_double* beta,
-                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_chpcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                          const lapack_int* ipiv, float anorm, float* rcond);
-lapack_int LAPACKE_zhpcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                          const lapack_int* ipiv, double anorm, double* rcond);
-
-lapack_int LAPACKE_chpev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
-                         lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zhpev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap, double* w,
-                         lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_chpevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
-                          lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zhpevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap, double* w,
-                          lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_chpevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_float* ap,
-                          float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                          lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_zhpevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_double* ap,
-                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_chpgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* ap,
-                          const lapack_complex_float* bp);
-lapack_int LAPACKE_zhpgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* ap,
-                          const lapack_complex_double* bp);
-
-lapack_int LAPACKE_chpgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                         lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
-                         lapack_int ldz);
-lapack_int LAPACKE_zhpgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                         lapack_complex_double* ap, lapack_complex_double* bp, double* w, lapack_complex_double* z,
-                         lapack_int ldz);
-
-lapack_int LAPACKE_chpgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                          lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_zhpgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                          lapack_complex_double* ap, lapack_complex_double* bp, double* w, lapack_complex_double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_chpgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                          lapack_complex_float* ap, lapack_complex_float* bp, float vl, float vu, lapack_int il,
-                          lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int ldz,
-                          lapack_int* ifail);
-lapack_int LAPACKE_zhpgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                          lapack_complex_double* ap, lapack_complex_double* bp, double vl, double vu, lapack_int il,
-                          lapack_int iu, double abstol, lapack_int* m, double* w, lapack_complex_double* z,
-                          lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_chprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
-                          const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
-                          lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_zhprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
-                          const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
-                          lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_chpsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
-                         lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhpsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
-                         lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chpsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zhpsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_chptrd(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, float* d, float* e,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_zhptrd(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, double* d, double* e,
-                          lapack_complex_double* tau);
-
-lapack_int LAPACKE_chptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
-lapack_int LAPACKE_zhptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
-
-lapack_int LAPACKE_chptri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, const lapack_int* ipiv);
-lapack_int LAPACKE_zhptri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, const lapack_int* ipiv);
-
-lapack_int LAPACKE_chptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
-                          const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
-                          const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_shsein(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select, lapack_int n,
-                          const float* h, lapack_int ldh, float* wr, const float* wi, float* vl, lapack_int ldvl,
-                          float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_int* ifaill,
-                          lapack_int* ifailr);
-lapack_int LAPACKE_dhsein(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select, lapack_int n,
-                          const double* h, lapack_int ldh, double* wr, const double* wi, double* vl, lapack_int ldvl,
-                          double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_int* ifaill,
-                          lapack_int* ifailr);
-lapack_int LAPACKE_chsein(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
-                          lapack_int n, const lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w,
-                          lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
-                          lapack_int mm, lapack_int* m, lapack_int* ifaill, lapack_int* ifailr);
-lapack_int LAPACKE_zhsein(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
-                          lapack_int n, const lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w,
-                          lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr,
-                          lapack_int mm, lapack_int* m, lapack_int* ifaill, lapack_int* ifailr);
-
-lapack_int LAPACKE_shseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          float* h, lapack_int ldh, float* wr, float* wi, float* z, lapack_int ldz);
-lapack_int LAPACKE_dhseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          double* h, lapack_int ldh, double* wr, double* wi, double* z, lapack_int ldz);
-lapack_int LAPACKE_chseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w, lapack_complex_float* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_zhseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                          lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w, lapack_complex_double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_clacgv(lapack_int n, lapack_complex_float* x, lapack_int incx);
-lapack_int LAPACKE_zlacgv(lapack_int n, lapack_complex_double* x, lapack_int incx);
-
-lapack_int LAPACKE_slacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const float* a, lapack_int lda,
-                          float* b, lapack_int ldb);
-lapack_int LAPACKE_dlacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const double* a, lapack_int lda,
-                          double* b, lapack_int ldb);
-lapack_int LAPACKE_clacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zlacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_zlag2c(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_float* sa, lapack_int ldsa);
-
-lapack_int LAPACKE_slag2d(int matrix_order, lapack_int m, lapack_int n, const float* sa, lapack_int ldsa, double* a,
-                          lapack_int lda);
-
-lapack_int LAPACKE_dlag2s(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, float* sa,
-                          lapack_int ldsa);
-
-lapack_int LAPACKE_clag2z(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* sa, lapack_int ldsa,
-                          lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_slagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* d,
-                          float* a, lapack_int lda, lapack_int* iseed);
-lapack_int LAPACKE_dlagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* d,
-                          double* a, lapack_int lda, lapack_int* iseed);
-lapack_int LAPACKE_clagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* d,
-                          lapack_complex_float* a, lapack_int lda, lapack_int* iseed);
-lapack_int LAPACKE_zlagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* d,
-                          lapack_complex_double* a, lapack_int lda, lapack_int* iseed);
-
-float LAPACKE_slamch(char cmach);
-double LAPACKE_dlamch(char cmach);
-
-float LAPACKE_slange(int matrix_order, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda);
-double LAPACKE_dlange(int matrix_order, char norm, lapack_int m, lapack_int n, const double* a, lapack_int lda);
-float LAPACKE_clange(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_float* a,
-                     lapack_int lda);
-double LAPACKE_zlange(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_double* a,
-                      lapack_int lda);
-
-float LAPACKE_clanhe(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
-                     lapack_int lda);
-double LAPACKE_zlanhe(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
-                      lapack_int lda);
-
-float LAPACKE_slansy(int matrix_order, char norm, char uplo, lapack_int n, const float* a, lapack_int lda);
-double LAPACKE_dlansy(int matrix_order, char norm, char uplo, lapack_int n, const double* a, lapack_int lda);
-float LAPACKE_clansy(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
-                     lapack_int lda);
-double LAPACKE_zlansy(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
-                      lapack_int lda);
-
-float LAPACKE_slantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n, const float* a,
-                     lapack_int lda);
-double LAPACKE_dlantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n, const double* a,
-                      lapack_int lda);
-float LAPACKE_clantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
-                     const lapack_complex_float* a, lapack_int lda);
-double LAPACKE_zlantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
-                      const lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_slarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, const float* v, lapack_int ldv, const float* t, lapack_int ldt, float* c,
-                          lapack_int ldc);
-lapack_int LAPACKE_dlarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, const double* v, lapack_int ldv, const double* t, lapack_int ldt, double* c,
-                          lapack_int ldc);
-lapack_int LAPACKE_clarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* t,
-                          lapack_int ldt, lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zlarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, const lapack_complex_double* v, lapack_int ldv, const lapack_complex_double* t,
-                          lapack_int ldt, lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_slarfg(lapack_int n, float* alpha, float* x, lapack_int incx, float* tau);
-lapack_int LAPACKE_dlarfg(lapack_int n, double* alpha, double* x, lapack_int incx, double* tau);
-lapack_int LAPACKE_clarfg(lapack_int n, lapack_complex_float* alpha, lapack_complex_float* x, lapack_int incx,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_zlarfg(lapack_int n, lapack_complex_double* alpha, lapack_complex_double* x, lapack_int incx,
-                          lapack_complex_double* tau);
-
-lapack_int LAPACKE_slarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const float* v,
-                          lapack_int ldv, const float* tau, float* t, lapack_int ldt);
-lapack_int LAPACKE_dlarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const double* v,
-                          lapack_int ldv, const double* tau, double* t, lapack_int ldt);
-lapack_int LAPACKE_clarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
-                          const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* tau,
-                          lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_zlarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
-                          const lapack_complex_double* v, lapack_int ldv, const lapack_complex_double* tau,
-                          lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_slarfx(int matrix_order, char side, lapack_int m, lapack_int n, const float* v, float tau, float* c,
-                          lapack_int ldc, float* work);
-lapack_int LAPACKE_dlarfx(int matrix_order, char side, lapack_int m, lapack_int n, const double* v, double tau,
-                          double* c, lapack_int ldc, double* work);
-lapack_int LAPACKE_clarfx(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_float* v,
-                          lapack_complex_float tau, lapack_complex_float* c, lapack_int ldc,
-                          lapack_complex_float* work);
-lapack_int LAPACKE_zlarfx(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_double* v,
-                          lapack_complex_double tau, lapack_complex_double* c, lapack_int ldc,
-                          lapack_complex_double* work);
-
-lapack_int LAPACKE_slarnv(lapack_int idist, lapack_int* iseed, lapack_int n, float* x);
-lapack_int LAPACKE_dlarnv(lapack_int idist, lapack_int* iseed, lapack_int n, double* x);
-lapack_int LAPACKE_clarnv(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_float* x);
-lapack_int LAPACKE_zlarnv(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_double* x);
-
-lapack_int LAPACKE_slaset(int matrix_order, char uplo, lapack_int m, lapack_int n, float alpha, float beta, float* a,
-                          lapack_int lda);
-lapack_int LAPACKE_dlaset(int matrix_order, char uplo, lapack_int m, lapack_int n, double alpha, double beta, double* a,
-                          lapack_int lda);
-lapack_int LAPACKE_claset(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_float alpha,
-                          lapack_complex_float beta, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zlaset(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_double alpha,
-                          lapack_complex_double beta, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_slasrt(char id, lapack_int n, float* d);
-lapack_int LAPACKE_dlasrt(char id, lapack_int n, double* d);
-
-lapack_int LAPACKE_slaswp(int matrix_order, lapack_int n, float* a, lapack_int lda, lapack_int k1, lapack_int k2,
-                          const lapack_int* ipiv, lapack_int incx);
-lapack_int LAPACKE_dlaswp(int matrix_order, lapack_int n, double* a, lapack_int lda, lapack_int k1, lapack_int k2,
-                          const lapack_int* ipiv, lapack_int incx);
-lapack_int LAPACKE_claswp(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int k1,
-                          lapack_int k2, const lapack_int* ipiv, lapack_int incx);
-lapack_int LAPACKE_zlaswp(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int k1,
-                          lapack_int k2, const lapack_int* ipiv, lapack_int incx);
-
-lapack_int LAPACKE_slatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                          float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku, char pack,
-                          float* a, lapack_int lda);
-lapack_int LAPACKE_dlatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                          double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku, char pack,
-                          double* a, lapack_int lda);
-lapack_int LAPACKE_clatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                          float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku, char pack,
-                          lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zlatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                          double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku, char pack,
-                          lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_slauum(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
-lapack_int LAPACKE_dlauum(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
-lapack_int LAPACKE_clauum(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zlauum(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_sopgtr(int matrix_order, char uplo, lapack_int n, const float* ap, const float* tau, float* q,
-                          lapack_int ldq);
-lapack_int LAPACKE_dopgtr(int matrix_order, char uplo, lapack_int n, const double* ap, const double* tau, double* q,
-                          lapack_int ldq);
-
-lapack_int LAPACKE_sopmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const float* ap, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dopmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const double* ap, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sorgbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, float* a,
-                          lapack_int lda, const float* tau);
-lapack_int LAPACKE_dorgbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, double* a,
-                          lapack_int lda, const double* tau);
-
-lapack_int LAPACKE_sorghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
-                          const float* tau);
-lapack_int LAPACKE_dorghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a, lapack_int lda,
-                          const double* tau);
-
-lapack_int LAPACKE_sorglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                          const float* tau);
-lapack_int LAPACKE_dorglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                          const double* tau);
-
-lapack_int LAPACKE_sorgql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                          const float* tau);
-lapack_int LAPACKE_dorgql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                          const double* tau);
-
-lapack_int LAPACKE_sorgqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                          const float* tau);
-lapack_int LAPACKE_dorgqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                          const double* tau);
-
-lapack_int LAPACKE_sorgrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                          const float* tau);
-lapack_int LAPACKE_dorgrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                          const double* tau);
-
-lapack_int LAPACKE_sorgtr(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const float* tau);
-lapack_int LAPACKE_dorgtr(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau);
-
-lapack_int LAPACKE_sormbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sormhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, const double* a, lapack_int lda, const double* tau, double* c,
-                          lapack_int ldc);
-
-lapack_int LAPACKE_sormlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sormql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sormqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sormrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sormrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          lapack_int l, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          lapack_int l, const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sormtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
-lapack_int LAPACKE_dormtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
-
-lapack_int LAPACKE_spbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab, lapack_int ldab,
-                          float anorm, float* rcond);
-lapack_int LAPACKE_dpbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab, lapack_int ldab,
-                          double anorm, double* rcond);
-lapack_int LAPACKE_cpbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
-                          lapack_int ldab, float anorm, float* rcond);
-lapack_int LAPACKE_zpbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_double* ab,
-                          lapack_int ldab, double anorm, double* rcond);
-
-lapack_int LAPACKE_spbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab, lapack_int ldab,
-                          float* s, float* scond, float* amax);
-lapack_int LAPACKE_dpbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab, lapack_int ldab,
-                          double* s, double* scond, double* amax);
-lapack_int LAPACKE_cpbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
-                          lapack_int ldab, float* s, float* scond, float* amax);
-lapack_int LAPACKE_zpbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_double* ab,
-                          lapack_int ldab, double* s, double* scond, double* amax);
-
-lapack_int LAPACKE_spbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const float* ab,
-                          lapack_int ldab, const float* afb, lapack_int ldafb, const float* b, lapack_int ldb, float* x,
-                          lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_dpbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const double* ab,
-                          lapack_int ldab, const double* afb, lapack_int ldafb, const double* b, lapack_int ldb,
-                          double* x, lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_cpbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          const lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* afb,
-                          lapack_int ldafb, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                          lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_zpbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          const lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* afb,
-                          lapack_int ldafb, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                          lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_spbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, float* bb, lapack_int ldbb);
-lapack_int LAPACKE_dpbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, double* bb, lapack_int ldbb);
-lapack_int LAPACKE_cpbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_float* bb,
-                          lapack_int ldbb);
-lapack_int LAPACKE_zpbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_double* bb,
-                          lapack_int ldbb);
-
-lapack_int LAPACKE_spbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, float* ab,
-                         lapack_int ldab, float* b, lapack_int ldb);
-lapack_int LAPACKE_dpbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, double* ab,
-                         lapack_int ldab, double* b, lapack_int ldb);
-lapack_int LAPACKE_cpbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                         lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                         lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_spbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          float* ab, lapack_int ldab, float* afb, lapack_int ldafb, char* equed, float* s, float* b,
-                          lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_dpbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          double* ab, lapack_int ldab, double* afb, lapack_int ldafb, char* equed, double* s, double* b,
-                          lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr);
-lapack_int LAPACKE_cpbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb, lapack_int ldafb,
-                          char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                          lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zpbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb, lapack_int ldafb,
-                          char* equed, double* s, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                          lapack_int ldx, double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_spbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, float* ab, lapack_int ldab);
-lapack_int LAPACKE_dpbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, double* ab, lapack_int ldab);
-lapack_int LAPACKE_cpbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
-                          lapack_int ldab);
-lapack_int LAPACKE_zpbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_double* ab,
-                          lapack_int ldab);
-
-lapack_int LAPACKE_spbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const float* ab,
-                          lapack_int ldab, float* b, lapack_int ldb);
-lapack_int LAPACKE_dpbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const double* ab,
-                          lapack_int ldab, double* b, lapack_int ldb);
-lapack_int LAPACKE_cpbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          const lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                          const lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_spftrf(int matrix_order, char transr, char uplo, lapack_int n, float* a);
-lapack_int LAPACKE_dpftrf(int matrix_order, char transr, char uplo, lapack_int n, double* a);
-lapack_int LAPACKE_cpftrf(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
-lapack_int LAPACKE_zpftrf(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
-
-lapack_int LAPACKE_spftri(int matrix_order, char transr, char uplo, lapack_int n, float* a);
-lapack_int LAPACKE_dpftri(int matrix_order, char transr, char uplo, lapack_int n, double* a);
-lapack_int LAPACKE_cpftri(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
-lapack_int LAPACKE_zpftri(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
-
-lapack_int LAPACKE_spftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                          float* b, lapack_int ldb);
-lapack_int LAPACKE_dpftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                          double* b, lapack_int ldb);
-lapack_int LAPACKE_cpftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* a, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* a, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_spocon(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float anorm,
-                          float* rcond);
-lapack_int LAPACKE_dpocon(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double anorm,
-                          double* rcond);
-lapack_int LAPACKE_cpocon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                          float anorm, float* rcond);
-lapack_int LAPACKE_zpocon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                          double anorm, double* rcond);
-
-lapack_int LAPACKE_spoequ(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
-                          float* amax);
-lapack_int LAPACKE_dpoequ(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s, double* scond,
-                          double* amax);
-lapack_int LAPACKE_cpoequ(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
-                          float* scond, float* amax);
-lapack_int LAPACKE_zpoequ(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda, double* s,
-                          double* scond, double* amax);
-
-lapack_int LAPACKE_spoequb(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
-                           float* amax);
-lapack_int LAPACKE_dpoequb(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s, double* scond,
-                           double* amax);
-lapack_int LAPACKE_cpoequb(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
-                           float* scond, float* amax);
-lapack_int LAPACKE_zpoequb(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda, double* s,
-                           double* scond, double* amax);
-
-lapack_int LAPACKE_sporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
-                          const float* af, lapack_int ldaf, const float* b, lapack_int ldb, float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_dporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
-                          const double* af, lapack_int ldaf, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-lapack_int LAPACKE_cporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_zporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-
-lapack_int LAPACKE_sporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
-                           lapack_int lda, const float* af, lapack_int ldaf, const float* s, const float* b,
-                           lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
-                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
-lapack_int LAPACKE_dporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
-                           lapack_int lda, const double* af, lapack_int ldaf, const double* s, const double* b,
-                           lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
-                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                           double* params);
-lapack_int LAPACKE_cporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                           lapack_int ldaf, const float* s, const lapack_complex_float* b, lapack_int ldb,
-                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
-                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
-lapack_int LAPACKE_zporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                           lapack_int ldaf, const double* s, const lapack_complex_double* b, lapack_int ldb,
-                           lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
-                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
-
-lapack_int LAPACKE_sposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, float* b,
-                         lapack_int ldb);
-lapack_int LAPACKE_dposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                         double* b, lapack_int ldb);
-lapack_int LAPACKE_cposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                         lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                         lapack_int lda, lapack_complex_double* b, lapack_int ldb);
-lapack_int LAPACKE_dsposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                          double* b, lapack_int ldb, double* x, lapack_int ldx, lapack_int* iter);
-lapack_int LAPACKE_zcposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                          lapack_int ldx, lapack_int* iter);
-
-lapack_int LAPACKE_sposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
-                          lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b, lapack_int ldb,
-                          float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_dposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
-                          lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
-                          lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr);
-lapack_int LAPACKE_cposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                          char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                          lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                          char* equed, double* s, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                          lapack_int ldx, double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_sposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
-                           lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b, lapack_int ldb,
-                           float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
-                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
-lapack_int LAPACKE_dposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
-                           lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
-                           lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                           double* params);
-lapack_int LAPACKE_cposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                           char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                           lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
-                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
-lapack_int LAPACKE_zposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                           char* equed, double* s, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                           lapack_int ldx, double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds,
-                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
-
+/* Cholesky factorization (potrf) — used by LLT_LAPACKE.h */
 lapack_int LAPACKE_spotrf(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
 lapack_int LAPACKE_dpotrf(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
 lapack_int LAPACKE_cpotrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
 lapack_int LAPACKE_zpotrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
 
-lapack_int LAPACKE_spotri(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
-lapack_int LAPACKE_dpotri(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
-lapack_int LAPACKE_cpotri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zpotri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_spotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
-                          float* b, lapack_int ldb);
-lapack_int LAPACKE_dpotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
-                          double* b, lapack_int ldb);
-lapack_int LAPACKE_cpotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sppcon(int matrix_order, char uplo, lapack_int n, const float* ap, float anorm, float* rcond);
-lapack_int LAPACKE_dppcon(int matrix_order, char uplo, lapack_int n, const double* ap, double anorm, double* rcond);
-lapack_int LAPACKE_cppcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float anorm,
-                          float* rcond);
-lapack_int LAPACKE_zppcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double anorm,
-                          double* rcond);
-
-lapack_int LAPACKE_sppequ(int matrix_order, char uplo, lapack_int n, const float* ap, float* s, float* scond,
-                          float* amax);
-lapack_int LAPACKE_dppequ(int matrix_order, char uplo, lapack_int n, const double* ap, double* s, double* scond,
-                          double* amax);
-lapack_int LAPACKE_cppequ(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float* s,
-                          float* scond, float* amax);
-lapack_int LAPACKE_zppequ(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double* s,
-                          double* scond, double* amax);
-
-lapack_int LAPACKE_spprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, const float* afp,
-                          const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_dpprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                          const double* afp, const double* b, lapack_int ldb, double* x, lapack_int ldx, double* ferr,
-                          double* berr);
-lapack_int LAPACKE_cpprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
-                          const lapack_complex_float* afp, const lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_zpprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
-                          const lapack_complex_double* afp, const lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_sppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, float* b,
-                         lapack_int ldb);
-lapack_int LAPACKE_dppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, double* b,
-                         lapack_int ldb);
-lapack_int LAPACKE_cppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
-                         lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
-                         lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* ap, float* afp,
-                          char* equed, float* s, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_dppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* ap,
-                          double* afp, char* equed, double* s, double* b, lapack_int ldb, double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-lapack_int LAPACKE_cppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          lapack_complex_float* ap, lapack_complex_float* afp, char* equed, float* s,
-                          lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          lapack_complex_double* ap, lapack_complex_double* afp, char* equed, double* s,
-                          lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_spptrf(int matrix_order, char uplo, lapack_int n, float* ap);
-lapack_int LAPACKE_dpptrf(int matrix_order, char uplo, lapack_int n, double* ap);
-lapack_int LAPACKE_cpptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
-lapack_int LAPACKE_zpptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
-
-lapack_int LAPACKE_spptri(int matrix_order, char uplo, lapack_int n, float* ap);
-lapack_int LAPACKE_dpptri(int matrix_order, char uplo, lapack_int n, double* ap);
-lapack_int LAPACKE_cpptri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
-lapack_int LAPACKE_zpptri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
-
-lapack_int LAPACKE_spptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, float* b,
-                          lapack_int ldb);
-lapack_int LAPACKE_dpptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap, double* b,
-                          lapack_int ldb);
-lapack_int LAPACKE_cpptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
-                          lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
-                          lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_spstrf(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* piv,
-                          lapack_int* rank, float tol);
-lapack_int LAPACKE_dpstrf(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* piv,
-                          lapack_int* rank, double tol);
-lapack_int LAPACKE_cpstrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_int* piv, lapack_int* rank, float tol);
-lapack_int LAPACKE_zpstrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_int* piv, lapack_int* rank, double tol);
-
-lapack_int LAPACKE_sptcon(lapack_int n, const float* d, const float* e, float anorm, float* rcond);
-lapack_int LAPACKE_dptcon(lapack_int n, const double* d, const double* e, double anorm, double* rcond);
-lapack_int LAPACKE_cptcon(lapack_int n, const float* d, const lapack_complex_float* e, float anorm, float* rcond);
-lapack_int LAPACKE_zptcon(lapack_int n, const double* d, const lapack_complex_double* e, double anorm, double* rcond);
-
-lapack_int LAPACKE_spteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
-lapack_int LAPACKE_dpteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
-lapack_int LAPACKE_cpteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_zpteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, lapack_complex_double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_sptrfs(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e,
-                          const float* df, const float* ef, const float* b, lapack_int ldb, float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_dptrfs(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e,
-                          const double* df, const double* ef, const double* b, lapack_int ldb, double* x,
-                          lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_cptrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
-                          const lapack_complex_float* e, const float* df, const lapack_complex_float* ef,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_zptrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
-                          const lapack_complex_double* e, const double* df, const lapack_complex_double* ef,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-
-lapack_int LAPACKE_sptsv(int matrix_order, lapack_int n, lapack_int nrhs, float* d, float* e, float* b, lapack_int ldb);
-lapack_int LAPACKE_dptsv(int matrix_order, lapack_int n, lapack_int nrhs, double* d, double* e, double* b,
-                         lapack_int ldb);
-lapack_int LAPACKE_cptsv(int matrix_order, lapack_int n, lapack_int nrhs, float* d, lapack_complex_float* e,
-                         lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zptsv(int matrix_order, lapack_int n, lapack_int nrhs, double* d, lapack_complex_double* e,
-                         lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d, const float* e,
-                          float* df, float* ef, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_dptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d, const double* e,
-                          double* df, double* ef, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-lapack_int LAPACKE_cptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d,
-                          const lapack_complex_float* e, float* df, lapack_complex_float* ef,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d,
-                          const lapack_complex_double* e, double* df, lapack_complex_double* ef,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_spttrf(lapack_int n, float* d, float* e);
-lapack_int LAPACKE_dpttrf(lapack_int n, double* d, double* e);
-lapack_int LAPACKE_cpttrf(lapack_int n, float* d, lapack_complex_float* e);
-lapack_int LAPACKE_zpttrf(lapack_int n, double* d, lapack_complex_double* e);
-
-lapack_int LAPACKE_spttrs(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e, float* b,
-                          lapack_int ldb);
-lapack_int LAPACKE_dpttrs(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e, double* b,
-                          lapack_int ldb);
-lapack_int LAPACKE_cpttrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
-                          const lapack_complex_float* e, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpttrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
-                          const lapack_complex_double* e, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_ssbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
-                         lapack_int ldab, float* w, float* z, lapack_int ldz);
-lapack_int LAPACKE_dsbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
-                         lapack_int ldab, double* w, double* z, lapack_int ldz);
-
-lapack_int LAPACKE_ssbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
-                          lapack_int ldab, float* w, float* z, lapack_int ldz);
-lapack_int LAPACKE_dsbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
-                          lapack_int ldab, double* w, double* z, lapack_int ldz);
-
-lapack_int LAPACKE_ssbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd, float* ab,
-                          lapack_int ldab, float* q, lapack_int ldq, float vl, float vu, lapack_int il, lapack_int iu,
-                          float abstol, lapack_int* m, float* w, float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_dsbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd, double* ab,
-                          lapack_int ldab, double* q, lapack_int ldq, double vl, double vu, lapack_int il,
-                          lapack_int iu, double abstol, lapack_int* m, double* w, double* z, lapack_int ldz,
-                          lapack_int* ifail);
-
-lapack_int LAPACKE_ssbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb, float* ab,
-                          lapack_int ldab, const float* bb, lapack_int ldbb, float* x, lapack_int ldx);
-lapack_int LAPACKE_dsbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                          double* ab, lapack_int ldab, const double* bb, lapack_int ldbb, double* x, lapack_int ldx);
-
-lapack_int LAPACKE_ssbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb, float* ab,
-                         lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z, lapack_int ldz);
-lapack_int LAPACKE_dsbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb, double* ab,
-                         lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z, lapack_int ldz);
-
-lapack_int LAPACKE_ssbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb, float* ab,
-                          lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z, lapack_int ldz);
-lapack_int LAPACKE_dsbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                          double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_ssbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                          lapack_int kb, float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* q,
-                          lapack_int ldq, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
-                          float* w, float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_dsbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                          lapack_int kb, double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* q,
-                          lapack_int ldq, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
-                          lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_ssbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, float* ab,
-                          lapack_int ldab, float* d, float* e, float* q, lapack_int ldq);
-lapack_int LAPACKE_dsbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, double* ab,
-                          lapack_int ldab, double* d, double* e, double* q, lapack_int ldq);
-
-lapack_int LAPACKE_ssfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, float alpha,
-                         const float* a, lapack_int lda, float beta, float* c);
-lapack_int LAPACKE_dsfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, double alpha,
-                         const double* a, lapack_int lda, double beta, double* c);
-
-lapack_int LAPACKE_sspcon(int matrix_order, char uplo, lapack_int n, const float* ap, const lapack_int* ipiv,
-                          float anorm, float* rcond);
-lapack_int LAPACKE_dspcon(int matrix_order, char uplo, lapack_int n, const double* ap, const lapack_int* ipiv,
-                          double anorm, double* rcond);
-lapack_int LAPACKE_cspcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                          const lapack_int* ipiv, float anorm, float* rcond);
-lapack_int LAPACKE_zspcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                          const lapack_int* ipiv, double anorm, double* rcond);
-
-lapack_int LAPACKE_sspev(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
-                         lapack_int ldz);
-lapack_int LAPACKE_dspev(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
-                         lapack_int ldz);
-
-lapack_int LAPACKE_sspevd(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_dspevd(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_sspevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* ap, float vl,
-                          float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
-                          lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_dspevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* ap, double vl,
-                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
-                          lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_sspgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* ap, const float* bp);
-lapack_int LAPACKE_dspgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* ap, const double* bp);
-
-lapack_int LAPACKE_sspgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap, float* bp,
-                         float* w, float* z, lapack_int ldz);
-lapack_int LAPACKE_dspgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap, double* bp,
-                         double* w, double* z, lapack_int ldz);
-
-lapack_int LAPACKE_sspgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap, float* bp,
-                          float* w, float* z, lapack_int ldz);
-lapack_int LAPACKE_dspgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap,
-                          double* bp, double* w, double* z, lapack_int ldz);
-
-lapack_int LAPACKE_sspgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n, float* ap,
-                          float* bp, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
-                          float* w, float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_dspgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                          double* ap, double* bp, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
-                          lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_ssprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, const float* afp,
-                          const lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr,
-                          float* berr);
-lapack_int LAPACKE_dsprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                          const double* afp, const lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
-                          lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_csprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
-                          const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
-                          lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_zsprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
-                          const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
-                          lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_sspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, lapack_int* ipiv,
-                         float* b, lapack_int ldb);
-lapack_int LAPACKE_dspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, lapack_int* ipiv,
-                         double* b, lapack_int ldb);
-lapack_int LAPACKE_cspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
-                         lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
-                         lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
-                          float* afp, lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx,
-                          float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_dspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                          double* afp, lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-lapack_int LAPACKE_cspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_ssptrd(int matrix_order, char uplo, lapack_int n, float* ap, float* d, float* e, float* tau);
-lapack_int LAPACKE_dsptrd(int matrix_order, char uplo, lapack_int n, double* ap, double* d, double* e, double* tau);
-
-lapack_int LAPACKE_ssptrf(int matrix_order, char uplo, lapack_int n, float* ap, lapack_int* ipiv);
-lapack_int LAPACKE_dsptrf(int matrix_order, char uplo, lapack_int n, double* ap, lapack_int* ipiv);
-lapack_int LAPACKE_csptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
-lapack_int LAPACKE_zsptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
-
-lapack_int LAPACKE_ssptri(int matrix_order, char uplo, lapack_int n, float* ap, const lapack_int* ipiv);
-lapack_int LAPACKE_dsptri(int matrix_order, char uplo, lapack_int n, double* ap, const lapack_int* ipiv);
-lapack_int LAPACKE_csptri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, const lapack_int* ipiv);
-lapack_int LAPACKE_zsptri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, const lapack_int* ipiv);
-
-lapack_int LAPACKE_ssptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
-                          const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dsptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                          const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_csptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
-                          const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zsptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
-                          const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sstebz(char range, char order, lapack_int n, float vl, float vu, lapack_int il, lapack_int iu,
-                          float abstol, const float* d, const float* e, lapack_int* m, lapack_int* nsplit, float* w,
-                          lapack_int* iblock, lapack_int* isplit);
-lapack_int LAPACKE_dstebz(char range, char order, lapack_int n, double vl, double vu, lapack_int il, lapack_int iu,
-                          double abstol, const double* d, const double* e, lapack_int* m, lapack_int* nsplit, double* w,
-                          lapack_int* iblock, lapack_int* isplit);
-
-lapack_int LAPACKE_sstedc(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
-lapack_int LAPACKE_dstedc(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
-lapack_int LAPACKE_cstedc(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_zstedc(int matrix_order, char compz, lapack_int n, double* d, double* e, lapack_complex_double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_sstegr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
-                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
-                          lapack_int* isuppz);
-lapack_int LAPACKE_dstegr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
-                          lapack_int ldz, lapack_int* isuppz);
-lapack_int LAPACKE_cstegr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
-                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
-                          lapack_int ldz, lapack_int* isuppz);
-lapack_int LAPACKE_zstegr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                          lapack_complex_double* z, lapack_int ldz, lapack_int* isuppz);
-
-lapack_int LAPACKE_sstein(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m, const float* w,
-                          const lapack_int* iblock, const lapack_int* isplit, float* z, lapack_int ldz,
-                          lapack_int* ifailv);
-lapack_int LAPACKE_dstein(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
-                          const double* w, const lapack_int* iblock, const lapack_int* isplit, double* z,
-                          lapack_int ldz, lapack_int* ifailv);
-lapack_int LAPACKE_cstein(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m, const float* w,
-                          const lapack_int* iblock, const lapack_int* isplit, lapack_complex_float* z, lapack_int ldz,
-                          lapack_int* ifailv);
-lapack_int LAPACKE_zstein(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
-                          const double* w, const lapack_int* iblock, const lapack_int* isplit, lapack_complex_double* z,
-                          lapack_int ldz, lapack_int* ifailv);
-
-lapack_int LAPACKE_sstemr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
-                          lapack_int il, lapack_int iu, lapack_int* m, float* w, float* z, lapack_int ldz,
-                          lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
-lapack_int LAPACKE_dstemr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                          double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w, double* z, lapack_int ldz,
-                          lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
-lapack_int LAPACKE_cstemr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
-                          lapack_int il, lapack_int iu, lapack_int* m, float* w, lapack_complex_float* z,
-                          lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
-lapack_int LAPACKE_zstemr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                          double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w, lapack_complex_double* z,
-                          lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
-
-lapack_int LAPACKE_ssteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
-lapack_int LAPACKE_dsteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
-lapack_int LAPACKE_csteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
-                          lapack_int ldz);
-lapack_int LAPACKE_zsteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, lapack_complex_double* z,
-                          lapack_int ldz);
-
-lapack_int LAPACKE_ssterf(lapack_int n, float* d, float* e);
-lapack_int LAPACKE_dsterf(lapack_int n, double* d, double* e);
-
-lapack_int LAPACKE_sstev(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
-lapack_int LAPACKE_dstev(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
-
-lapack_int LAPACKE_sstevd(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
-lapack_int LAPACKE_dstevd(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
-
-lapack_int LAPACKE_sstevr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
-                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
-                          lapack_int* isuppz);
-lapack_int LAPACKE_dstevr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
-                          lapack_int ldz, lapack_int* isuppz);
-
-lapack_int LAPACKE_sstevx(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
-                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
-                          lapack_int* ifail);
-lapack_int LAPACKE_dstevx(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
-                          lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_ssycon(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda,
-                          const lapack_int* ipiv, float anorm, float* rcond);
-lapack_int LAPACKE_dsycon(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda,
-                          const lapack_int* ipiv, double anorm, double* rcond);
-lapack_int LAPACKE_csycon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                          const lapack_int* ipiv, float anorm, float* rcond);
-lapack_int LAPACKE_zsycon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                          const lapack_int* ipiv, double anorm, double* rcond);
-
-lapack_int LAPACKE_ssyequb(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* s,
-                           float* scond, float* amax);
-lapack_int LAPACKE_dsyequb(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* s,
-                           double* scond, double* amax);
-lapack_int LAPACKE_csyequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                           float* s, float* scond, float* amax);
-lapack_int LAPACKE_zsyequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                           double* s, double* scond, double* amax);
-
+/* Symmetric/Hermitian eigenvalues (syev/heev) — used by SelfAdjointEigenSolver_LAPACKE.h */
 lapack_int LAPACKE_ssyev(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w);
 lapack_int LAPACKE_dsyev(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda, double* w);
-
-lapack_int LAPACKE_ssyevd(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w);
-lapack_int LAPACKE_dsyevd(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda, double* w);
-
-lapack_int LAPACKE_ssyevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a, lapack_int lda,
-                          float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                          float* z, lapack_int ldz, lapack_int* isuppz);
-lapack_int LAPACKE_dsyevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a, lapack_int lda,
-                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                          double* z, lapack_int ldz, lapack_int* isuppz);
-
-lapack_int LAPACKE_ssyevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a, lapack_int lda,
-                          float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                          float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_dsyevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a, lapack_int lda,
-                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                          double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_ssygst(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* a, lapack_int lda,
-                          const float* b, lapack_int ldb);
-lapack_int LAPACKE_dsygst(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* a, lapack_int lda,
-                          const double* b, lapack_int ldb);
-
-lapack_int LAPACKE_ssygv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
-                         lapack_int lda, float* b, lapack_int ldb, float* w);
-lapack_int LAPACKE_dsygv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
-                         lapack_int lda, double* b, lapack_int ldb, double* w);
-
-lapack_int LAPACKE_ssygvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
-                          lapack_int lda, float* b, lapack_int ldb, float* w);
-lapack_int LAPACKE_dsygvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
-                          lapack_int lda, double* b, lapack_int ldb, double* w);
-
-lapack_int LAPACKE_ssygvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n, float* a,
-                          lapack_int lda, float* b, lapack_int ldb, float vl, float vu, lapack_int il, lapack_int iu,
-                          float abstol, lapack_int* m, float* w, float* z, lapack_int ldz, lapack_int* ifail);
-lapack_int LAPACKE_dsygvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n, double* a,
-                          lapack_int lda, double* b, lapack_int ldb, double vl, double vu, lapack_int il, lapack_int iu,
-                          double abstol, lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* ifail);
-
-lapack_int LAPACKE_ssyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
-                          const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b, lapack_int ldb,
-                          float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_dsyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
-                          const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* b, lapack_int ldb,
-                          double* x, lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_csyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf, const lapack_int* ipiv,
-                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_zsyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf, const lapack_int* ipiv,
-                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-
-lapack_int LAPACKE_ssyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
-                           lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* s,
-                           const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* berr,
-                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                           float* params);
-lapack_int LAPACKE_dsyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
-                           lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* s,
-                           const double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
-                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                           double* params);
-lapack_int LAPACKE_csyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                           lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
-                           lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
-                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                           float* params);
-lapack_int LAPACKE_zsyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                           lapack_int ldaf, const lapack_int* ipiv, const double* s, const lapack_complex_double* b,
-                           lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
-                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                           double* params);
-
-lapack_int LAPACKE_ssysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                         lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dsysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                         lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_csysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                         lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zsysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                         lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_ssysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                          lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, const float* b, lapack_int ldb,
-                          float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_dsysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                          lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, const double* b,
-                          lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr);
-lapack_int LAPACKE_csysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                          lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                          lapack_int ldx, float* rcond, float* ferr, float* berr);
-lapack_int LAPACKE_zsysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                          lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                          lapack_int ldx, double* rcond, double* ferr, double* berr);
-
-lapack_int LAPACKE_ssysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
-                           lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* s,
-                           float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
-                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                           float* params);
-lapack_int LAPACKE_dsysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
-                           lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* s,
-                           double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw,
-                           double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                           lapack_int nparams, double* params);
-lapack_int LAPACKE_csysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                           lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
-                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
-                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                           float* params);
-lapack_int LAPACKE_zsysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                           lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
-                           lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                           double* params);
-
-lapack_int LAPACKE_ssytrd(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, float* d, float* e,
-                          float* tau);
-lapack_int LAPACKE_dsytrd(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, double* d, double* e,
-                          double* tau);
-
-lapack_int LAPACKE_ssytrf(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv);
-lapack_int LAPACKE_dsytrf(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv);
-lapack_int LAPACKE_csytrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_int* ipiv);
-lapack_int LAPACKE_zsytrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_int* ipiv);
-
-lapack_int LAPACKE_ssytri(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv);
-lapack_int LAPACKE_dsytri(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, const lapack_int* ipiv);
-lapack_int LAPACKE_csytri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          const lapack_int* ipiv);
-lapack_int LAPACKE_zsytri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          const lapack_int* ipiv);
-
-lapack_int LAPACKE_ssytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
-                          const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dsytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
-                          const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_csytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                          lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zsytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                          lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_stbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                          const float* ab, lapack_int ldab, float* rcond);
-lapack_int LAPACKE_dtbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                          const double* ab, lapack_int ldab, double* rcond);
-lapack_int LAPACKE_ctbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                          const lapack_complex_float* ab, lapack_int ldab, float* rcond);
-lapack_int LAPACKE_ztbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                          const lapack_complex_double* ab, lapack_int ldab, double* rcond);
-
-lapack_int LAPACKE_stbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const float* ab, lapack_int ldab, const float* b, lapack_int ldb,
-                          const float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_dtbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const double* ab, lapack_int ldab, const double* b, lapack_int ldb,
-                          const double* x, lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_ctbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
-                          const lapack_complex_float* b, lapack_int ldb, const lapack_complex_float* x, lapack_int ldx,
-                          float* ferr, float* berr);
-lapack_int LAPACKE_ztbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
-                          const lapack_complex_double* b, lapack_int ldb, const lapack_complex_double* x,
-                          lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_stbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const float* ab, lapack_int ldab, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const double* ab, lapack_int ldab, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b,
-                          lapack_int ldb);
-lapack_int LAPACKE_ztbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                          lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b,
-                          lapack_int ldb);
-
-lapack_int LAPACKE_stfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                         lapack_int n, float alpha, const float* a, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                         lapack_int n, double alpha, const double* a, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                         lapack_int n, lapack_complex_float alpha, const lapack_complex_float* a,
-                         lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                         lapack_int n, lapack_complex_double alpha, const lapack_complex_double* a,
-                         lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_stftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, float* a);
-lapack_int LAPACKE_dtftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, double* a);
-lapack_int LAPACKE_ctftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, lapack_complex_float* a);
-lapack_int LAPACKE_ztftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, lapack_complex_double* a);
-
-lapack_int LAPACKE_stfttp(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* ap);
-lapack_int LAPACKE_dtfttp(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* ap);
-lapack_int LAPACKE_ctfttp(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
-                          lapack_complex_float* ap);
-lapack_int LAPACKE_ztfttp(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
-                          lapack_complex_double* ap);
-
-lapack_int LAPACKE_stfttr(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* a,
-                          lapack_int lda);
-lapack_int LAPACKE_dtfttr(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* a,
-                          lapack_int lda);
-lapack_int LAPACKE_ctfttr(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
-                          lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_ztfttr(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
-                          lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_stgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                          const float* s, lapack_int lds, const float* p, lapack_int ldp, float* vl, lapack_int ldvl,
-                          float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_dtgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                          const double* s, lapack_int lds, const double* p, lapack_int ldp, double* vl, lapack_int ldvl,
-                          double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_ctgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                          const lapack_complex_float* s, lapack_int lds, const lapack_complex_float* p, lapack_int ldp,
-                          lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
-                          lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_ztgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                          const lapack_complex_double* s, lapack_int lds, const lapack_complex_double* p,
-                          lapack_int ldp, lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr,
-                          lapack_int ldvr, lapack_int mm, lapack_int* m);
-
-lapack_int LAPACKE_stgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, float* a,
-                          lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z, lapack_int ldz,
-                          lapack_int* ifst, lapack_int* ilst);
-lapack_int LAPACKE_dtgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, double* a,
-                          lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq, double* z,
-                          lapack_int ldz, lapack_int* ifst, lapack_int* ilst);
-lapack_int LAPACKE_ctgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz,
-                          lapack_int ifst, lapack_int ilst);
-lapack_int LAPACKE_ztgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz,
-                          lapack_int ifst, lapack_int ilst);
-
-lapack_int LAPACKE_stgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                          const lapack_logical* select, lapack_int n, float* a, lapack_int lda, float* b,
-                          lapack_int ldb, float* alphar, float* alphai, float* beta, float* q, lapack_int ldq, float* z,
-                          lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif);
-lapack_int LAPACKE_dtgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                          const lapack_logical* select, lapack_int n, double* a, lapack_int lda, double* b,
-                          lapack_int ldb, double* alphar, double* alphai, double* beta, double* q, lapack_int ldq,
-                          double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr, double* dif);
-lapack_int LAPACKE_ctgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                          const lapack_logical* select, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
-                          lapack_complex_float* beta, lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z,
-                          lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif);
-lapack_int LAPACKE_ztgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                          const lapack_logical* select, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
-                          lapack_complex_double* beta, lapack_complex_double* q, lapack_int ldq,
-                          lapack_complex_double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr, double* dif);
-
-lapack_int LAPACKE_stgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          lapack_int k, lapack_int l, float* a, lapack_int lda, float* b, lapack_int ldb, float tola,
-                          float tolb, float* alpha, float* beta, float* u, lapack_int ldu, float* v, lapack_int ldv,
-                          float* q, lapack_int ldq, lapack_int* ncycle);
-lapack_int LAPACKE_dtgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          lapack_int k, lapack_int l, double* a, lapack_int lda, double* b, lapack_int ldb, double tola,
-                          double tolb, double* alpha, double* beta, double* u, lapack_int ldu, double* v,
-                          lapack_int ldv, double* q, lapack_int ldq, lapack_int* ncycle);
-lapack_int LAPACKE_ctgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          lapack_int k, lapack_int l, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
-                          lapack_int ldb, float tola, float tolb, float* alpha, float* beta, lapack_complex_float* u,
-                          lapack_int ldu, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* q,
-                          lapack_int ldq, lapack_int* ncycle);
-lapack_int LAPACKE_ztgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
-                          lapack_int k, lapack_int l, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* b, lapack_int ldb, double tola, double tolb, double* alpha,
-                          double* beta, lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v,
-                          lapack_int ldv, lapack_complex_double* q, lapack_int ldq, lapack_int* ncycle);
-
-lapack_int LAPACKE_stgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* vl,
-                          lapack_int ldvl, const float* vr, lapack_int ldvr, float* s, float* dif, lapack_int mm,
-                          lapack_int* m);
-lapack_int LAPACKE_dtgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* vl,
-                          lapack_int ldvl, const double* vr, lapack_int ldvr, double* s, double* dif, lapack_int mm,
-                          lapack_int* m);
-lapack_int LAPACKE_ctgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
-                          const lapack_complex_float* vl, lapack_int ldvl, const lapack_complex_float* vr,
-                          lapack_int ldvr, float* s, float* dif, lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_ztgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                          lapack_int ldb, const lapack_complex_double* vl, lapack_int ldvl,
-                          const lapack_complex_double* vr, lapack_int ldvr, double* s, double* dif, lapack_int mm,
-                          lapack_int* m);
-
-lapack_int LAPACKE_stgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n, const float* a,
-                          lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc, const float* d,
-                          lapack_int ldd, const float* e, lapack_int lde, float* f, lapack_int ldf, float* scale,
-                          float* dif);
-lapack_int LAPACKE_dtgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n, const double* a,
-                          lapack_int lda, const double* b, lapack_int ldb, double* c, lapack_int ldc, const double* d,
-                          lapack_int ldd, const double* e, lapack_int lde, double* f, lapack_int ldf, double* scale,
-                          double* dif);
-lapack_int LAPACKE_ctgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* c, lapack_int ldc, const lapack_complex_float* d, lapack_int ldd,
-                          const lapack_complex_float* e, lapack_int lde, lapack_complex_float* f, lapack_int ldf,
-                          float* scale, float* dif);
-lapack_int LAPACKE_ztgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                          lapack_int ldb, lapack_complex_double* c, lapack_int ldc, const lapack_complex_double* d,
-                          lapack_int ldd, const lapack_complex_double* e, lapack_int lde, lapack_complex_double* f,
-                          lapack_int ldf, double* scale, double* dif);
-
-lapack_int LAPACKE_stpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* ap,
-                          float* rcond);
-lapack_int LAPACKE_dtpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* ap,
-                          double* rcond);
-lapack_int LAPACKE_ctpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                          const lapack_complex_float* ap, float* rcond);
-lapack_int LAPACKE_ztpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                          const lapack_complex_double* ap, double* rcond);
-
-lapack_int LAPACKE_stprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const float* ap, const float* b, lapack_int ldb, const float* x, lapack_int ldx, float* ferr,
-                          float* berr);
-lapack_int LAPACKE_dtprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const double* ap, const double* b, lapack_int ldb, const double* x, lapack_int ldx,
-                          double* ferr, double* berr);
-lapack_int LAPACKE_ctprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* ap, const lapack_complex_float* b, lapack_int ldb,
-                          const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_ztprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* ap, const lapack_complex_double* b, lapack_int ldb,
-                          const lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_stptri(int matrix_order, char uplo, char diag, lapack_int n, float* ap);
-lapack_int LAPACKE_dtptri(int matrix_order, char uplo, char diag, lapack_int n, double* ap);
-lapack_int LAPACKE_ctptri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* ap);
-lapack_int LAPACKE_ztptri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* ap);
-
-lapack_int LAPACKE_stptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const float* ap, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const double* ap, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* ap, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* ap, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_stpttf(int matrix_order, char transr, char uplo, lapack_int n, const float* ap, float* arf);
-lapack_int LAPACKE_dtpttf(int matrix_order, char transr, char uplo, lapack_int n, const double* ap, double* arf);
-lapack_int LAPACKE_ctpttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* ap,
-                          lapack_complex_float* arf);
-lapack_int LAPACKE_ztpttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* ap,
-                          lapack_complex_double* arf);
-
-lapack_int LAPACKE_stpttr(int matrix_order, char uplo, lapack_int n, const float* ap, float* a, lapack_int lda);
-lapack_int LAPACKE_dtpttr(int matrix_order, char uplo, lapack_int n, const double* ap, double* a, lapack_int lda);
-lapack_int LAPACKE_ctpttr(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                          lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_ztpttr(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                          lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_strcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* a,
-                          lapack_int lda, float* rcond);
-lapack_int LAPACKE_dtrcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* a,
-                          lapack_int lda, double* rcond);
-lapack_int LAPACKE_ctrcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                          const lapack_complex_float* a, lapack_int lda, float* rcond);
-lapack_int LAPACKE_ztrcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                          const lapack_complex_double* a, lapack_int lda, double* rcond);
-
-lapack_int LAPACKE_strevc(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
-                          const float* t, lapack_int ldt, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr,
-                          lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_dtrevc(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
-                          const double* t, lapack_int ldt, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr,
-                          lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_ctrevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                          lapack_complex_float* t, lapack_int ldt, lapack_complex_float* vl, lapack_int ldvl,
-                          lapack_complex_float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_ztrevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                          lapack_complex_double* t, lapack_int ldt, lapack_complex_double* vl, lapack_int ldvl,
-                          lapack_complex_double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
-
-lapack_int LAPACKE_strexc(int matrix_order, char compq, lapack_int n, float* t, lapack_int ldt, float* q,
-                          lapack_int ldq, lapack_int* ifst, lapack_int* ilst);
-lapack_int LAPACKE_dtrexc(int matrix_order, char compq, lapack_int n, double* t, lapack_int ldt, double* q,
-                          lapack_int ldq, lapack_int* ifst, lapack_int* ilst);
-lapack_int LAPACKE_ctrexc(int matrix_order, char compq, lapack_int n, lapack_complex_float* t, lapack_int ldt,
-                          lapack_complex_float* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
-lapack_int LAPACKE_ztrexc(int matrix_order, char compq, lapack_int n, lapack_complex_double* t, lapack_int ldt,
-                          lapack_complex_double* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
-
-lapack_int LAPACKE_strrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* x,
-                          lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_dtrrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* x,
-                          lapack_int ldx, double* ferr, double* berr);
-lapack_int LAPACKE_ctrrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
-                          const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
-lapack_int LAPACKE_ztrrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                          lapack_int ldb, const lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
-
-lapack_int LAPACKE_strsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n, float* t,
-                          lapack_int ldt, float* q, lapack_int ldq, float* wr, float* wi, lapack_int* m, float* s,
-                          float* sep);
-lapack_int LAPACKE_dtrsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n, double* t,
-                          lapack_int ldt, double* q, lapack_int ldq, double* wr, double* wi, lapack_int* m, double* s,
-                          double* sep);
-lapack_int LAPACKE_ctrsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
-                          lapack_complex_float* t, lapack_int ldt, lapack_complex_float* q, lapack_int ldq,
-                          lapack_complex_float* w, lapack_int* m, float* s, float* sep);
-lapack_int LAPACKE_ztrsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
-                          lapack_complex_double* t, lapack_int ldt, lapack_complex_double* q, lapack_int ldq,
-                          lapack_complex_double* w, lapack_int* m, double* s, double* sep);
-
-lapack_int LAPACKE_strsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const float* t, lapack_int ldt, const float* vl, lapack_int ldvl, const float* vr,
-                          lapack_int ldvr, float* s, float* sep, lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_dtrsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const double* t, lapack_int ldt, const double* vl, lapack_int ldvl, const double* vr,
-                          lapack_int ldvr, double* s, double* sep, lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_ctrsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const lapack_complex_float* t, lapack_int ldt, const lapack_complex_float* vl,
-                          lapack_int ldvl, const lapack_complex_float* vr, lapack_int ldvr, float* s, float* sep,
-                          lapack_int mm, lapack_int* m);
-lapack_int LAPACKE_ztrsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                          const lapack_complex_double* t, lapack_int ldt, const lapack_complex_double* vl,
-                          lapack_int ldvl, const lapack_complex_double* vr, lapack_int ldvr, double* s, double* sep,
-                          lapack_int mm, lapack_int* m);
-
-lapack_int LAPACKE_strsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                          const float* a, lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc,
-                          float* scale);
-lapack_int LAPACKE_dtrsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                          const double* a, lapack_int lda, const double* b, lapack_int ldb, double* c, lapack_int ldc,
-                          double* scale);
-lapack_int LAPACKE_ctrsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
-                          lapack_complex_float* c, lapack_int ldc, float* scale);
-lapack_int LAPACKE_ztrsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                          lapack_int ldb, lapack_complex_double* c, lapack_int ldc, double* scale);
-
-lapack_int LAPACKE_strtri(int matrix_order, char uplo, char diag, lapack_int n, float* a, lapack_int lda);
-lapack_int LAPACKE_dtrtri(int matrix_order, char uplo, char diag, lapack_int n, double* a, lapack_int lda);
-lapack_int LAPACKE_ctrtri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* a,
-                          lapack_int lda);
-lapack_int LAPACKE_ztrtri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* a,
-                          lapack_int lda);
-
-lapack_int LAPACKE_strtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const float* a, lapack_int lda, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtrtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const double* a, lapack_int lda, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctrtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztrtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                          const lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_strttf(int matrix_order, char transr, char uplo, lapack_int n, const float* a, lapack_int lda,
-                          float* arf);
-lapack_int LAPACKE_dtrttf(int matrix_order, char transr, char uplo, lapack_int n, const double* a, lapack_int lda,
-                          double* arf);
-lapack_int LAPACKE_ctrttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* arf);
-lapack_int LAPACKE_ztrttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* arf);
-
-lapack_int LAPACKE_strttp(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* ap);
-lapack_int LAPACKE_dtrttp(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* ap);
-lapack_int LAPACKE_ctrttp(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* ap);
-lapack_int LAPACKE_ztrttp(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* ap);
-
-lapack_int LAPACKE_stzrzf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
-lapack_int LAPACKE_dtzrzf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
-lapack_int LAPACKE_ctzrzf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* tau);
-lapack_int LAPACKE_ztzrzf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* tau);
-
-lapack_int LAPACKE_cungbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
-                          lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau);
-lapack_int LAPACKE_zungbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
-                          lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau);
-
-lapack_int LAPACKE_cunghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* tau);
-lapack_int LAPACKE_zunghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* tau);
-
-lapack_int LAPACKE_cunglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* tau);
-lapack_int LAPACKE_zunglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* tau);
-
-lapack_int LAPACKE_cungql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* tau);
-lapack_int LAPACKE_zungql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* tau);
-
-lapack_int LAPACKE_cungqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* tau);
-lapack_int LAPACKE_zungqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* tau);
-
-lapack_int LAPACKE_cungrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                          lapack_int lda, const lapack_complex_float* tau);
-lapack_int LAPACKE_zungrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                          lapack_int lda, const lapack_complex_double* tau);
-
-lapack_int LAPACKE_cungtr(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                          const lapack_complex_float* tau);
-lapack_int LAPACKE_zungtr(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                          const lapack_complex_double* tau);
-
-lapack_int LAPACKE_cunmbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                          lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cunmhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, const lapack_complex_float* a, lapack_int lda,
-                          const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                          lapack_int ihi, const lapack_complex_double* a, lapack_int lda,
-                          const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cunmlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                          lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cunmql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                          lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cunmqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                          lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cunmrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                          lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cunmrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          lapack_int l, const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                          lapack_int l, const lapack_complex_double* a, lapack_int lda,
-                          const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cunmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                          lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zunmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                          lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_cupgtr(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                          const lapack_complex_float* tau, lapack_complex_float* q, lapack_int ldq);
-lapack_int LAPACKE_zupgtr(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                          const lapack_complex_double* tau, lapack_complex_double* q, lapack_int ldq);
-
-lapack_int LAPACKE_cupmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const lapack_complex_float* ap, const lapack_complex_float* tau, lapack_complex_float* c,
-                          lapack_int ldc);
-lapack_int LAPACKE_zupmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                          const lapack_complex_double* ap, const lapack_complex_double* tau, lapack_complex_double* c,
-                          lapack_int ldc);
-
-lapack_int LAPACKE_sbdsdc_work(int matrix_order, char uplo, char compq, lapack_int n, float* d, float* e, float* u,
-                               lapack_int ldu, float* vt, lapack_int ldvt, float* q, lapack_int* iq, float* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dbdsdc_work(int matrix_order, char uplo, char compq, lapack_int n, double* d, double* e, double* u,
-                               lapack_int ldu, double* vt, lapack_int ldvt, double* q, lapack_int* iq, double* work,
-                               lapack_int* iwork);
-
-lapack_int LAPACKE_sbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
-                               lapack_int ncc, float* d, float* e, float* vt, lapack_int ldvt, float* u, lapack_int ldu,
-                               float* c, lapack_int ldc, float* work);
-lapack_int LAPACKE_dbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
-                               lapack_int ncc, double* d, double* e, double* vt, lapack_int ldvt, double* u,
-                               lapack_int ldu, double* c, lapack_int ldc, double* work);
-lapack_int LAPACKE_cbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
-                               lapack_int ncc, float* d, float* e, lapack_complex_float* vt, lapack_int ldvt,
-                               lapack_complex_float* u, lapack_int ldu, lapack_complex_float* c, lapack_int ldc,
-                               float* work);
-lapack_int LAPACKE_zbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
-                               lapack_int ncc, double* d, double* e, lapack_complex_double* vt, lapack_int ldvt,
-                               lapack_complex_double* u, lapack_int ldu, lapack_complex_double* c, lapack_int ldc,
-                               double* work);
-
-lapack_int LAPACKE_sdisna_work(char job, lapack_int m, lapack_int n, const float* d, float* sep);
-lapack_int LAPACKE_ddisna_work(char job, lapack_int m, lapack_int n, const double* d, double* sep);
-
-lapack_int LAPACKE_sgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                               lapack_int ku, float* ab, lapack_int ldab, float* d, float* e, float* q, lapack_int ldq,
-                               float* pt, lapack_int ldpt, float* c, lapack_int ldc, float* work);
-lapack_int LAPACKE_dgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                               lapack_int ku, double* ab, lapack_int ldab, double* d, double* e, double* q,
-                               lapack_int ldq, double* pt, lapack_int ldpt, double* c, lapack_int ldc, double* work);
-lapack_int LAPACKE_cgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                               lapack_int ku, lapack_complex_float* ab, lapack_int ldab, float* d, float* e,
-                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* pt, lapack_int ldpt,
-                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
-                               lapack_int ku, lapack_complex_double* ab, lapack_int ldab, double* d, double* e,
-                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* pt, lapack_int ldpt,
-                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
-                               lapack_int ldab, const lapack_int* ipiv, float anorm, float* rcond, float* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
-                               const double* ab, lapack_int ldab, const lapack_int* ipiv, double anorm, double* rcond,
-                               double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
-                               const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv, float anorm,
-                               float* rcond, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
-                               const lapack_complex_double* ab, lapack_int ldab, const lapack_int* ipiv, double anorm,
-                               double* rcond, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const float* ab, lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd,
-                               float* amax);
-lapack_int LAPACKE_dgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const double* ab, lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd,
-                               double* amax);
-lapack_int LAPACKE_cgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
-                               float* colcnd, float* amax);
-lapack_int LAPACKE_zgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
-                               double* colcnd, double* amax);
-
-lapack_int LAPACKE_sgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                                const float* ab, lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd,
-                                float* amax);
-lapack_int LAPACKE_dgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                                const double* ab, lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd,
-                                double* amax);
-lapack_int LAPACKE_cgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                                const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
-                                float* colcnd, float* amax);
-lapack_int LAPACKE_zgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                                const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
-                                double* colcnd, double* amax);
-
-lapack_int LAPACKE_sgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb,
-                               const lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx,
-                               float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
-                               const lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                               double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
-                               const lapack_complex_float* afb, lapack_int ldafb, const lapack_int* ipiv,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
-                               const lapack_complex_double* afb, lapack_int ldafb, const lapack_int* ipiv,
-                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb,
-                                const lapack_int* ipiv, const float* r, const float* c, const float* b, lapack_int ldb,
-                                float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
-                                float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params,
-                                float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
-                                const lapack_int* ipiv, const double* r, const double* c, const double* b,
-                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
-                                const lapack_complex_float* afb, lapack_int ldafb, const lapack_int* ipiv,
-                                const float* r, const float* c, const lapack_complex_float* b, lapack_int ldb,
-                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
-                                const lapack_complex_double* afb, lapack_int ldafb, const lapack_int* ipiv,
-                                const double* r, const double* c, const lapack_complex_double* b, lapack_int ldb,
-                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, float* ab,
-                              lapack_int ldab, lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, double* ab,
-                              lapack_int ldab, lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_cgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                              lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_float* b,
-                              lapack_int ldb);
-lapack_int LAPACKE_zgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
-                              lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_double* b,
-                              lapack_int ldb);
-
-lapack_int LAPACKE_sgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb,
-                               lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int ldb, float* x,
-                               lapack_int ldx, float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb,
-                               lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int ldb,
-                               double* x, lapack_int ldx, double* rcond, double* ferr, double* berr, double* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_cgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
-                               lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
-                               lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
-                               lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
-                               lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb,
-                                lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int ldb, float* x,
-                                lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
-                                float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params,
-                                float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb,
-                                lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int ldb,
-                                double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
-                                lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
-                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                                float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
-                                float* err_bnds_comp, lapack_int nparams, float* params, lapack_complex_float* work,
-                                float* rwork);
-lapack_int LAPACKE_zgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                                lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
-                                lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
-                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                                double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds,
-                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
-                                lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, float* ab,
-                               lapack_int ldab, lapack_int* ipiv);
-lapack_int LAPACKE_dgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, double* ab,
-                               lapack_int ldab, lapack_int* ipiv);
-lapack_int LAPACKE_cgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv);
-lapack_int LAPACKE_zgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv);
-
-lapack_int LAPACKE_sgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const float* ab, lapack_int ldab, const lapack_int* ipiv, float* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_dgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const double* ab, lapack_int ldab, const lapack_int* ipiv, double* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_cgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv,
-                               lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
-                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
-                               const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const float* scale, lapack_int m, float* v, lapack_int ldv);
-lapack_int LAPACKE_dgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const double* scale, lapack_int m, double* v, lapack_int ldv);
-lapack_int LAPACKE_cgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const float* scale, lapack_int m, lapack_complex_float* v, lapack_int ldv);
-lapack_int LAPACKE_zgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const double* scale, lapack_int m, lapack_complex_double* v, lapack_int ldv);
-
-lapack_int LAPACKE_sgebal_work(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, lapack_int* ilo,
-                               lapack_int* ihi, float* scale);
-lapack_int LAPACKE_dgebal_work(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, lapack_int* ilo,
-                               lapack_int* ihi, double* scale);
-lapack_int LAPACKE_cgebal_work(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* ilo, lapack_int* ihi, float* scale);
-lapack_int LAPACKE_zgebal_work(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* ilo, lapack_int* ihi, double* scale);
-
-lapack_int LAPACKE_sgebrd_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* d,
-                               float* e, float* tauq, float* taup, float* work, lapack_int lwork);
-lapack_int LAPACKE_dgebrd_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* d,
-                               double* e, double* tauq, double* taup, double* work, lapack_int lwork);
-lapack_int LAPACKE_cgebrd_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               float* d, float* e, lapack_complex_float* tauq, lapack_complex_float* taup,
-                               lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgebrd_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               double* d, double* e, lapack_complex_double* tauq, lapack_complex_double* taup,
-                               lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgecon_work(int matrix_order, char norm, lapack_int n, const float* a, lapack_int lda, float anorm,
-                               float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgecon_work(int matrix_order, char norm, lapack_int n, const double* a, lapack_int lda, double anorm,
-                               double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgecon_work(int matrix_order, char norm, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                               float anorm, float* rcond, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgecon_work(int matrix_order, char norm, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, double anorm, double* rcond, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgeequ_work(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
-                               float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_dgeequ_work(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, double* r,
-                               double* c, double* rowcnd, double* colcnd, double* amax);
-lapack_int LAPACKE_cgeequ_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a,
-                               lapack_int lda, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_zgeequ_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
-
-lapack_int LAPACKE_sgeequb_work(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
-                                float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_dgeequb_work(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda,
-                                double* r, double* c, double* rowcnd, double* colcnd, double* amax);
-lapack_int LAPACKE_cgeequb_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a,
-                                lapack_int lda, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
-lapack_int LAPACKE_zgeequb_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a,
-                                lapack_int lda, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
-
-lapack_int LAPACKE_sgees_work(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, lapack_int n, float* a,
-                              lapack_int lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int ldvs,
-                              float* work, lapack_int lwork, lapack_logical* bwork);
-lapack_int LAPACKE_dgees_work(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, lapack_int n, double* a,
-                              lapack_int lda, lapack_int* sdim, double* wr, double* wi, double* vs, lapack_int ldvs,
-                              double* work, lapack_int lwork, lapack_logical* bwork);
-lapack_int LAPACKE_cgees_work(int matrix_order, char jobvs, char sort, LAPACK_C_SELECT1 select, lapack_int n,
-                              lapack_complex_float* a, lapack_int lda, lapack_int* sdim, lapack_complex_float* w,
-                              lapack_complex_float* vs, lapack_int ldvs, lapack_complex_float* work, lapack_int lwork,
-                              float* rwork, lapack_logical* bwork);
-lapack_int LAPACKE_zgees_work(int matrix_order, char jobvs, char sort, LAPACK_Z_SELECT1 select, lapack_int n,
-                              lapack_complex_double* a, lapack_int lda, lapack_int* sdim, lapack_complex_double* w,
-                              lapack_complex_double* vs, lapack_int ldvs, lapack_complex_double* work, lapack_int lwork,
-                              double* rwork, lapack_logical* bwork);
-
-lapack_int LAPACKE_sgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, char sense,
-                               lapack_int n, float* a, lapack_int lda, lapack_int* sdim, float* wr, float* wi,
-                               float* vs, lapack_int ldvs, float* rconde, float* rcondv, float* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork, lapack_logical* bwork);
-lapack_int LAPACKE_dgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, char sense,
-                               lapack_int n, double* a, lapack_int lda, lapack_int* sdim, double* wr, double* wi,
-                               double* vs, lapack_int ldvs, double* rconde, double* rcondv, double* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork, lapack_logical* bwork);
-lapack_int LAPACKE_cgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_C_SELECT1 select, char sense,
-                               lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int* sdim,
-                               lapack_complex_float* w, lapack_complex_float* vs, lapack_int ldvs, float* rconde,
-                               float* rcondv, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_logical* bwork);
-lapack_int LAPACKE_zgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_Z_SELECT1 select, char sense,
-                               lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int* sdim,
-                               lapack_complex_double* w, lapack_complex_double* vs, lapack_int ldvs, double* rconde,
-                               double* rcondv, lapack_complex_double* work, lapack_int lwork, double* rwork,
-                               lapack_logical* bwork);
-
-lapack_int LAPACKE_sgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda,
-                              float* wr, float* wi, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr, float* work,
-                              lapack_int lwork);
-lapack_int LAPACKE_dgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda,
-                              double* wr, double* wi, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr,
-                              double* work, lapack_int lwork);
-lapack_int LAPACKE_cgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
-                              lapack_int lda, lapack_complex_float* w, lapack_complex_float* vl, lapack_int ldvl,
-                              lapack_complex_float* vr, lapack_int ldvr, lapack_complex_float* work, lapack_int lwork,
-                              float* rwork);
-lapack_int LAPACKE_zgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
-                              lapack_int lda, lapack_complex_double* w, lapack_complex_double* vl, lapack_int ldvl,
-                              lapack_complex_double* vr, lapack_int ldvr, lapack_complex_double* work, lapack_int lwork,
-                              double* rwork);
-
-lapack_int LAPACKE_sgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               float* a, lapack_int lda, float* wr, float* wi, float* vl, lapack_int ldvl, float* vr,
-                               lapack_int ldvr, lapack_int* ilo, lapack_int* ihi, float* scale, float* abnrm,
-                               float* rconde, float* rcondv, float* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_dgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               double* a, lapack_int lda, double* wr, double* wi, double* vl, lapack_int ldvl,
-                               double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi, double* scale,
-                               double* abnrm, double* rconde, double* rcondv, double* work, lapack_int lwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_cgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* w,
-                               lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
-                               lapack_int* ilo, lapack_int* ihi, float* scale, float* abnrm, float* rconde,
-                               float* rcondv, lapack_complex_float* work, lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* w,
-                               lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr,
-                               lapack_int* ilo, lapack_int* ihi, double* scale, double* abnrm, double* rconde,
-                               double* rcondv, lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_sgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
-                               float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a,
-                               lapack_int lda, double* tau, double* work, lapack_int lwork);
-lapack_int LAPACKE_cgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_sgejsv_work(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
-                               lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, float* u,
-                               lapack_int ldu, float* v, lapack_int ldv, float* work, lapack_int lwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dgejsv_work(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
-                               lapack_int m, lapack_int n, double* a, lapack_int lda, double* sva, double* u,
-                               lapack_int ldu, double* v, lapack_int ldv, double* work, lapack_int lwork,
-                               lapack_int* iwork);
-
-lapack_int LAPACKE_sgelq2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                               float* work);
-lapack_int LAPACKE_dgelq2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                               double* work);
-lapack_int LAPACKE_cgelq2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* tau, lapack_complex_float* work);
-lapack_int LAPACKE_zgelq2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* tau, lapack_complex_double* work);
-
-lapack_int LAPACKE_sgelqf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dgelqf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_cgelqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgelqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, float* a,
-                              lapack_int lda, float* b, lapack_int ldb, float* work, lapack_int lwork);
-lapack_int LAPACKE_dgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, double* a,
-                              lapack_int lda, double* b, lapack_int ldb, double* work, lapack_int lwork);
-lapack_int LAPACKE_cgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
-                              lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                              lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
-                              lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                              lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                               float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank, float* work,
-                               lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_dgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                               double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank, double* work,
-                               lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_cgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
-                               lapack_int* rank, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_zgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
-                               lapack_int* rank, lapack_complex_double* work, lapack_int lwork, double* rwork,
-                               lapack_int* iwork);
-
-lapack_int LAPACKE_sgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                               float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                               double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank, double* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_cgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
-                               lapack_int* rank, lapack_complex_float* work, lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
-                               lapack_int* rank, lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_sgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                               float* b, lapack_int ldb, lapack_int* jpvt, float rcond, lapack_int* rank, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                               double* b, lapack_int ldb, lapack_int* jpvt, double rcond, lapack_int* rank,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_cgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_int* jpvt, float rcond,
-                               lapack_int* rank, lapack_complex_float* work, lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_int* jpvt, double rcond,
-                               lapack_int* rank, lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_sgeqlf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dgeqlf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_cgeqlf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgeqlf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgeqp3_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
-                               float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dgeqp3_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
-                               lapack_int* jpvt, double* tau, double* work, lapack_int lwork);
-lapack_int LAPACKE_cgeqp3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* jpvt, lapack_complex_float* tau, lapack_complex_float* work,
-                               lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zgeqp3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* jpvt, lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_sgeqpf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
-                               float* tau, float* work);
-lapack_int LAPACKE_dgeqpf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
-                               lapack_int* jpvt, double* tau, double* work);
-lapack_int LAPACKE_cgeqpf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* jpvt, lapack_complex_float* tau, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgeqpf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* jpvt, lapack_complex_double* tau, lapack_complex_double* work,
-                               double* rwork);
-
-lapack_int LAPACKE_sgeqr2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                               float* work);
-lapack_int LAPACKE_dgeqr2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                               double* work);
-lapack_int LAPACKE_cgeqr2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* tau, lapack_complex_float* work);
-lapack_int LAPACKE_zgeqr2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* tau, lapack_complex_double* work);
-
-lapack_int LAPACKE_sgeqrf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dgeqrf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_cgeqrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgeqrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                                float* work, lapack_int lwork);
-lapack_int LAPACKE_dgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                                double* work, lapack_int lwork);
-lapack_int LAPACKE_cgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a,
-                               lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b,
-                               lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr, float* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a,
-                               lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv,
-                               const double* b, lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr,
-                               double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs, const float* a,
-                                lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv,
-                                const float* r, const float* c, const float* b, lapack_int ldb, float* x,
-                                lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
-                                float* err_bnds_comp, lapack_int nparams, float* params, float* work,
-                                lapack_int* iwork);
-lapack_int LAPACKE_dgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
-                                const double* a, lapack_int lda, const double* af, lapack_int ldaf,
-                                const lapack_int* ipiv, const double* r, const double* c, const double* b,
-                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                                lapack_int ldaf, const lapack_int* ipiv, const float* r, const float* c,
-                                const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                                float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
-                                float* err_bnds_comp, lapack_int nparams, float* params, lapack_complex_float* work,
-                                float* rwork);
-lapack_int LAPACKE_zgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                                lapack_int ldaf, const lapack_int* ipiv, const double* r, const double* c,
-                                const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                                lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
-                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
-                                lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgerqf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dgerqf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_cgerqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgerqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, float* a, lapack_int lda,
-                               float* s, float* u, lapack_int ldu, float* vt, lapack_int ldvt, float* work,
-                               lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_dgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, double* a, lapack_int lda,
-                               double* s, double* u, lapack_int ldu, double* vt, lapack_int ldvt, double* work,
-                               lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_cgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, lapack_complex_float* a,
-                               lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu,
-                               lapack_complex_float* vt, lapack_int ldvt, lapack_complex_float* work, lapack_int lwork,
-                               float* rwork, lapack_int* iwork);
-lapack_int LAPACKE_zgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, lapack_complex_double* a,
-                               lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
-                               lapack_complex_double* vt, lapack_int ldvt, lapack_complex_double* work,
-                               lapack_int lwork, double* rwork, lapack_int* iwork);
-
-lapack_int LAPACKE_sgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                              lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                              lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_cgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* a, lapack_int lda,
-                              lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a, lapack_int lda,
-                              lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-lapack_int LAPACKE_dsgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                               lapack_int* ipiv, double* b, lapack_int ldb, double* x, lapack_int ldx, double* work,
-                               float* swork, lapack_int* iter);
-lapack_int LAPACKE_zcgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                               lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, lapack_complex_double* work,
-                               lapack_complex_float* swork, double* rwork, lapack_int* iter);
-
-lapack_int LAPACKE_sgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, float* a,
-                               lapack_int lda, float* s, float* u, lapack_int ldu, float* vt, lapack_int ldvt,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, double* a,
-                               lapack_int lda, double* s, double* u, lapack_int ldu, double* vt, lapack_int ldvt,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_cgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, float* s, lapack_complex_float* u,
-                               lapack_int ldu, lapack_complex_float* vt, lapack_int ldvt, lapack_complex_float* work,
-                               lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, double* s, lapack_complex_double* u,
-                               lapack_int ldu, lapack_complex_double* vt, lapack_int ldvt, lapack_complex_double* work,
-                               lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_sgesvj_work(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a,
-                               lapack_int lda, float* sva, lapack_int mv, float* v, lapack_int ldv, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dgesvj_work(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, double* a,
-                               lapack_int lda, double* sva, lapack_int mv, double* v, lapack_int ldv, double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_sgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
-                               lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r,
-                               float* c, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr,
-                               float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
-                               lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
-                               double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
-                               double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                               lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
-                               lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr,
-                               float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                               lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b,
-                               lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr,
-                               double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
-                                lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r,
-                                float* c, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
-                                float* rpvgrw, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
-                                float* err_bnds_comp, lapack_int nparams, float* params, float* work,
-                                lapack_int* iwork);
-lapack_int LAPACKE_dgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
-                                lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
-                                double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
-                                double* rpvgrw, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
-                                double* err_bnds_comp, lapack_int nparams, double* params, double* work,
-                                lapack_int* iwork);
-lapack_int LAPACKE_cgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                                lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
-                                lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw,
-                                float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                                lapack_int nparams, float* params, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                                lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b,
-                                lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw,
-                                double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                                lapack_int nparams, double* params, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgetf2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda,
-                               lapack_int* ipiv);
-lapack_int LAPACKE_dgetf2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
-                               lapack_int* ipiv);
-lapack_int LAPACKE_cgetf2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* ipiv);
-lapack_int LAPACKE_zgetf2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* ipiv);
-
-lapack_int LAPACKE_sgetrf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda,
-                               lapack_int* ipiv);
-lapack_int LAPACKE_dgetrf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
-                               lapack_int* ipiv);
-lapack_int LAPACKE_cgetrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* ipiv);
-lapack_int LAPACKE_zgetrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* ipiv);
-
-lapack_int LAPACKE_sgetri_work(int matrix_order, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dgetri_work(int matrix_order, lapack_int n, double* a, lapack_int lda, const lapack_int* ipiv,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_cgetri_work(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zgetri_work(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a,
-                               lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a,
-                               lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_cgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
-                               lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
-                               lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const float* lscale, const float* rscale, lapack_int m, float* v, lapack_int ldv);
-lapack_int LAPACKE_dggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const double* lscale, const double* rscale, lapack_int m, double* v, lapack_int ldv);
-lapack_int LAPACKE_cggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const float* lscale, const float* rscale, lapack_int m, lapack_complex_float* v,
-                               lapack_int ldv);
-lapack_int LAPACKE_zggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               const double* lscale, const double* rscale, lapack_int m, lapack_complex_double* v,
-                               lapack_int ldv);
-
-lapack_int LAPACKE_sggbal_work(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, float* b,
-                               lapack_int ldb, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale,
-                               float* work);
-lapack_int LAPACKE_dggbal_work(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, double* b,
-                               lapack_int ldb, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale,
-                               double* work);
-lapack_int LAPACKE_cggbal_work(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi, float* lscale,
-                               float* rscale, float* work);
-lapack_int LAPACKE_zggbal_work(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi,
-                               double* lscale, double* rscale, double* work);
-
-lapack_int LAPACKE_sgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg,
-                              lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int* sdim,
-                              float* alphar, float* alphai, float* beta, float* vsl, lapack_int ldvsl, float* vsr,
-                              lapack_int ldvsr, float* work, lapack_int lwork, lapack_logical* bwork);
-lapack_int LAPACKE_dgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg,
-                              lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int* sdim,
-                              double* alphar, double* alphai, double* beta, double* vsl, lapack_int ldvsl, double* vsr,
-                              lapack_int ldvsr, double* work, lapack_int lwork, lapack_logical* bwork);
-lapack_int LAPACKE_cgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg,
-                              lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
-                              lapack_int ldb, lapack_int* sdim, lapack_complex_float* alpha, lapack_complex_float* beta,
-                              lapack_complex_float* vsl, lapack_int ldvsl, lapack_complex_float* vsr, lapack_int ldvsr,
-                              lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_logical* bwork);
-lapack_int LAPACKE_zgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg,
-                              lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
-                              lapack_int ldb, lapack_int* sdim, lapack_complex_double* alpha,
-                              lapack_complex_double* beta, lapack_complex_double* vsl, lapack_int ldvsl,
-                              lapack_complex_double* vsr, lapack_int ldvsr, lapack_complex_double* work,
-                              lapack_int lwork, double* rwork, lapack_logical* bwork);
-
-lapack_int LAPACKE_sggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg,
-                               char sense, lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb,
-                               lapack_int* sdim, float* alphar, float* alphai, float* beta, float* vsl,
-                               lapack_int ldvsl, float* vsr, lapack_int ldvsr, float* rconde, float* rcondv,
-                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork,
-                               lapack_logical* bwork);
-lapack_int LAPACKE_dggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg,
-                               char sense, lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb,
-                               lapack_int* sdim, double* alphar, double* alphai, double* beta, double* vsl,
-                               lapack_int ldvsl, double* vsr, lapack_int ldvsr, double* rconde, double* rcondv,
-                               double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork,
-                               lapack_logical* bwork);
-lapack_int LAPACKE_cggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg,
-                               char sense, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* b, lapack_int ldb, lapack_int* sdim, lapack_complex_float* alpha,
-                               lapack_complex_float* beta, lapack_complex_float* vsl, lapack_int ldvsl,
-                               lapack_complex_float* vsr, lapack_int ldvsr, float* rconde, float* rcondv,
-                               lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_int* iwork,
-                               lapack_int liwork, lapack_logical* bwork);
-lapack_int LAPACKE_zggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg,
-                               char sense, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* b, lapack_int ldb, lapack_int* sdim, lapack_complex_double* alpha,
-                               lapack_complex_double* beta, lapack_complex_double* vsl, lapack_int ldvsl,
-                               lapack_complex_double* vsr, lapack_int ldvsr, double* rconde, double* rcondv,
-                               lapack_complex_double* work, lapack_int lwork, double* rwork, lapack_int* iwork,
-                               lapack_int liwork, lapack_logical* bwork);
-
-lapack_int LAPACKE_sggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda,
-                              float* b, lapack_int ldb, float* alphar, float* alphai, float* beta, float* vl,
-                              lapack_int ldvl, float* vr, lapack_int ldvr, float* work, lapack_int lwork);
-lapack_int LAPACKE_dggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda,
-                              double* b, lapack_int ldb, double* alphar, double* alphai, double* beta, double* vl,
-                              lapack_int ldvl, double* vr, lapack_int ldvr, double* work, lapack_int lwork);
-lapack_int LAPACKE_cggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
-                              lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
-                              lapack_complex_float* beta, lapack_complex_float* vl, lapack_int ldvl,
-                              lapack_complex_float* vr, lapack_int ldvr, lapack_complex_float* work, lapack_int lwork,
-                              float* rwork);
-lapack_int LAPACKE_zggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
-                              lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
-                              lapack_complex_double* beta, lapack_complex_double* vl, lapack_int ldvl,
-                              lapack_complex_double* vr, lapack_int ldvr, lapack_complex_double* work, lapack_int lwork,
-                              double* rwork);
-
-lapack_int LAPACKE_sggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               float* a, lapack_int lda, float* b, lapack_int ldb, float* alphar, float* alphai,
-                               float* beta, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int* ilo,
-                               lapack_int* ihi, float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde,
-                               float* rcondv, float* work, lapack_int lwork, lapack_int* iwork, lapack_logical* bwork);
-lapack_int LAPACKE_dggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               double* a, lapack_int lda, double* b, lapack_int ldb, double* alphar, double* alphai,
-                               double* beta, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int* ilo,
-                               lapack_int* ihi, double* lscale, double* rscale, double* abnrm, double* bbnrm,
-                               double* rconde, double* rcondv, double* work, lapack_int lwork, lapack_int* iwork,
-                               lapack_logical* bwork);
-lapack_int LAPACKE_cggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vl,
-                               lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr, lapack_int* ilo,
-                               lapack_int* ihi, float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde,
-                               float* rcondv, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_int* iwork, lapack_logical* bwork);
-lapack_int LAPACKE_zggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* vl,
-                               lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr, lapack_int* ilo,
-                               lapack_int* ihi, double* lscale, double* rscale, double* abnrm, double* bbnrm,
-                               double* rconde, double* rcondv, lapack_complex_double* work, lapack_int lwork,
-                               double* rwork, lapack_int* iwork, lapack_logical* bwork);
-
-lapack_int LAPACKE_sggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
-                               float* b, lapack_int ldb, float* d, float* x, float* y, float* work, lapack_int lwork);
-lapack_int LAPACKE_dggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
-                               double* b, lapack_int ldb, double* d, double* x, double* y, double* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_cggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* d,
-                               lapack_complex_float* x, lapack_complex_float* y, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* d,
-                               lapack_complex_double* x, lapack_complex_double* y, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_sgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               float* a, lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z,
-                               lapack_int ldz);
-lapack_int LAPACKE_dgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               double* a, lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq,
-                               double* z, lapack_int ldz);
-lapack_int LAPACKE_cgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz);
-lapack_int LAPACKE_zgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz);
-
-lapack_int LAPACKE_sgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, float* a, lapack_int lda,
-                               float* b, lapack_int ldb, float* c, float* d, float* x, float* work, lapack_int lwork);
-lapack_int LAPACKE_dgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, double* a, lapack_int lda,
-                               double* b, lapack_int ldb, double* c, double* d, double* x, double* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_cgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* c,
-                               lapack_complex_float* d, lapack_complex_float* x, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* c,
-                               lapack_complex_double* d, lapack_complex_double* x, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_sggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
-                               float* taua, float* b, lapack_int ldb, float* taub, float* work, lapack_int lwork);
-lapack_int LAPACKE_dggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
-                               double* taua, double* b, lapack_int ldb, double* taub, double* work, lapack_int lwork);
-lapack_int LAPACKE_cggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* taub, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* taub, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, float* a, lapack_int lda,
-                               float* taua, float* b, lapack_int ldb, float* taub, float* work, lapack_int lwork);
-lapack_int LAPACKE_dggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, double* a, lapack_int lda,
-                               double* taua, double* b, lapack_int ldb, double* taub, double* work, lapack_int lwork);
-lapack_int LAPACKE_cggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* taub, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* taub, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
-                               lapack_int p, lapack_int* k, lapack_int* l, float* a, lapack_int lda, float* b,
-                               lapack_int ldb, float* alpha, float* beta, float* u, lapack_int ldu, float* v,
-                               lapack_int ldv, float* q, lapack_int ldq, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
-                               lapack_int p, lapack_int* k, lapack_int* l, double* a, lapack_int lda, double* b,
-                               lapack_int ldb, double* alpha, double* beta, double* u, lapack_int ldu, double* v,
-                               lapack_int ldv, double* q, lapack_int ldq, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
-                               lapack_int p, lapack_int* k, lapack_int* l, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* b, lapack_int ldb, float* alpha, float* beta,
-                               lapack_complex_float* u, lapack_int ldu, lapack_complex_float* v, lapack_int ldv,
-                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* work, float* rwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_zggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
-                               lapack_int p, lapack_int* k, lapack_int* l, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* b, lapack_int ldb, double* alpha, double* beta,
-                               lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv,
-                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* work, double* rwork,
-                               lapack_int* iwork);
-
-lapack_int LAPACKE_sggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb, float tola, float tolb,
-                               lapack_int* k, lapack_int* l, float* u, lapack_int ldu, float* v, lapack_int ldv,
-                               float* q, lapack_int ldq, lapack_int* iwork, float* tau, float* work);
-lapack_int LAPACKE_dggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb, double tola,
-                               double tolb, lapack_int* k, lapack_int* l, double* u, lapack_int ldu, double* v,
-                               lapack_int ldv, double* q, lapack_int ldq, lapack_int* iwork, double* tau, double* work);
-lapack_int LAPACKE_cggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
-                               lapack_int ldb, float tola, float tolb, lapack_int* k, lapack_int* l,
-                               lapack_complex_float* u, lapack_int ldu, lapack_complex_float* v, lapack_int ldv,
-                               lapack_complex_float* q, lapack_int ldq, lapack_int* iwork, float* rwork,
-                               lapack_complex_float* tau, lapack_complex_float* work);
-lapack_int LAPACKE_zggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
-                               lapack_int ldb, double tola, double tolb, lapack_int* k, lapack_int* l,
-                               lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv,
-                               lapack_complex_double* q, lapack_int ldq, lapack_int* iwork, double* rwork,
-                               lapack_complex_double* tau, lapack_complex_double* work);
-
-lapack_int LAPACKE_sgtcon_work(char norm, lapack_int n, const float* dl, const float* d, const float* du,
-                               const float* du2, const lapack_int* ipiv, float anorm, float* rcond, float* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dgtcon_work(char norm, lapack_int n, const double* dl, const double* d, const double* du,
-                               const double* du2, const lapack_int* ipiv, double anorm, double* rcond, double* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_cgtcon_work(char norm, lapack_int n, const lapack_complex_float* dl, const lapack_complex_float* d,
-                               const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
-                               float anorm, float* rcond, lapack_complex_float* work);
-lapack_int LAPACKE_zgtcon_work(char norm, lapack_int n, const lapack_complex_double* dl, const lapack_complex_double* d,
-                               const lapack_complex_double* du, const lapack_complex_double* du2,
-                               const lapack_int* ipiv, double anorm, double* rcond, lapack_complex_double* work);
-
-lapack_int LAPACKE_sgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl,
-                               const float* d, const float* du, const float* dlf, const float* df, const float* duf,
-                               const float* du2, const lapack_int* ipiv, const float* b, lapack_int ldb, float* x,
-                               lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
-                               const double* d, const double* du, const double* dlf, const double* df,
-                               const double* duf, const double* du2, const lapack_int* ipiv, const double* b,
-                               lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr, double* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_cgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* dl, const lapack_complex_float* d,
-                               const lapack_complex_float* du, const lapack_complex_float* dlf,
-                               const lapack_complex_float* df, const lapack_complex_float* duf,
-                               const lapack_complex_float* du2, const lapack_int* ipiv, const lapack_complex_float* b,
-                               lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* dl, const lapack_complex_double* d,
-                               const lapack_complex_double* du, const lapack_complex_double* dlf,
-                               const lapack_complex_double* df, const lapack_complex_double* duf,
-                               const lapack_complex_double* du2, const lapack_int* ipiv, const lapack_complex_double* b,
-                               lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* dl, float* d, float* du, float* b,
-                              lapack_int ldb);
-lapack_int LAPACKE_dgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* dl, double* d, double* du,
-                              double* b, lapack_int ldb);
-lapack_int LAPACKE_cgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* dl,
-                              lapack_complex_float* d, lapack_complex_float* du, lapack_complex_float* b,
-                              lapack_int ldb);
-lapack_int LAPACKE_zgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* dl,
-                              lapack_complex_double* d, lapack_complex_double* du, lapack_complex_double* b,
-                              lapack_int ldb);
-
-lapack_int LAPACKE_sgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const float* dl,
-                               const float* d, const float* du, float* dlf, float* df, float* duf, float* du2,
-                               lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
-                               float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const double* dl,
-                               const double* d, const double* du, double* dlf, double* df, double* duf, double* du2,
-                               lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                               double* rcond, double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* dl, const lapack_complex_float* d,
-                               const lapack_complex_float* du, lapack_complex_float* dlf, lapack_complex_float* df,
-                               lapack_complex_float* duf, lapack_complex_float* du2, lapack_int* ipiv,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* dl, const lapack_complex_double* d,
-                               const lapack_complex_double* du, lapack_complex_double* dlf, lapack_complex_double* df,
-                               lapack_complex_double* duf, lapack_complex_double* du2, lapack_int* ipiv,
-                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sgttrf_work(lapack_int n, float* dl, float* d, float* du, float* du2, lapack_int* ipiv);
-lapack_int LAPACKE_dgttrf_work(lapack_int n, double* dl, double* d, double* du, double* du2, lapack_int* ipiv);
-lapack_int LAPACKE_cgttrf_work(lapack_int n, lapack_complex_float* dl, lapack_complex_float* d,
-                               lapack_complex_float* du, lapack_complex_float* du2, lapack_int* ipiv);
-lapack_int LAPACKE_zgttrf_work(lapack_int n, lapack_complex_double* dl, lapack_complex_double* d,
-                               lapack_complex_double* du, lapack_complex_double* du2, lapack_int* ipiv);
-
-lapack_int LAPACKE_sgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl,
-                               const float* d, const float* du, const float* du2, const lapack_int* ipiv, float* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_dgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
-                               const double* d, const double* du, const double* du2, const lapack_int* ipiv, double* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_cgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* dl, const lapack_complex_float* d,
-                               const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
-                               lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* dl, const lapack_complex_double* d,
-                               const lapack_complex_double* du, const lapack_complex_double* du2,
-                               const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
-                              lapack_complex_float* ab, lapack_int ldab, float* w, lapack_complex_float* z,
-                              lapack_int ldz, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zhbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
-                              lapack_complex_double* ab, lapack_int ldab, double* w, lapack_complex_double* z,
-                              lapack_int ldz, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_chbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
-                               lapack_complex_float* ab, lapack_int ldab, float* w, lapack_complex_float* z,
-                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zhbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
-                               lapack_complex_double* ab, lapack_int ldab, double* w, lapack_complex_double* z,
-                               lapack_int ldz, lapack_complex_double* work, lapack_int lwork, double* rwork,
-                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_chbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
-                               lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* q, lapack_int ldq,
-                               float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, float* rwork,
-                               lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_zhbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
-                               lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* q, lapack_int ldq,
-                               double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m,
-                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
-                               double* rwork, lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_chbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* bb,
-                               lapack_int ldbb, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* work,
-                               float* rwork);
-lapack_int LAPACKE_zhbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* bb,
-                               lapack_int ldbb, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* work,
-                               double* rwork);
-
-lapack_int LAPACKE_chbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                              lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb,
-                              float* w, lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work,
-                              float* rwork);
-lapack_int LAPACKE_zhbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                              lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
-                              double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
-                              double* rwork);
-
-lapack_int LAPACKE_chbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb,
-                               float* w, lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work,
-                               lapack_int lwork, float* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zhbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
-                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
-                               lapack_int lwork, double* rwork, lapack_int lrwork, lapack_int* iwork,
-                               lapack_int liwork);
-
-lapack_int LAPACKE_chbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                               lapack_int kb, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb,
-                               lapack_int ldbb, lapack_complex_float* q, lapack_int ldq, float vl, float vu,
-                               lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, float* rwork,
-                               lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_zhbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                               lapack_int kb, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb,
-                               lapack_int ldbb, lapack_complex_double* q, lapack_int ldq, double vl, double vu,
-                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, double* rwork,
-                               lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_chbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd,
-                               lapack_complex_float* ab, lapack_int ldab, float* d, float* e, lapack_complex_float* q,
-                               lapack_int ldq, lapack_complex_float* work);
-lapack_int LAPACKE_zhbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd,
-                               lapack_complex_double* ab, lapack_int ldab, double* d, double* e,
-                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* work);
-
-lapack_int LAPACKE_checon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
-lapack_int LAPACKE_zhecon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, const lapack_int* ipiv, double anorm, double* rcond,
-                               lapack_complex_double* work);
-
-lapack_int LAPACKE_cheequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a,
-                                lapack_int lda, float* s, float* scond, float* amax, lapack_complex_float* work);
-lapack_int LAPACKE_zheequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
-                                lapack_int lda, double* s, double* scond, double* amax, lapack_complex_double* work);
-
-lapack_int LAPACKE_cheev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a,
-                              lapack_int lda, float* w, lapack_complex_float* work, lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zheev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a,
-                              lapack_int lda, double* w, lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_cheevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a,
-                               lapack_int lda, float* w, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zheevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a,
-                               lapack_int lda, double* w, lapack_complex_double* work, lapack_int lwork, double* rwork,
-                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_cheevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, float vl, float vu, lapack_int il,
-                               lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
-                               lapack_int ldz, lapack_int* isuppz, lapack_complex_float* work, lapack_int lwork,
-                               float* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zheevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, double vl, double vu, lapack_int il,
-                               lapack_int iu, double abstol, lapack_int* m, double* w, lapack_complex_double* z,
-                               lapack_int ldz, lapack_int* isuppz, lapack_complex_double* work, lapack_int lwork,
-                               double* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_cheevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, float vl, float vu, lapack_int il,
-                               lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
-                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_zheevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, double vl, double vu, lapack_int il,
-                               lapack_int iu, double abstol, lapack_int* m, double* w, lapack_complex_double* z,
-                               lapack_int ldz, lapack_complex_double* work, lapack_int lwork, double* rwork,
-                               lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_chegst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a,
-                               lapack_int lda, const lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhegst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a,
-                               lapack_int lda, const lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chegv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                              lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                              float* w, lapack_complex_float* work, lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zhegv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                              lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                              double* w, lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_chegvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                               float* w, lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_int lrwork,
-                               lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zhegvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                               double* w, lapack_complex_double* work, lapack_int lwork, double* rwork,
-                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_chegvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                               float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, lapack_int lwork,
-                               float* rwork, lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_zhegvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                               double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m,
-                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
-                               lapack_int lwork, double* rwork, lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_cherfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zherfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_cherfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                                lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
-                                lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zherfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                                lapack_int ldaf, const lapack_int* ipiv, const double* s,
-                                const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                                lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
-                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
-                                lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_chesv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                              lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb,
-                              lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zhesv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                              lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb,
-                              lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_chesvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                               lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
-                               lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zhesvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af,
-                               lapack_int ldaf, lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
-                               lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_chesvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                                lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
-                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zhesvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                                lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
-                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_chetrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               float* d, float* e, lapack_complex_float* tau, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zhetrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               double* d, double* e, lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_chetrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zhetrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_chetri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               const lapack_int* ipiv, lapack_complex_float* work);
-lapack_int LAPACKE_zhetri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               const lapack_int* ipiv, lapack_complex_double* work);
-
-lapack_int LAPACKE_chetrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
-                               lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhetrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
-                               lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
-                              float alpha, const lapack_complex_float* a, lapack_int lda, float beta,
-                              lapack_complex_float* c);
-lapack_int LAPACKE_zhfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
-                              double alpha, const lapack_complex_double* a, lapack_int lda, double beta,
-                              lapack_complex_double* c);
-
-lapack_int LAPACKE_shgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, float* h, lapack_int ldh, float* t, lapack_int ldt, float* alphar,
-                               float* alphai, float* beta, float* q, lapack_int ldq, float* z, lapack_int ldz,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dhgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, double* h, lapack_int ldh, double* t, lapack_int ldt, double* alphar,
-                               double* alphai, double* beta, double* q, lapack_int ldq, double* z, lapack_int ldz,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_chgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, lapack_complex_float* h, lapack_int ldh, lapack_complex_float* t,
-                               lapack_int ldt, lapack_complex_float* alpha, lapack_complex_float* beta,
-                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz,
-                               lapack_complex_float* work, lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zhgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, lapack_complex_double* h, lapack_int ldh, lapack_complex_double* t,
-                               lapack_int ldt, lapack_complex_double* alpha, lapack_complex_double* beta,
-                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz,
-                               lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_chpcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
-lapack_int LAPACKE_zhpcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                               const lapack_int* ipiv, double anorm, double* rcond, lapack_complex_double* work);
-
-lapack_int LAPACKE_chpev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
-                              lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zhpev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap,
-                              double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
-                              double* rwork);
-
-lapack_int LAPACKE_chpevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
-                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, lapack_int lwork,
-                               float* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zhpevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap,
-                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
-                               lapack_int lwork, double* rwork, lapack_int lrwork, lapack_int* iwork,
-                               lapack_int liwork);
-
-lapack_int LAPACKE_chpevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_float* ap, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
-                               lapack_int* m, float* w, lapack_complex_float* z, lapack_int ldz,
-                               lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_zhpevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_double* ap, double vl, double vu, lapack_int il, lapack_int iu,
-                               double abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int ldz,
-                               lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_chpgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* ap,
-                               const lapack_complex_float* bp);
-lapack_int LAPACKE_zhpgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* ap,
-                               const lapack_complex_double* bp);
-
-lapack_int LAPACKE_chpgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                              lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
-                              lapack_int ldz, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zhpgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                              lapack_complex_double* ap, lapack_complex_double* bp, double* w, lapack_complex_double* z,
-                              lapack_int ldz, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_chpgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                               lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
-                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zhpgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
-                               lapack_complex_double* ap, lapack_complex_double* bp, double* w,
-                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, lapack_int lwork,
-                               double* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_chpgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_float* ap, lapack_complex_float* bp, float vl, float vu, lapack_int il,
-                               lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
-                               lapack_int ldz, lapack_complex_float* work, float* rwork, lapack_int* iwork,
-                               lapack_int* ifail);
-lapack_int LAPACKE_zhpgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               lapack_complex_double* ap, lapack_complex_double* bp, double vl, double vu,
-                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, double* rwork,
-                               lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_chprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, const lapack_complex_float* afp, const lapack_int* ipiv,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zhprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, const lapack_complex_double* afp,
-                               const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_chpsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
-                              lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zhpsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
-                              lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_chpsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zhpsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
-                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_chptrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, float* d, float* e,
-                               lapack_complex_float* tau);
-lapack_int LAPACKE_zhptrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, double* d,
-                               double* e, lapack_complex_double* tau);
-
-lapack_int LAPACKE_chptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
-lapack_int LAPACKE_zhptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
-
-lapack_int LAPACKE_chptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap,
-                               const lapack_int* ipiv, lapack_complex_float* work);
-lapack_int LAPACKE_zhptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap,
-                               const lapack_int* ipiv, lapack_complex_double* work);
-
-lapack_int LAPACKE_chptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, const lapack_int* ipiv, lapack_complex_float* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_zhptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, const lapack_int* ipiv, lapack_complex_double* b,
-                               lapack_int ldb);
-
-lapack_int LAPACKE_shsein_work(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select,
-                               lapack_int n, const float* h, lapack_int ldh, float* wr, const float* wi, float* vl,
-                               lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, float* work,
-                               lapack_int* ifaill, lapack_int* ifailr);
-lapack_int LAPACKE_dhsein_work(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select,
-                               lapack_int n, const double* h, lapack_int ldh, double* wr, const double* wi, double* vl,
-                               lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, double* work,
-                               lapack_int* ifaill, lapack_int* ifailr);
-lapack_int LAPACKE_chsein_work(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
-                               lapack_int n, const lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w,
-                               lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
-                               lapack_int mm, lapack_int* m, lapack_complex_float* work, float* rwork,
-                               lapack_int* ifaill, lapack_int* ifailr);
-lapack_int LAPACKE_zhsein_work(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
-                               lapack_int n, const lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w,
-                               lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr,
-                               lapack_int mm, lapack_int* m, lapack_complex_double* work, double* rwork,
-                               lapack_int* ifaill, lapack_int* ifailr);
-
-lapack_int LAPACKE_shseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               float* h, lapack_int ldh, float* wr, float* wi, float* z, lapack_int ldz, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dhseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               double* h, lapack_int ldh, double* wr, double* wi, double* z, lapack_int ldz,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_chseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w,
-                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zhseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
-                               lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w,
-                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_clacgv_work(lapack_int n, lapack_complex_float* x, lapack_int incx);
-lapack_int LAPACKE_zlacgv_work(lapack_int n, lapack_complex_double* x, lapack_int incx);
-
-lapack_int LAPACKE_slacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const float* a, lapack_int lda,
-                               float* b, lapack_int ldb);
-lapack_int LAPACKE_dlacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const double* a, lapack_int lda,
-                               double* b, lapack_int ldb);
-lapack_int LAPACKE_clacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zlacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_zlag2c_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_float* sa, lapack_int ldsa);
-
-lapack_int LAPACKE_slag2d_work(int matrix_order, lapack_int m, lapack_int n, const float* sa, lapack_int ldsa,
-                               double* a, lapack_int lda);
-
-lapack_int LAPACKE_dlag2s_work(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, float* sa,
-                               lapack_int ldsa);
-
-lapack_int LAPACKE_clag2z_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* sa,
-                               lapack_int ldsa, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_slagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const float* d, float* a, lapack_int lda, lapack_int* iseed, float* work);
-lapack_int LAPACKE_dlagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const double* d, double* a, lapack_int lda, lapack_int* iseed, double* work);
-lapack_int LAPACKE_clagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const float* d, lapack_complex_float* a, lapack_int lda, lapack_int* iseed,
-                               lapack_complex_float* work);
-lapack_int LAPACKE_zlagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
-                               const double* d, lapack_complex_double* a, lapack_int lda, lapack_int* iseed,
-                               lapack_complex_double* work);
-
-lapack_int LAPACKE_claghe_work(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
-                               lapack_int lda, lapack_int* iseed, lapack_complex_float* work);
-lapack_int LAPACKE_zlaghe_work(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
-                               lapack_int lda, lapack_int* iseed, lapack_complex_double* work);
-
-lapack_int LAPACKE_slagsy_work(int matrix_order, lapack_int n, lapack_int k, const float* d, float* a, lapack_int lda,
-                               lapack_int* iseed, float* work);
-lapack_int LAPACKE_dlagsy_work(int matrix_order, lapack_int n, lapack_int k, const double* d, double* a, lapack_int lda,
-                               lapack_int* iseed, double* work);
-lapack_int LAPACKE_clagsy_work(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
-                               lapack_int lda, lapack_int* iseed, lapack_complex_float* work);
-lapack_int LAPACKE_zlagsy_work(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
-                               lapack_int lda, lapack_int* iseed, lapack_complex_double* work);
-
-lapack_int LAPACKE_slapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, float* x,
-                               lapack_int ldx, lapack_int* k);
-lapack_int LAPACKE_dlapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, double* x,
-                               lapack_int ldx, lapack_int* k);
-lapack_int LAPACKE_clapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n,
-                               lapack_complex_float* x, lapack_int ldx, lapack_int* k);
-lapack_int LAPACKE_zlapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n,
-                               lapack_complex_double* x, lapack_int ldx, lapack_int* k);
-
-lapack_int LAPACKE_slartgp_work(float f, float g, float* cs, float* sn, float* r);
-lapack_int LAPACKE_dlartgp_work(double f, double g, double* cs, double* sn, double* r);
-
-lapack_int LAPACKE_slartgs_work(float x, float y, float sigma, float* cs, float* sn);
-lapack_int LAPACKE_dlartgs_work(double x, double y, double sigma, double* cs, double* sn);
-
-float LAPACKE_slapy2_work(float x, float y);
-double LAPACKE_dlapy2_work(double x, double y);
-
-float LAPACKE_slapy3_work(float x, float y, float z);
-double LAPACKE_dlapy3_work(double x, double y, double z);
-
-float LAPACKE_slamch_work(char cmach);
-double LAPACKE_dlamch_work(char cmach);
-
-float LAPACKE_slange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda,
-                          float* work);
-double LAPACKE_dlange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const double* a, lapack_int lda,
-                           double* work);
-float LAPACKE_clange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_float* a,
-                          lapack_int lda, float* work);
-double LAPACKE_zlange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_double* a,
-                           lapack_int lda, double* work);
-
-float LAPACKE_clanhe_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
-                          lapack_int lda, float* work);
-double LAPACKE_zlanhe_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
-                           lapack_int lda, double* work);
-
-float LAPACKE_slansy_work(int matrix_order, char norm, char uplo, lapack_int n, const float* a, lapack_int lda,
-                          float* work);
-double LAPACKE_dlansy_work(int matrix_order, char norm, char uplo, lapack_int n, const double* a, lapack_int lda,
-                           double* work);
-float LAPACKE_clansy_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
-                          lapack_int lda, float* work);
-double LAPACKE_zlansy_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
-                           lapack_int lda, double* work);
-
-float LAPACKE_slantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n, const float* a,
-                          lapack_int lda, float* work);
-double LAPACKE_dlantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
-                           const double* a, lapack_int lda, double* work);
-float LAPACKE_clantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
-                          const lapack_complex_float* a, lapack_int lda, float* work);
-double LAPACKE_zlantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
-                           const lapack_complex_double* a, lapack_int lda, double* work);
-
-lapack_int LAPACKE_slarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, const float* v, lapack_int ldv, const float* t,
-                               lapack_int ldt, float* c, lapack_int ldc, float* work, lapack_int ldwork);
-lapack_int LAPACKE_dlarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, const double* v, lapack_int ldv, const double* t,
-                               lapack_int ldt, double* c, lapack_int ldc, double* work, lapack_int ldwork);
-lapack_int LAPACKE_clarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, const lapack_complex_float* v, lapack_int ldv,
-                               const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* c, lapack_int ldc,
-                               lapack_complex_float* work, lapack_int ldwork);
-lapack_int LAPACKE_zlarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, const lapack_complex_double* v, lapack_int ldv,
-                               const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* c, lapack_int ldc,
-                               lapack_complex_double* work, lapack_int ldwork);
-
-lapack_int LAPACKE_slarfg_work(lapack_int n, float* alpha, float* x, lapack_int incx, float* tau);
-lapack_int LAPACKE_dlarfg_work(lapack_int n, double* alpha, double* x, lapack_int incx, double* tau);
-lapack_int LAPACKE_clarfg_work(lapack_int n, lapack_complex_float* alpha, lapack_complex_float* x, lapack_int incx,
-                               lapack_complex_float* tau);
-lapack_int LAPACKE_zlarfg_work(lapack_int n, lapack_complex_double* alpha, lapack_complex_double* x, lapack_int incx,
-                               lapack_complex_double* tau);
-
-lapack_int LAPACKE_slarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const float* v,
-                               lapack_int ldv, const float* tau, float* t, lapack_int ldt);
-lapack_int LAPACKE_dlarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const double* v,
-                               lapack_int ldv, const double* tau, double* t, lapack_int ldt);
-lapack_int LAPACKE_clarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
-                               const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* tau,
-                               lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_zlarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
-                               const lapack_complex_double* v, lapack_int ldv, const lapack_complex_double* tau,
-                               lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_slarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const float* v, float tau,
-                               float* c, lapack_int ldc, float* work);
-lapack_int LAPACKE_dlarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const double* v, double tau,
-                               double* c, lapack_int ldc, double* work);
-lapack_int LAPACKE_clarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_float* v,
-                               lapack_complex_float tau, lapack_complex_float* c, lapack_int ldc,
-                               lapack_complex_float* work);
-lapack_int LAPACKE_zlarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_double* v,
-                               lapack_complex_double tau, lapack_complex_double* c, lapack_int ldc,
-                               lapack_complex_double* work);
-
-lapack_int LAPACKE_slarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, float* x);
-lapack_int LAPACKE_dlarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, double* x);
-lapack_int LAPACKE_clarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_float* x);
-lapack_int LAPACKE_zlarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_double* x);
-
-lapack_int LAPACKE_slaset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, float alpha, float beta,
-                               float* a, lapack_int lda);
-lapack_int LAPACKE_dlaset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, double alpha, double beta,
-                               double* a, lapack_int lda);
-lapack_int LAPACKE_claset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_float alpha,
-                               lapack_complex_float beta, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zlaset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_double alpha,
-                               lapack_complex_double beta, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_slasrt_work(char id, lapack_int n, float* d);
-lapack_int LAPACKE_dlasrt_work(char id, lapack_int n, double* d);
-
-lapack_int LAPACKE_slaswp_work(int matrix_order, lapack_int n, float* a, lapack_int lda, lapack_int k1, lapack_int k2,
-                               const lapack_int* ipiv, lapack_int incx);
-lapack_int LAPACKE_dlaswp_work(int matrix_order, lapack_int n, double* a, lapack_int lda, lapack_int k1, lapack_int k2,
-                               const lapack_int* ipiv, lapack_int incx);
-lapack_int LAPACKE_claswp_work(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int k1,
-                               lapack_int k2, const lapack_int* ipiv, lapack_int incx);
-lapack_int LAPACKE_zlaswp_work(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int k1,
-                               lapack_int k2, const lapack_int* ipiv, lapack_int incx);
-
-lapack_int LAPACKE_slatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                               float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku,
-                               char pack, float* a, lapack_int lda, float* work);
-lapack_int LAPACKE_dlatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                               double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku,
-                               char pack, double* a, lapack_int lda, double* work);
-lapack_int LAPACKE_clatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                               float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku,
-                               char pack, lapack_complex_float* a, lapack_int lda, lapack_complex_float* work);
-lapack_int LAPACKE_zlatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
-                               double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku,
-                               char pack, lapack_complex_double* a, lapack_int lda, lapack_complex_double* work);
-
-lapack_int LAPACKE_slauum_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
-lapack_int LAPACKE_dlauum_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
-lapack_int LAPACKE_clauum_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zlauum_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_sopgtr_work(int matrix_order, char uplo, lapack_int n, const float* ap, const float* tau, float* q,
-                               lapack_int ldq, float* work);
-lapack_int LAPACKE_dopgtr_work(int matrix_order, char uplo, lapack_int n, const double* ap, const double* tau,
-                               double* q, lapack_int ldq, double* work);
-
-lapack_int LAPACKE_sopmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const float* ap, const float* tau, float* c, lapack_int ldc, float* work);
-lapack_int LAPACKE_dopmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const double* ap, const double* tau, double* c, lapack_int ldc, double* work);
-
-lapack_int LAPACKE_sorgbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, float* a,
-                               lapack_int lda, const float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dorgbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, double* a,
-                               lapack_int lda, const double* tau, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sorghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
-                               const float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dorghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a,
-                               lapack_int lda, const double* tau, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sorglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                               const float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dorglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                               const double* tau, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sorgql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                               const float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dorgql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                               const double* tau, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sorgqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                               const float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dorgqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                               const double* tau, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sorgrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
-                               const float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dorgrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
-                               const double* tau, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sorgtr_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const float* tau,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dorgtr_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau,
-                               double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
-                               lapack_int k, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dormbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
-                               lapack_int k, const double* a, lapack_int lda, const double* tau, double* c,
-                               lapack_int ldc, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, const float* a, lapack_int lda, const float* tau, float* c,
-                               lapack_int ldc, float* work, lapack_int lwork);
-lapack_int LAPACKE_dormhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, const double* a, lapack_int lda, const double* tau, double* c,
-                               lapack_int ldc, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dormlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
-                               double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dormql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
-                               double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dormqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
-                               double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dormrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
-                               double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               lapack_int l, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dormrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               lapack_int l, const double* a, lapack_int lda, const double* tau, double* c,
-                               lapack_int ldc, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_sormtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dormtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
-                               double* work, lapack_int lwork);
-
-lapack_int LAPACKE_spbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab,
-                               lapack_int ldab, float anorm, float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dpbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab,
-                               lapack_int ldab, double anorm, double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cpbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
-                               lapack_int ldab, float anorm, float* rcond, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zpbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd,
-                               const lapack_complex_double* ab, lapack_int ldab, double anorm, double* rcond,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_spbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab,
-                               lapack_int ldab, float* s, float* scond, float* amax);
-lapack_int LAPACKE_dpbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab,
-                               lapack_int ldab, double* s, double* scond, double* amax);
-lapack_int LAPACKE_cpbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
-                               lapack_int ldab, float* s, float* scond, float* amax);
-lapack_int LAPACKE_zpbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd,
-                               const lapack_complex_double* ab, lapack_int ldab, double* s, double* scond,
-                               double* amax);
-
-lapack_int LAPACKE_spbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb, const float* b,
-                               lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr, float* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dpbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb, const double* b,
-                               lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr, double* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_cpbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* afb,
-                               lapack_int ldafb, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                               lapack_int ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zpbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* afb,
-                               lapack_int ldafb, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_spbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, float* bb, lapack_int ldbb);
-lapack_int LAPACKE_dpbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, double* bb, lapack_int ldbb);
-lapack_int LAPACKE_cpbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_float* bb,
-                               lapack_int ldbb);
-lapack_int LAPACKE_zpbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_double* bb,
-                               lapack_int ldbb);
-
-lapack_int LAPACKE_spbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, float* ab,
-                              lapack_int ldab, float* b, lapack_int ldb);
-lapack_int LAPACKE_dpbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, double* ab,
-                              lapack_int ldab, double* b, lapack_int ldb);
-lapack_int LAPACKE_cpbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                              lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                              lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_spbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               float* ab, lapack_int ldab, float* afb, lapack_int ldafb, char* equed, float* s,
-                               float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr,
-                               float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dpbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               double* ab, lapack_int ldab, double* afb, lapack_int ldafb, char* equed, double* s,
-                               double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr,
-                               double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cpbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb, lapack_int ldafb,
-                               char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
-                               float* rwork);
-lapack_int LAPACKE_zpbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb, lapack_int ldafb,
-                               char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_spbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, float* ab, lapack_int ldab);
-lapack_int LAPACKE_dpbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, double* ab, lapack_int ldab);
-lapack_int LAPACKE_cpbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
-                               lapack_int ldab);
-lapack_int LAPACKE_zpbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_double* ab,
-                               lapack_int ldab);
-
-lapack_int LAPACKE_spbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const float* ab, lapack_int ldab, float* b, lapack_int ldb);
-lapack_int LAPACKE_dpbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const double* ab, lapack_int ldab, double* b, lapack_int ldb);
-lapack_int LAPACKE_cpbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_zpbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
-                               const lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b,
-                               lapack_int ldb);
-
-lapack_int LAPACKE_spftrf_work(int matrix_order, char transr, char uplo, lapack_int n, float* a);
-lapack_int LAPACKE_dpftrf_work(int matrix_order, char transr, char uplo, lapack_int n, double* a);
-lapack_int LAPACKE_cpftrf_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
-lapack_int LAPACKE_zpftrf_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
-
-lapack_int LAPACKE_spftri_work(int matrix_order, char transr, char uplo, lapack_int n, float* a);
-lapack_int LAPACKE_dpftri_work(int matrix_order, char transr, char uplo, lapack_int n, double* a);
-lapack_int LAPACKE_cpftri_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
-lapack_int LAPACKE_zpftri_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
-
-lapack_int LAPACKE_spftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                               float* b, lapack_int ldb);
-lapack_int LAPACKE_dpftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                               double* b, lapack_int ldb);
-lapack_int LAPACKE_cpftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_spocon_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float anorm,
-                               float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dpocon_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double anorm,
-                               double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cpocon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                               float anorm, float* rcond, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zpocon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, double anorm, double* rcond, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_spoequ_work(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
-                               float* amax);
-lapack_int LAPACKE_dpoequ_work(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s,
-                               double* scond, double* amax);
-lapack_int LAPACKE_cpoequ_work(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
-                               float* scond, float* amax);
-lapack_int LAPACKE_zpoequ_work(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                               double* s, double* scond, double* amax);
-
-lapack_int LAPACKE_spoequb_work(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
-                                float* amax);
-lapack_int LAPACKE_dpoequb_work(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s,
-                                double* scond, double* amax);
-lapack_int LAPACKE_cpoequb_work(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
-                                float* scond, float* amax);
-lapack_int LAPACKE_zpoequb_work(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda,
-                                double* s, double* scond, double* amax);
-
-lapack_int LAPACKE_sporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                               lapack_int lda, const float* af, lapack_int ldaf, const float* b, lapack_int ldb,
-                               float* x, lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                               lapack_int lda, const double* af, lapack_int ldaf, const double* b, lapack_int ldb,
-                               double* x, lapack_int ldx, double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                               lapack_int ldaf, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                               lapack_int ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                               lapack_int ldaf, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
-                                lapack_int lda, const float* af, lapack_int ldaf, const float* s, const float* b,
-                                lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
-                                lapack_int lda, const double* af, lapack_int ldaf, const double* s, const double* b,
-                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                                lapack_int ldaf, const float* s, const lapack_complex_float* b, lapack_int ldb,
-                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                                lapack_int ldaf, const double* s, const lapack_complex_double* b, lapack_int ldb,
-                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                              float* b, lapack_int ldb);
-lapack_int LAPACKE_dposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                              double* b, lapack_int ldb);
-lapack_int LAPACKE_cposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                              lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                              lapack_int lda, lapack_complex_double* b, lapack_int ldb);
-lapack_int LAPACKE_dsposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                               double* b, lapack_int ldb, double* x, lapack_int ldx, double* work, float* swork,
-                               lapack_int* iter);
-lapack_int LAPACKE_zcposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                               lapack_int ldx, lapack_complex_double* work, lapack_complex_float* swork, double* rwork,
-                               lapack_int* iter);
-
-lapack_int LAPACKE_sposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
-                               lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b,
-                               lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
-                               float* work, lapack_int* iwork);
-lapack_int LAPACKE_dposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
-                               lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
-                               lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
-                               double* work, lapack_int* iwork);
-lapack_int LAPACKE_cposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                               char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
-                               float* rwork);
-lapack_int LAPACKE_zposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                               char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
-                                lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b,
-                                lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
-                                lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
-                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                                char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                                lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
-                                float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params,
-                                lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                                char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
-                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_spotrf_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
-lapack_int LAPACKE_dpotrf_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
-lapack_int LAPACKE_cpotrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zpotrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_spotri_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
-lapack_int LAPACKE_dpotri_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
-lapack_int LAPACKE_cpotri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zpotri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_spotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                               lapack_int lda, float* b, lapack_int ldb);
-lapack_int LAPACKE_dpotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                               lapack_int lda, double* b, lapack_int ldb);
-lapack_int LAPACKE_cpotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
-                               lapack_int ldb);
-
-lapack_int LAPACKE_sppcon_work(int matrix_order, char uplo, lapack_int n, const float* ap, float anorm, float* rcond,
-                               float* work, lapack_int* iwork);
-lapack_int LAPACKE_dppcon_work(int matrix_order, char uplo, lapack_int n, const double* ap, double anorm, double* rcond,
-                               double* work, lapack_int* iwork);
-lapack_int LAPACKE_cppcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float anorm,
-                               float* rcond, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zppcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double anorm,
-                               double* rcond, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sppequ_work(int matrix_order, char uplo, lapack_int n, const float* ap, float* s, float* scond,
-                               float* amax);
-lapack_int LAPACKE_dppequ_work(int matrix_order, char uplo, lapack_int n, const double* ap, double* s, double* scond,
-                               double* amax);
-lapack_int LAPACKE_cppequ_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float* s,
-                               float* scond, float* amax);
-lapack_int LAPACKE_zppequ_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double* s,
-                               double* scond, double* amax);
-
-lapack_int LAPACKE_spprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
-                               const float* afp, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr,
-                               float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dpprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                               const double* afp, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                               double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cpprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, const lapack_complex_float* afp,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zpprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, const lapack_complex_double* afp,
-                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, float* b,
-                              lapack_int ldb);
-lapack_int LAPACKE_dppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, double* b,
-                              lapack_int ldb);
-lapack_int LAPACKE_cppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
-                              lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
-                              lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* ap,
-                               float* afp, char* equed, float* s, float* b, lapack_int ldb, float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* ap,
-                               double* afp, char* equed, double* s, double* b, lapack_int ldb, double* x,
-                               lapack_int ldx, double* rcond, double* ferr, double* berr, double* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_cppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               lapack_complex_float* ap, lapack_complex_float* afp, char* equed, float* s,
-                               lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               lapack_complex_double* ap, lapack_complex_double* afp, char* equed, double* s,
-                               lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_spptrf_work(int matrix_order, char uplo, lapack_int n, float* ap);
-lapack_int LAPACKE_dpptrf_work(int matrix_order, char uplo, lapack_int n, double* ap);
-lapack_int LAPACKE_cpptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
-lapack_int LAPACKE_zpptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
-
-lapack_int LAPACKE_spptri_work(int matrix_order, char uplo, lapack_int n, float* ap);
-lapack_int LAPACKE_dpptri_work(int matrix_order, char uplo, lapack_int n, double* ap);
-lapack_int LAPACKE_cpptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
-lapack_int LAPACKE_zpptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
-
-lapack_int LAPACKE_spptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, float* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_dpptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap, double* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_cpptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_spstrf_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* piv,
-                               lapack_int* rank, float tol, float* work);
-lapack_int LAPACKE_dpstrf_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* piv,
-                               lapack_int* rank, double tol, double* work);
-lapack_int LAPACKE_cpstrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* piv, lapack_int* rank, float tol, float* work);
-lapack_int LAPACKE_zpstrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* piv, lapack_int* rank, double tol, double* work);
-
-lapack_int LAPACKE_sptcon_work(lapack_int n, const float* d, const float* e, float anorm, float* rcond, float* work);
-lapack_int LAPACKE_dptcon_work(lapack_int n, const double* d, const double* e, double anorm, double* rcond,
-                               double* work);
-lapack_int LAPACKE_cptcon_work(lapack_int n, const float* d, const lapack_complex_float* e, float anorm, float* rcond,
-                               float* work);
-lapack_int LAPACKE_zptcon_work(lapack_int n, const double* d, const lapack_complex_double* e, double anorm,
-                               double* rcond, double* work);
-
-lapack_int LAPACKE_spteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
-                               float* work);
-lapack_int LAPACKE_dpteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z,
-                               lapack_int ldz, double* work);
-lapack_int LAPACKE_cpteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
-                               lapack_int ldz, float* work);
-lapack_int LAPACKE_zpteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e,
-                               lapack_complex_double* z, lapack_int ldz, double* work);
-
-lapack_int LAPACKE_sptrfs_work(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e,
-                               const float* df, const float* ef, const float* b, lapack_int ldb, float* x,
-                               lapack_int ldx, float* ferr, float* berr, float* work);
-lapack_int LAPACKE_dptrfs_work(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e,
-                               const double* df, const double* ef, const double* b, lapack_int ldb, double* x,
-                               lapack_int ldx, double* ferr, double* berr, double* work);
-lapack_int LAPACKE_cptrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
-                               const lapack_complex_float* e, const float* df, const lapack_complex_float* ef,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zptrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
-                               const lapack_complex_double* e, const double* df, const lapack_complex_double* ef,
-                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* d, float* e, float* b,
-                              lapack_int ldb);
-lapack_int LAPACKE_dptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* d, double* e, double* b,
-                              lapack_int ldb);
-lapack_int LAPACKE_cptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* d, lapack_complex_float* e,
-                              lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* d, lapack_complex_double* e,
-                              lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d,
-                               const float* e, float* df, float* ef, const float* b, lapack_int ldb, float* x,
-                               lapack_int ldx, float* rcond, float* ferr, float* berr, float* work);
-lapack_int LAPACKE_dptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d,
-                               const double* e, double* df, double* ef, const double* b, lapack_int ldb, double* x,
-                               lapack_int ldx, double* rcond, double* ferr, double* berr, double* work);
-lapack_int LAPACKE_cptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d,
-                               const lapack_complex_float* e, float* df, lapack_complex_float* ef,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d,
-                               const lapack_complex_double* e, double* df, lapack_complex_double* ef,
-                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_spttrf_work(lapack_int n, float* d, float* e);
-lapack_int LAPACKE_dpttrf_work(lapack_int n, double* d, double* e);
-lapack_int LAPACKE_cpttrf_work(lapack_int n, float* d, lapack_complex_float* e);
-lapack_int LAPACKE_zpttrf_work(lapack_int n, double* d, lapack_complex_double* e);
-
-lapack_int LAPACKE_spttrs_work(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e,
-                               float* b, lapack_int ldb);
-lapack_int LAPACKE_dpttrs_work(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e,
-                               double* b, lapack_int ldb);
-lapack_int LAPACKE_cpttrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
-                               const lapack_complex_float* e, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zpttrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
-                               const lapack_complex_double* e, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_ssbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
-                              lapack_int ldab, float* w, float* z, lapack_int ldz, float* work);
-lapack_int LAPACKE_dsbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
-                              lapack_int ldab, double* w, double* z, lapack_int ldz, double* work);
-
-lapack_int LAPACKE_ssbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
-                               lapack_int ldab, float* w, float* z, lapack_int ldz, float* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dsbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
-                               lapack_int ldab, double* w, double* z, lapack_int ldz, double* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_ssbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
-                               float* ab, lapack_int ldab, float* q, lapack_int ldq, float vl, float vu, lapack_int il,
-                               lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
-                               float* work, lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_dsbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
-                               double* ab, lapack_int ldab, double* q, lapack_int ldq, double vl, double vu,
-                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
-                               lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_ssbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               float* ab, lapack_int ldab, const float* bb, lapack_int ldbb, float* x, lapack_int ldx,
-                               float* work);
-lapack_int LAPACKE_dsbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               double* ab, lapack_int ldab, const double* bb, lapack_int ldbb, double* x,
-                               lapack_int ldx, double* work);
-
-lapack_int LAPACKE_ssbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                              float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z,
-                              lapack_int ldz, float* work);
-lapack_int LAPACKE_dsbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                              double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z,
-                              lapack_int ldz, double* work);
-
-lapack_int LAPACKE_ssbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z,
-                               lapack_int ldz, float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dsbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
-                               double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z,
-                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_ssbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                               lapack_int kb, float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* q,
-                               lapack_int ldq, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
-                               lapack_int* m, float* w, float* z, lapack_int ldz, float* work, lapack_int* iwork,
-                               lapack_int* ifail);
-lapack_int LAPACKE_dsbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
-                               lapack_int kb, double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* q,
-                               lapack_int ldq, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
-                               lapack_int* m, double* w, double* z, lapack_int ldz, double* work, lapack_int* iwork,
-                               lapack_int* ifail);
-
-lapack_int LAPACKE_ssbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, float* ab,
-                               lapack_int ldab, float* d, float* e, float* q, lapack_int ldq, float* work);
-lapack_int LAPACKE_dsbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, double* ab,
-                               lapack_int ldab, double* d, double* e, double* q, lapack_int ldq, double* work);
-
-lapack_int LAPACKE_ssfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
-                              float alpha, const float* a, lapack_int lda, float beta, float* c);
-lapack_int LAPACKE_dsfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
-                              double alpha, const double* a, lapack_int lda, double beta, double* c);
-
-lapack_int LAPACKE_sspcon_work(int matrix_order, char uplo, lapack_int n, const float* ap, const lapack_int* ipiv,
-                               float anorm, float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dspcon_work(int matrix_order, char uplo, lapack_int n, const double* ap, const lapack_int* ipiv,
-                               double anorm, double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_cspcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
-lapack_int LAPACKE_zspcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                               const lapack_int* ipiv, double anorm, double* rcond, lapack_complex_double* work);
-
-lapack_int LAPACKE_sspev_work(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
-                              lapack_int ldz, float* work);
-lapack_int LAPACKE_dspev_work(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
-                              lapack_int ldz, double* work);
-
-lapack_int LAPACKE_sspevd_work(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
-                               lapack_int ldz, float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dspevd_work(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
-                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_sspevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* ap, float vl,
-                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
-                               lapack_int ldz, float* work, lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_dspevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* ap, double vl,
-                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                               double* z, lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_sspgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* ap, const float* bp);
-lapack_int LAPACKE_dspgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* ap,
-                               const double* bp);
-
-lapack_int LAPACKE_sspgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap,
-                              float* bp, float* w, float* z, lapack_int ldz, float* work);
-lapack_int LAPACKE_dspgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap,
-                              double* bp, double* w, double* z, lapack_int ldz, double* work);
-
-lapack_int LAPACKE_sspgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap,
-                               float* bp, float* w, float* z, lapack_int ldz, float* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dspgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap,
-                               double* bp, double* w, double* z, lapack_int ldz, double* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_sspgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               float* ap, float* bp, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
-                               lapack_int* m, float* w, float* z, lapack_int ldz, float* work, lapack_int* iwork,
-                               lapack_int* ifail);
-lapack_int LAPACKE_dspgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               double* ap, double* bp, double vl, double vu, lapack_int il, lapack_int iu,
-                               double abstol, lapack_int* m, double* w, double* z, lapack_int ldz, double* work,
-                               lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_ssprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
-                               const float* afp, const lapack_int* ipiv, const float* b, lapack_int ldb, float* x,
-                               lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dsprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                               const double* afp, const lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
-                               lapack_int ldx, double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_csprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, const lapack_complex_float* afp, const lapack_int* ipiv,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zsprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, const lapack_complex_double* afp,
-                               const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_sspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, lapack_int* ipiv,
-                              float* b, lapack_int ldb);
-lapack_int LAPACKE_dspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, lapack_int* ipiv,
-                              double* b, lapack_int ldb);
-lapack_int LAPACKE_cspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
-                              lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
-                              lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_sspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
-                               float* afp, lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                               double* afp, lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
-                               lapack_int ldx, double* rcond, double* ferr, double* berr, double* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_cspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
-                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
-                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
-                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
-                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_ssptrd_work(int matrix_order, char uplo, lapack_int n, float* ap, float* d, float* e, float* tau);
-lapack_int LAPACKE_dsptrd_work(int matrix_order, char uplo, lapack_int n, double* ap, double* d, double* e,
-                               double* tau);
-
-lapack_int LAPACKE_ssptrf_work(int matrix_order, char uplo, lapack_int n, float* ap, lapack_int* ipiv);
-lapack_int LAPACKE_dsptrf_work(int matrix_order, char uplo, lapack_int n, double* ap, lapack_int* ipiv);
-lapack_int LAPACKE_csptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
-lapack_int LAPACKE_zsptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
-
-lapack_int LAPACKE_ssptri_work(int matrix_order, char uplo, lapack_int n, float* ap, const lapack_int* ipiv,
-                               float* work);
-lapack_int LAPACKE_dsptri_work(int matrix_order, char uplo, lapack_int n, double* ap, const lapack_int* ipiv,
-                               double* work);
-lapack_int LAPACKE_csptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap,
-                               const lapack_int* ipiv, lapack_complex_float* work);
-lapack_int LAPACKE_zsptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap,
-                               const lapack_int* ipiv, lapack_complex_double* work);
-
-lapack_int LAPACKE_ssptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
-                               const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dsptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
-                               const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_csptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, const lapack_int* ipiv, lapack_complex_float* b,
-                               lapack_int ldb);
-lapack_int LAPACKE_zsptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, const lapack_int* ipiv, lapack_complex_double* b,
-                               lapack_int ldb);
-
-lapack_int LAPACKE_sstebz_work(char range, char order, lapack_int n, float vl, float vu, lapack_int il, lapack_int iu,
-                               float abstol, const float* d, const float* e, lapack_int* m, lapack_int* nsplit,
-                               float* w, lapack_int* iblock, lapack_int* isplit, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dstebz_work(char range, char order, lapack_int n, double vl, double vu, lapack_int il, lapack_int iu,
-                               double abstol, const double* d, const double* e, lapack_int* m, lapack_int* nsplit,
-                               double* w, lapack_int* iblock, lapack_int* isplit, double* work, lapack_int* iwork);
-
-lapack_int LAPACKE_sstedc_work(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
-                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dstedc_work(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z,
-                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_cstedc_work(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
-                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
-                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zstedc_work(int matrix_order, char compz, lapack_int n, double* d, double* e,
-                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, lapack_int lwork,
-                               double* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_sstegr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
-                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
-                               lapack_int ldz, lapack_int* isuppz, float* work, lapack_int lwork, lapack_int* iwork,
-                               lapack_int liwork);
-lapack_int LAPACKE_dstegr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                               double* z, lapack_int ldz, lapack_int* isuppz, double* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_cstegr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
-                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
-                               lapack_complex_float* z, lapack_int ldz, lapack_int* isuppz, float* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zstegr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                               lapack_complex_double* z, lapack_int ldz, lapack_int* isuppz, double* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_sstein_work(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m,
-                               const float* w, const lapack_int* iblock, const lapack_int* isplit, float* z,
-                               lapack_int ldz, float* work, lapack_int* iwork, lapack_int* ifailv);
-lapack_int LAPACKE_dstein_work(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
-                               const double* w, const lapack_int* iblock, const lapack_int* isplit, double* z,
-                               lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifailv);
-lapack_int LAPACKE_cstein_work(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m,
-                               const float* w, const lapack_int* iblock, const lapack_int* isplit,
-                               lapack_complex_float* z, lapack_int ldz, float* work, lapack_int* iwork,
-                               lapack_int* ifailv);
-lapack_int LAPACKE_zstein_work(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
-                               const double* w, const lapack_int* iblock, const lapack_int* isplit,
-                               lapack_complex_double* z, lapack_int ldz, double* work, lapack_int* iwork,
-                               lapack_int* ifailv);
-
-lapack_int LAPACKE_sstemr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
-                               float vu, lapack_int il, lapack_int iu, lapack_int* m, float* w, float* z,
-                               lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac, float* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dstemr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                               double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w, double* z,
-                               lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac, double* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_cstemr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
-                               float vu, lapack_int il, lapack_int iu, lapack_int* m, float* w, lapack_complex_float* z,
-                               lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac, float* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_zstemr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                               double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w,
-                               lapack_complex_double* z, lapack_int ldz, lapack_int nzc, lapack_int* isuppz,
-                               lapack_logical* tryrac, double* work, lapack_int lwork, lapack_int* iwork,
-                               lapack_int liwork);
-
-lapack_int LAPACKE_ssteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
-                               float* work);
-lapack_int LAPACKE_dsteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z,
-                               lapack_int ldz, double* work);
-lapack_int LAPACKE_csteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
-                               lapack_int ldz, float* work);
-lapack_int LAPACKE_zsteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e,
-                               lapack_complex_double* z, lapack_int ldz, double* work);
-
-lapack_int LAPACKE_ssterf_work(lapack_int n, float* d, float* e);
-lapack_int LAPACKE_dsterf_work(lapack_int n, double* d, double* e);
-
-lapack_int LAPACKE_sstev_work(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
-                              float* work);
-lapack_int LAPACKE_dstev_work(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z,
-                              lapack_int ldz, double* work);
-
-lapack_int LAPACKE_sstevd_work(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
-                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dstevd_work(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z,
-                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_sstevr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
-                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
-                               lapack_int ldz, lapack_int* isuppz, float* work, lapack_int lwork, lapack_int* iwork,
-                               lapack_int liwork);
-lapack_int LAPACKE_dstevr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                               double* z, lapack_int ldz, lapack_int* isuppz, double* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_sstevx_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
-                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
-                               lapack_int ldz, float* work, lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_dstevx_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
-                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
-                               double* z, lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_ssycon_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda,
-                               const lapack_int* ipiv, float anorm, float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dsycon_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda,
-                               const lapack_int* ipiv, double anorm, double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_csycon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
-lapack_int LAPACKE_zsycon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, const lapack_int* ipiv, double anorm, double* rcond,
-                               lapack_complex_double* work);
-
-lapack_int LAPACKE_ssyequb_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* s,
-                                float* scond, float* amax, float* work);
-lapack_int LAPACKE_dsyequb_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* s,
-                                double* scond, double* amax, double* work);
-lapack_int LAPACKE_csyequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a,
-                                lapack_int lda, float* s, float* scond, float* amax, lapack_complex_float* work);
-lapack_int LAPACKE_zsyequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
-                                lapack_int lda, double* s, double* scond, double* amax, lapack_complex_double* work);
-
-lapack_int LAPACKE_ssyev_work(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w,
-                              float* work, lapack_int lwork);
-lapack_int LAPACKE_dsyev_work(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda,
-                              double* w, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_ssyevd_work(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w,
-                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dsyevd_work(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda,
-                               double* w, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_ssyevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a,
-                               lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
-                               lapack_int* m, float* w, float* z, lapack_int ldz, lapack_int* isuppz, float* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dsyevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a,
-                               lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
-                               lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* isuppz, double* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_ssyevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a,
-                               lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
-                               lapack_int* m, float* w, float* z, lapack_int ldz, float* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_dsyevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a,
-                               lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
-                               lapack_int* m, double* w, double* z, lapack_int ldz, double* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_ssygst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* a, lapack_int lda,
-                               const float* b, lapack_int ldb);
-lapack_int LAPACKE_dsygst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* a, lapack_int lda,
-                               const double* b, lapack_int ldb);
-
-lapack_int LAPACKE_ssygv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
-                              lapack_int lda, float* b, lapack_int ldb, float* w, float* work, lapack_int lwork);
-lapack_int LAPACKE_dsygv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
-                              lapack_int lda, double* b, lapack_int ldb, double* w, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_ssygvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
-                               lapack_int lda, float* b, lapack_int ldb, float* w, float* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dsygvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
-                               lapack_int lda, double* b, lapack_int ldb, double* w, double* work, lapack_int lwork,
-                               lapack_int* iwork, lapack_int liwork);
-
-lapack_int LAPACKE_ssygvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               float* a, lapack_int lda, float* b, lapack_int ldb, float vl, float vu, lapack_int il,
-                               lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
-                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int* ifail);
-lapack_int LAPACKE_dsygvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
-                               double* a, lapack_int lda, double* b, lapack_int ldb, double vl, double vu,
-                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
-                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int* ifail);
-
-lapack_int LAPACKE_ssyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                               lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b,
-                               lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr, float* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dsyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                               lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv,
-                               const double* b, lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr,
-                               double* work, lapack_int* iwork);
-lapack_int LAPACKE_csyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zsyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_ssyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
-                                lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv,
-                                const float* s, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
-                                float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                                lapack_int nparams, float* params, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dsyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
-                                lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv,
-                                const double* s, const double* b, lapack_int ldb, double* x, lapack_int ldx,
-                                double* rcond, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
-                                double* err_bnds_comp, lapack_int nparams, double* params, double* work,
-                                lapack_int* iwork);
-lapack_int LAPACKE_csyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
-                                lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
-                                lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zsyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
-                                lapack_int ldaf, const lapack_int* ipiv, const double* s,
-                                const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
-                                lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
-                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
-                                lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_ssysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
-                              lapack_int* ipiv, float* b, lapack_int ldb, float* work, lapack_int lwork);
-lapack_int LAPACKE_dsysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
-                              lapack_int* ipiv, double* b, lapack_int ldb, double* work, lapack_int lwork);
-lapack_int LAPACKE_csysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
-                              lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb,
-                              lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zsysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
-                              lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb,
-                              lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_ssysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                               lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, const float* b,
-                               lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
-                               float* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_dsysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                               lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, const double* b,
-                               lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
-                               double* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_csysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                               lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
-                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
-                               lapack_int lwork, float* rwork);
-lapack_int LAPACKE_zsysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af,
-                               lapack_int ldaf, lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
-                               lapack_complex_double* work, lapack_int lwork, double* rwork);
-
-lapack_int LAPACKE_ssysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
-                                lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* s,
-                                float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw,
-                                float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                                lapack_int nparams, float* params, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dsysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
-                                lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* s,
-                                double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw,
-                                double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                                lapack_int nparams, double* params, double* work, lapack_int* iwork);
-lapack_int LAPACKE_csysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
-                                lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
-                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
-                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
-                                float* params, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_zsysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
-                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
-                                lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
-                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
-                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
-                                double* params, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_ssytrd_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, float* d, float* e,
-                               float* tau, float* work, lapack_int lwork);
-lapack_int LAPACKE_dsytrd_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, double* d,
-                               double* e, double* tau, double* work, lapack_int lwork);
-
-lapack_int LAPACKE_ssytrf_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dsytrf_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_csytrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zsytrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_ssytri_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda,
-                               const lapack_int* ipiv, float* work);
-lapack_int LAPACKE_dsytri_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
-                               const lapack_int* ipiv, double* work);
-lapack_int LAPACKE_csytri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               const lapack_int* ipiv, lapack_complex_float* work);
-lapack_int LAPACKE_zsytri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               const lapack_int* ipiv, lapack_complex_double* work);
-
-lapack_int LAPACKE_ssytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                               lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_dsytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                               lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_csytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
-                               lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_zsytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
-                               lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_stbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                               const float* ab, lapack_int ldab, float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dtbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                               const double* ab, lapack_int ldab, double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_ctbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                               const lapack_complex_float* ab, lapack_int ldab, float* rcond,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_ztbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
-                               const lapack_complex_double* ab, lapack_int ldab, double* rcond,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_stbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const float* ab, lapack_int ldab, const float* b, lapack_int ldb,
-                               const float* x, lapack_int ldx, float* ferr, float* berr, float* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dtbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const double* ab, lapack_int ldab, const double* b, lapack_int ldb,
-                               const double* x, lapack_int ldx, double* ferr, double* berr, double* work,
-                               lapack_int* iwork);
-lapack_int LAPACKE_ctbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
-                               const lapack_complex_float* b, lapack_int ldb, const lapack_complex_float* x,
-                               lapack_int ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_ztbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
-                               const lapack_complex_double* b, lapack_int ldb, const lapack_complex_double* x,
-                               lapack_int ldx, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_stbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const float* ab, lapack_int ldab, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const double* ab, lapack_int ldab, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
-                               lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
-                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
-                               lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_stfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                              lapack_int n, float alpha, const float* a, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                              lapack_int n, double alpha, const double* a, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                              lapack_int n, lapack_complex_float alpha, const lapack_complex_float* a,
-                              lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
-                              lapack_int n, lapack_complex_double alpha, const lapack_complex_double* a,
-                              lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_stftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n, float* a);
-lapack_int LAPACKE_dtftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n, double* a);
-lapack_int LAPACKE_ctftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n,
-                               lapack_complex_float* a);
-lapack_int LAPACKE_ztftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n,
-                               lapack_complex_double* a);
-
-lapack_int LAPACKE_stfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* ap);
-lapack_int LAPACKE_dtfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* ap);
-lapack_int LAPACKE_ctfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
-                               lapack_complex_float* ap);
-lapack_int LAPACKE_ztfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
-                               lapack_complex_double* ap);
-
-lapack_int LAPACKE_stfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* a,
-                               lapack_int lda);
-lapack_int LAPACKE_dtfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* a,
-                               lapack_int lda);
-lapack_int LAPACKE_ctfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
-                               lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_ztfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
-                               lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_stgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                               const float* s, lapack_int lds, const float* p, lapack_int ldp, float* vl,
-                               lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, float* work);
-lapack_int LAPACKE_dtgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                               const double* s, lapack_int lds, const double* p, lapack_int ldp, double* vl,
-                               lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m,
-                               double* work);
-lapack_int LAPACKE_ctgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                               const lapack_complex_float* s, lapack_int lds, const lapack_complex_float* p,
-                               lapack_int ldp, lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr,
-                               lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_ztgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                               const lapack_complex_double* s, lapack_int lds, const lapack_complex_double* p,
-                               lapack_int ldp, lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr,
-                               lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_complex_double* work,
-                               double* rwork);
-
-lapack_int LAPACKE_stgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, float* a,
-                               lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z,
-                               lapack_int ldz, lapack_int* ifst, lapack_int* ilst, float* work, lapack_int lwork);
-lapack_int LAPACKE_dtgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, double* a,
-                               lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq, double* z,
-                               lapack_int ldz, lapack_int* ifst, lapack_int* ilst, double* work, lapack_int lwork);
-lapack_int LAPACKE_ctgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
-                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz,
-                               lapack_int ifst, lapack_int ilst);
-lapack_int LAPACKE_ztgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz,
-                               lapack_int ifst, lapack_int ilst);
-
-lapack_int LAPACKE_stgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                               const lapack_logical* select, lapack_int n, float* a, lapack_int lda, float* b,
-                               lapack_int ldb, float* alphar, float* alphai, float* beta, float* q, lapack_int ldq,
-                               float* z, lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif, float* work,
-                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_dtgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                               const lapack_logical* select, lapack_int n, double* a, lapack_int lda, double* b,
-                               lapack_int ldb, double* alphar, double* alphai, double* beta, double* q, lapack_int ldq,
-                               double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr, double* dif,
-                               double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_ctgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                               const lapack_logical* select, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
-                               lapack_complex_float* beta, lapack_complex_float* q, lapack_int ldq,
-                               lapack_complex_float* z, lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif,
-                               lapack_complex_float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
-lapack_int LAPACKE_ztgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
-                               const lapack_logical* select, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
-                               lapack_complex_double* beta, lapack_complex_double* q, lapack_int ldq,
-                               lapack_complex_double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr,
-                               double* dif, lapack_complex_double* work, lapack_int lwork, lapack_int* iwork,
-                               lapack_int liwork);
-
-lapack_int LAPACKE_stgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, lapack_int k, lapack_int l, float* a, lapack_int lda, float* b,
-                               lapack_int ldb, float tola, float tolb, float* alpha, float* beta, float* u,
-                               lapack_int ldu, float* v, lapack_int ldv, float* q, lapack_int ldq, float* work,
-                               lapack_int* ncycle);
-lapack_int LAPACKE_dtgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, lapack_int k, lapack_int l, double* a, lapack_int lda, double* b,
-                               lapack_int ldb, double tola, double tolb, double* alpha, double* beta, double* u,
-                               lapack_int ldu, double* v, lapack_int ldv, double* q, lapack_int ldq, double* work,
-                               lapack_int* ncycle);
-lapack_int LAPACKE_ctgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, lapack_int k, lapack_int l, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* b, lapack_int ldb, float tola, float tolb, float* alpha,
-                               float* beta, lapack_complex_float* u, lapack_int ldu, lapack_complex_float* v,
-                               lapack_int ldv, lapack_complex_float* q, lapack_int ldq, lapack_complex_float* work,
-                               lapack_int* ncycle);
-lapack_int LAPACKE_ztgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
-                               lapack_int n, lapack_int k, lapack_int l, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* b, lapack_int ldb, double tola, double tolb, double* alpha,
-                               double* beta, lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v,
-                               lapack_int ldv, lapack_complex_double* q, lapack_int ldq, lapack_complex_double* work,
-                               lapack_int* ncycle);
-
-lapack_int LAPACKE_stgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* vl,
-                               lapack_int ldvl, const float* vr, lapack_int ldvr, float* s, float* dif, lapack_int mm,
-                               lapack_int* m, float* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_dtgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* vl,
-                               lapack_int ldvl, const double* vr, lapack_int ldvr, double* s, double* dif,
-                               lapack_int mm, lapack_int* m, double* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_ctgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
-                               lapack_int ldb, const lapack_complex_float* vl, lapack_int ldvl,
-                               const lapack_complex_float* vr, lapack_int ldvr, float* s, float* dif, lapack_int mm,
-                               lapack_int* m, lapack_complex_float* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_ztgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                               lapack_int ldb, const lapack_complex_double* vl, lapack_int ldvl,
-                               const lapack_complex_double* vr, lapack_int ldvr, double* s, double* dif, lapack_int mm,
-                               lapack_int* m, lapack_complex_double* work, lapack_int lwork, lapack_int* iwork);
-
-lapack_int LAPACKE_stgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
-                               const float* a, lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc,
-                               const float* d, lapack_int ldd, const float* e, lapack_int lde, float* f, lapack_int ldf,
-                               float* scale, float* dif, float* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_dtgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
-                               const double* a, lapack_int lda, const double* b, lapack_int ldb, double* c,
-                               lapack_int ldc, const double* d, lapack_int ldd, const double* e, lapack_int lde,
-                               double* f, lapack_int ldf, double* scale, double* dif, double* work, lapack_int lwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_ctgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
-                               lapack_int ldb, lapack_complex_float* c, lapack_int ldc, const lapack_complex_float* d,
-                               lapack_int ldd, const lapack_complex_float* e, lapack_int lde, lapack_complex_float* f,
-                               lapack_int ldf, float* scale, float* dif, lapack_complex_float* work, lapack_int lwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_ztgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                               lapack_int ldb, lapack_complex_double* c, lapack_int ldc, const lapack_complex_double* d,
-                               lapack_int ldd, const lapack_complex_double* e, lapack_int lde, lapack_complex_double* f,
-                               lapack_int ldf, double* scale, double* dif, lapack_complex_double* work,
-                               lapack_int lwork, lapack_int* iwork);
-
-lapack_int LAPACKE_stpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* ap,
-                               float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dtpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* ap,
-                               double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_ctpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                               const lapack_complex_float* ap, float* rcond, lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_ztpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                               const lapack_complex_double* ap, double* rcond, lapack_complex_double* work,
-                               double* rwork);
-
-lapack_int LAPACKE_stprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const float* ap, const float* b, lapack_int ldb, const float* x, lapack_int ldx,
-                               float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dtprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const double* ap, const double* b, lapack_int ldb, const double* x, lapack_int ldx,
-                               double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_ctprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, const lapack_complex_float* b, lapack_int ldb,
-                               const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_ztprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, const lapack_complex_double* b, lapack_int ldb,
-                               const lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_stptri_work(int matrix_order, char uplo, char diag, lapack_int n, float* ap);
-lapack_int LAPACKE_dtptri_work(int matrix_order, char uplo, char diag, lapack_int n, double* ap);
-lapack_int LAPACKE_ctptri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* ap);
-lapack_int LAPACKE_ztptri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* ap);
-
-lapack_int LAPACKE_stptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const float* ap, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const double* ap, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* ap, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* ap, lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_stpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const float* ap, float* arf);
-lapack_int LAPACKE_dtpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const double* ap, double* arf);
-lapack_int LAPACKE_ctpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* ap,
-                               lapack_complex_float* arf);
-lapack_int LAPACKE_ztpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* ap,
-                               lapack_complex_double* arf);
-
-lapack_int LAPACKE_stpttr_work(int matrix_order, char uplo, lapack_int n, const float* ap, float* a, lapack_int lda);
-lapack_int LAPACKE_dtpttr_work(int matrix_order, char uplo, lapack_int n, const double* ap, double* a, lapack_int lda);
-lapack_int LAPACKE_ctpttr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                               lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_ztpttr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                               lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_strcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* a,
-                               lapack_int lda, float* rcond, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dtrcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* a,
-                               lapack_int lda, double* rcond, double* work, lapack_int* iwork);
-lapack_int LAPACKE_ctrcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                               const lapack_complex_float* a, lapack_int lda, float* rcond, lapack_complex_float* work,
-                               float* rwork);
-lapack_int LAPACKE_ztrcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
-                               const lapack_complex_double* a, lapack_int lda, double* rcond,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_strevc_work(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
-                               const float* t, lapack_int ldt, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr,
-                               lapack_int mm, lapack_int* m, float* work);
-lapack_int LAPACKE_dtrevc_work(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
-                               const double* t, lapack_int ldt, double* vl, lapack_int ldvl, double* vr,
-                               lapack_int ldvr, lapack_int mm, lapack_int* m, double* work);
-lapack_int LAPACKE_ctrevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                               lapack_complex_float* t, lapack_int ldt, lapack_complex_float* vl, lapack_int ldvl,
-                               lapack_complex_float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_ztrevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
-                               lapack_complex_double* t, lapack_int ldt, lapack_complex_double* vl, lapack_int ldvl,
-                               lapack_complex_double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m,
-                               lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_strexc_work(int matrix_order, char compq, lapack_int n, float* t, lapack_int ldt, float* q,
-                               lapack_int ldq, lapack_int* ifst, lapack_int* ilst, float* work);
-lapack_int LAPACKE_dtrexc_work(int matrix_order, char compq, lapack_int n, double* t, lapack_int ldt, double* q,
-                               lapack_int ldq, lapack_int* ifst, lapack_int* ilst, double* work);
-lapack_int LAPACKE_ctrexc_work(int matrix_order, char compq, lapack_int n, lapack_complex_float* t, lapack_int ldt,
-                               lapack_complex_float* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
-lapack_int LAPACKE_ztrexc_work(int matrix_order, char compq, lapack_int n, lapack_complex_double* t, lapack_int ldt,
-                               lapack_complex_double* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
-
-lapack_int LAPACKE_strrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* x,
-                               lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
-lapack_int LAPACKE_dtrrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* x,
-                               lapack_int ldx, double* ferr, double* berr, double* work, lapack_int* iwork);
-lapack_int LAPACKE_ctrrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
-                               lapack_int ldb, const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
-                               lapack_complex_float* work, float* rwork);
-lapack_int LAPACKE_ztrrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                               lapack_int ldb, const lapack_complex_double* x, lapack_int ldx, double* ferr,
-                               double* berr, lapack_complex_double* work, double* rwork);
-
-lapack_int LAPACKE_strsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
-                               float* t, lapack_int ldt, float* q, lapack_int ldq, float* wr, float* wi, lapack_int* m,
-                               float* s, float* sep, float* work, lapack_int lwork, lapack_int* iwork,
-                               lapack_int liwork);
-lapack_int LAPACKE_dtrsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
-                               double* t, lapack_int ldt, double* q, lapack_int ldq, double* wr, double* wi,
-                               lapack_int* m, double* s, double* sep, double* work, lapack_int lwork, lapack_int* iwork,
-                               lapack_int liwork);
-lapack_int LAPACKE_ctrsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
-                               lapack_complex_float* t, lapack_int ldt, lapack_complex_float* q, lapack_int ldq,
-                               lapack_complex_float* w, lapack_int* m, float* s, float* sep, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_ztrsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
-                               lapack_complex_double* t, lapack_int ldt, lapack_complex_double* q, lapack_int ldq,
-                               lapack_complex_double* w, lapack_int* m, double* s, double* sep,
-                               lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_strsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const float* t, lapack_int ldt, const float* vl, lapack_int ldvl, const float* vr,
-                               lapack_int ldvr, float* s, float* sep, lapack_int mm, lapack_int* m, float* work,
-                               lapack_int ldwork, lapack_int* iwork);
-lapack_int LAPACKE_dtrsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const double* t, lapack_int ldt, const double* vl, lapack_int ldvl, const double* vr,
-                               lapack_int ldvr, double* s, double* sep, lapack_int mm, lapack_int* m, double* work,
-                               lapack_int ldwork, lapack_int* iwork);
-lapack_int LAPACKE_ctrsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const lapack_complex_float* t, lapack_int ldt, const lapack_complex_float* vl,
-                               lapack_int ldvl, const lapack_complex_float* vr, lapack_int ldvr, float* s, float* sep,
-                               lapack_int mm, lapack_int* m, lapack_complex_float* work, lapack_int ldwork,
-                               float* rwork);
-lapack_int LAPACKE_ztrsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
-                               const lapack_complex_double* t, lapack_int ldt, const lapack_complex_double* vl,
-                               lapack_int ldvl, const lapack_complex_double* vr, lapack_int ldvr, double* s,
-                               double* sep, lapack_int mm, lapack_int* m, lapack_complex_double* work,
-                               lapack_int ldwork, double* rwork);
-
-lapack_int LAPACKE_strsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                               const float* a, lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc,
-                               float* scale);
-lapack_int LAPACKE_dtrsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                               const double* a, lapack_int lda, const double* b, lapack_int ldb, double* c,
-                               lapack_int ldc, double* scale);
-lapack_int LAPACKE_ctrsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
-                               lapack_int ldb, lapack_complex_float* c, lapack_int ldc, float* scale);
-lapack_int LAPACKE_ztrsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
-                               lapack_int ldb, lapack_complex_double* c, lapack_int ldc, double* scale);
-
-lapack_int LAPACKE_strtri_work(int matrix_order, char uplo, char diag, lapack_int n, float* a, lapack_int lda);
-lapack_int LAPACKE_dtrtri_work(int matrix_order, char uplo, char diag, lapack_int n, double* a, lapack_int lda);
-lapack_int LAPACKE_ctrtri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* a,
-                               lapack_int lda);
-lapack_int LAPACKE_ztrtri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* a,
-                               lapack_int lda);
-
-lapack_int LAPACKE_strtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const float* a, lapack_int lda, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtrtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const double* a, lapack_int lda, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctrtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztrtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
-                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
-                               lapack_int ldb);
-
-lapack_int LAPACKE_strttf_work(int matrix_order, char transr, char uplo, lapack_int n, const float* a, lapack_int lda,
-                               float* arf);
-lapack_int LAPACKE_dtrttf_work(int matrix_order, char transr, char uplo, lapack_int n, const double* a, lapack_int lda,
-                               double* arf);
-lapack_int LAPACKE_ctrttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* arf);
-lapack_int LAPACKE_ztrttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* arf);
-
-lapack_int LAPACKE_strttp_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* ap);
-lapack_int LAPACKE_dtrttp_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* ap);
-lapack_int LAPACKE_ctrttp_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* ap);
-lapack_int LAPACKE_ztrttp_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* ap);
-
-lapack_int LAPACKE_stzrzf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
-                               float* work, lapack_int lwork);
-lapack_int LAPACKE_dtzrzf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
-                               double* work, lapack_int lwork);
-lapack_int LAPACKE_ctzrzf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_ztzrzf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cungbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
-                               lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                               lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zungbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
-                               lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                               lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
-                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zunghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
-                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_cunglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zunglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_cungql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zungql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_cungqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zungqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_cungrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
-                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_zungrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
-                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
-                               lapack_int lwork);
-
-lapack_int LAPACKE_cungtr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                               const lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zungtr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                               const lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
-                               lapack_int k, const lapack_complex_float* a, lapack_int lda,
-                               const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc,
-                               lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
-                               lapack_int k, const lapack_complex_double* a, lapack_int lda,
-                               const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc,
-                               lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, const lapack_complex_float* a, lapack_int lda,
-                               const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc,
-                               lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
-                               lapack_int ihi, const lapack_complex_double* a, lapack_int lda,
-                               const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc,
-                               lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               lapack_int l, const lapack_complex_float* a, lapack_int lda,
-                               const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc,
-                               lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                               lapack_int l, const lapack_complex_double* a, lapack_int lda,
-                               const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc,
-                               lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cunmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
-                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_zunmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
-                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
-
-lapack_int LAPACKE_cupgtr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
-                               const lapack_complex_float* tau, lapack_complex_float* q, lapack_int ldq,
-                               lapack_complex_float* work);
-lapack_int LAPACKE_zupgtr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
-                               const lapack_complex_double* tau, lapack_complex_double* q, lapack_int ldq,
-                               lapack_complex_double* work);
-
-lapack_int LAPACKE_cupmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const lapack_complex_float* ap, const lapack_complex_float* tau, lapack_complex_float* c,
-                               lapack_int ldc, lapack_complex_float* work);
-lapack_int LAPACKE_zupmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
-                               const lapack_complex_double* ap, const lapack_complex_double* tau,
-                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work);
-
-lapack_int LAPACKE_claghe(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
-                          lapack_int lda, lapack_int* iseed);
-lapack_int LAPACKE_zlaghe(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
-                          lapack_int lda, lapack_int* iseed);
-
-lapack_int LAPACKE_slagsy(int matrix_order, lapack_int n, lapack_int k, const float* d, float* a, lapack_int lda,
-                          lapack_int* iseed);
-lapack_int LAPACKE_dlagsy(int matrix_order, lapack_int n, lapack_int k, const double* d, double* a, lapack_int lda,
-                          lapack_int* iseed);
-lapack_int LAPACKE_clagsy(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
-                          lapack_int lda, lapack_int* iseed);
-lapack_int LAPACKE_zlagsy(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
-                          lapack_int lda, lapack_int* iseed);
-
-lapack_int LAPACKE_slapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, float* x, lapack_int ldx,
-                          lapack_int* k);
-lapack_int LAPACKE_dlapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, double* x,
-                          lapack_int ldx, lapack_int* k);
-lapack_int LAPACKE_clapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, lapack_complex_float* x,
-                          lapack_int ldx, lapack_int* k);
-lapack_int LAPACKE_zlapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, lapack_complex_double* x,
-                          lapack_int ldx, lapack_int* k);
-
-float LAPACKE_slapy2(float x, float y);
-double LAPACKE_dlapy2(double x, double y);
-
-float LAPACKE_slapy3(float x, float y, float z);
-double LAPACKE_dlapy3(double x, double y, double z);
-
-lapack_int LAPACKE_slartgp(float f, float g, float* cs, float* sn, float* r);
-lapack_int LAPACKE_dlartgp(double f, double g, double* cs, double* sn, double* r);
-
-lapack_int LAPACKE_slartgs(float x, float y, float sigma, float* cs, float* sn);
-lapack_int LAPACKE_dlartgs(double x, double y, double sigma, double* cs, double* sn);
-
-// LAPACK 3.3.0
-lapack_int LAPACKE_cbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
-                          lapack_int p, lapack_int q, float* theta, float* phi, lapack_complex_float* u1,
-                          lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2, lapack_complex_float* v1t,
-                          lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t, float* b11d, float* b11e,
-                          float* b12d, float* b12e, float* b21d, float* b21e, float* b22d, float* b22e);
-lapack_int LAPACKE_cbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               lapack_int m, lapack_int p, lapack_int q, float* theta, float* phi,
-                               lapack_complex_float* u1, lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2,
-                               lapack_complex_float* v1t, lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t,
-                               float* b11d, float* b11e, float* b12d, float* b12e, float* b21d, float* b21e,
-                               float* b22d, float* b22e, float* rwork, lapack_int lrwork);
-lapack_int LAPACKE_cheswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
-                            lapack_int i2);
-lapack_int LAPACKE_cheswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
-                                 lapack_int i2);
-lapack_int LAPACKE_chetri2(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_chetri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_chetri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                            const lapack_int* ipiv, lapack_int nb);
-lapack_int LAPACKE_chetri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                 const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb);
-lapack_int LAPACKE_chetrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                           lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_chetrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
-                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work);
-lapack_int LAPACKE_csyconv(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_csyconv_work(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_float* a,
-                                lapack_int lda, const lapack_int* ipiv, lapack_complex_float* work);
-lapack_int LAPACKE_csyswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
-                            lapack_int i2);
-lapack_int LAPACKE_csyswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
-                                 lapack_int i2);
-lapack_int LAPACKE_csytri2(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_csytri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_csytri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                            const lapack_int* ipiv, lapack_int nb);
-lapack_int LAPACKE_csytri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                 const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb);
-lapack_int LAPACKE_csytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
-                           lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_csytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
-                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work);
-lapack_int LAPACKE_cunbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                          lapack_complex_float* x11, lapack_int ldx11, lapack_complex_float* x12, lapack_int ldx12,
-                          lapack_complex_float* x21, lapack_int ldx21, lapack_complex_float* x22, lapack_int ldx22,
-                          float* theta, float* phi, lapack_complex_float* taup1, lapack_complex_float* taup2,
-                          lapack_complex_float* tauq1, lapack_complex_float* tauq2);
-lapack_int LAPACKE_cunbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                               lapack_complex_float* x11, lapack_int ldx11, lapack_complex_float* x12, lapack_int ldx12,
-                               lapack_complex_float* x21, lapack_int ldx21, lapack_complex_float* x22, lapack_int ldx22,
-                               float* theta, float* phi, lapack_complex_float* taup1, lapack_complex_float* taup2,
-                               lapack_complex_float* tauq1, lapack_complex_float* tauq2, lapack_complex_float* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_cuncsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
-                          lapack_int m, lapack_int p, lapack_int q, lapack_complex_float* x11, lapack_int ldx11,
-                          lapack_complex_float* x12, lapack_int ldx12, lapack_complex_float* x21, lapack_int ldx21,
-                          lapack_complex_float* x22, lapack_int ldx22, float* theta, lapack_complex_float* u1,
-                          lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2, lapack_complex_float* v1t,
-                          lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t);
-lapack_int LAPACKE_cuncsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               char signs, lapack_int m, lapack_int p, lapack_int q, lapack_complex_float* x11,
-                               lapack_int ldx11, lapack_complex_float* x12, lapack_int ldx12, lapack_complex_float* x21,
-                               lapack_int ldx21, lapack_complex_float* x22, lapack_int ldx22, float* theta,
-                               lapack_complex_float* u1, lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2,
-                               lapack_complex_float* v1t, lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t,
-                               lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_int lrwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_dbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
-                          lapack_int p, lapack_int q, double* theta, double* phi, double* u1, lapack_int ldu1,
-                          double* u2, lapack_int ldu2, double* v1t, lapack_int ldv1t, double* v2t, lapack_int ldv2t,
-                          double* b11d, double* b11e, double* b12d, double* b12e, double* b21d, double* b21e,
-                          double* b22d, double* b22e);
-lapack_int LAPACKE_dbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               lapack_int m, lapack_int p, lapack_int q, double* theta, double* phi, double* u1,
-                               lapack_int ldu1, double* u2, lapack_int ldu2, double* v1t, lapack_int ldv1t, double* v2t,
-                               lapack_int ldv2t, double* b11d, double* b11e, double* b12d, double* b12e, double* b21d,
-                               double* b21e, double* b22d, double* b22e, double* work, lapack_int lwork);
-lapack_int LAPACKE_dorbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                          double* x11, lapack_int ldx11, double* x12, lapack_int ldx12, double* x21, lapack_int ldx21,
-                          double* x22, lapack_int ldx22, double* theta, double* phi, double* taup1, double* taup2,
-                          double* tauq1, double* tauq2);
-lapack_int LAPACKE_dorbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                               double* x11, lapack_int ldx11, double* x12, lapack_int ldx12, double* x21,
-                               lapack_int ldx21, double* x22, lapack_int ldx22, double* theta, double* phi,
-                               double* taup1, double* taup2, double* tauq1, double* tauq2, double* work,
-                               lapack_int lwork);
-lapack_int LAPACKE_dorcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
-                          lapack_int m, lapack_int p, lapack_int q, double* x11, lapack_int ldx11, double* x12,
-                          lapack_int ldx12, double* x21, lapack_int ldx21, double* x22, lapack_int ldx22, double* theta,
-                          double* u1, lapack_int ldu1, double* u2, lapack_int ldu2, double* v1t, lapack_int ldv1t,
-                          double* v2t, lapack_int ldv2t);
-lapack_int LAPACKE_dorcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               char signs, lapack_int m, lapack_int p, lapack_int q, double* x11, lapack_int ldx11,
-                               double* x12, lapack_int ldx12, double* x21, lapack_int ldx21, double* x22,
-                               lapack_int ldx22, double* theta, double* u1, lapack_int ldu1, double* u2,
-                               lapack_int ldu2, double* v1t, lapack_int ldv1t, double* v2t, lapack_int ldv2t,
-                               double* work, lapack_int lwork, lapack_int* iwork);
-lapack_int LAPACKE_dsyconv(int matrix_order, char uplo, char way, lapack_int n, double* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_dsyconv_work(int matrix_order, char uplo, char way, lapack_int n, double* a, lapack_int lda,
-                                const lapack_int* ipiv, double* work);
-lapack_int LAPACKE_dsyswapr(int matrix_order, char uplo, lapack_int n, double* a, lapack_int i1, lapack_int i2);
-lapack_int LAPACKE_dsyswapr_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int i1, lapack_int i2);
-lapack_int LAPACKE_dsytri2(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_dsytri2_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
-                                const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
-lapack_int LAPACKE_dsytri2x(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
-                            const lapack_int* ipiv, lapack_int nb);
-lapack_int LAPACKE_dsytri2x_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
-                                 const lapack_int* ipiv, double* work, lapack_int nb);
-lapack_int LAPACKE_dsytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
-                           const lapack_int* ipiv, double* b, lapack_int ldb);
-lapack_int LAPACKE_dsytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
-                                lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work);
-lapack_int LAPACKE_sbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
-                          lapack_int p, lapack_int q, float* theta, float* phi, float* u1, lapack_int ldu1, float* u2,
-                          lapack_int ldu2, float* v1t, lapack_int ldv1t, float* v2t, lapack_int ldv2t, float* b11d,
-                          float* b11e, float* b12d, float* b12e, float* b21d, float* b21e, float* b22d, float* b22e);
-lapack_int LAPACKE_sbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               lapack_int m, lapack_int p, lapack_int q, float* theta, float* phi, float* u1,
-                               lapack_int ldu1, float* u2, lapack_int ldu2, float* v1t, lapack_int ldv1t, float* v2t,
-                               lapack_int ldv2t, float* b11d, float* b11e, float* b12d, float* b12e, float* b21d,
-                               float* b21e, float* b22d, float* b22e, float* work, lapack_int lwork);
-lapack_int LAPACKE_sorbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                          float* x11, lapack_int ldx11, float* x12, lapack_int ldx12, float* x21, lapack_int ldx21,
-                          float* x22, lapack_int ldx22, float* theta, float* phi, float* taup1, float* taup2,
-                          float* tauq1, float* tauq2);
-lapack_int LAPACKE_sorbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                               float* x11, lapack_int ldx11, float* x12, lapack_int ldx12, float* x21, lapack_int ldx21,
-                               float* x22, lapack_int ldx22, float* theta, float* phi, float* taup1, float* taup2,
-                               float* tauq1, float* tauq2, float* work, lapack_int lwork);
-lapack_int LAPACKE_sorcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
-                          lapack_int m, lapack_int p, lapack_int q, float* x11, lapack_int ldx11, float* x12,
-                          lapack_int ldx12, float* x21, lapack_int ldx21, float* x22, lapack_int ldx22, float* theta,
-                          float* u1, lapack_int ldu1, float* u2, lapack_int ldu2, float* v1t, lapack_int ldv1t,
-                          float* v2t, lapack_int ldv2t);
-lapack_int LAPACKE_sorcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               char signs, lapack_int m, lapack_int p, lapack_int q, float* x11, lapack_int ldx11,
-                               float* x12, lapack_int ldx12, float* x21, lapack_int ldx21, float* x22, lapack_int ldx22,
-                               float* theta, float* u1, lapack_int ldu1, float* u2, lapack_int ldu2, float* v1t,
-                               lapack_int ldv1t, float* v2t, lapack_int ldv2t, float* work, lapack_int lwork,
-                               lapack_int* iwork);
-lapack_int LAPACKE_ssyconv(int matrix_order, char uplo, char way, lapack_int n, float* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_ssyconv_work(int matrix_order, char uplo, char way, lapack_int n, float* a, lapack_int lda,
-                                const lapack_int* ipiv, float* work);
-lapack_int LAPACKE_ssyswapr(int matrix_order, char uplo, lapack_int n, float* a, lapack_int i1, lapack_int i2);
-lapack_int LAPACKE_ssyswapr_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int i1, lapack_int i2);
-lapack_int LAPACKE_ssytri2(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv);
-lapack_int LAPACKE_ssytri2_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda,
-                                const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
-lapack_int LAPACKE_ssytri2x(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv,
-                            lapack_int nb);
-lapack_int LAPACKE_ssytri2x_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda,
-                                 const lapack_int* ipiv, float* work, lapack_int nb);
-lapack_int LAPACKE_ssytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
-                           const lapack_int* ipiv, float* b, lapack_int ldb);
-lapack_int LAPACKE_ssytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
-                                lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work);
-lapack_int LAPACKE_zbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
-                          lapack_int p, lapack_int q, double* theta, double* phi, lapack_complex_double* u1,
-                          lapack_int ldu1, lapack_complex_double* u2, lapack_int ldu2, lapack_complex_double* v1t,
-                          lapack_int ldv1t, lapack_complex_double* v2t, lapack_int ldv2t, double* b11d, double* b11e,
-                          double* b12d, double* b12e, double* b21d, double* b21e, double* b22d, double* b22e);
-lapack_int LAPACKE_zbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               lapack_int m, lapack_int p, lapack_int q, double* theta, double* phi,
-                               lapack_complex_double* u1, lapack_int ldu1, lapack_complex_double* u2, lapack_int ldu2,
-                               lapack_complex_double* v1t, lapack_int ldv1t, lapack_complex_double* v2t,
-                               lapack_int ldv2t, double* b11d, double* b11e, double* b12d, double* b12e, double* b21d,
-                               double* b21e, double* b22d, double* b22e, double* rwork, lapack_int lrwork);
-lapack_int LAPACKE_zheswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
-                            lapack_int i2);
-lapack_int LAPACKE_zheswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
-                                 lapack_int i2);
-lapack_int LAPACKE_zhetri2(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_zhetri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
-lapack_int LAPACKE_zhetri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                            const lapack_int* ipiv, lapack_int nb);
-lapack_int LAPACKE_zhetri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                 const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb);
-lapack_int LAPACKE_zhetrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                           lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-lapack_int LAPACKE_zhetrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
-                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work);
-lapack_int LAPACKE_zsyconv(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_double* a,
-                           lapack_int lda, const lapack_int* ipiv);
-lapack_int LAPACKE_zsyconv_work(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_double* a,
-                                lapack_int lda, const lapack_int* ipiv, lapack_complex_double* work);
-lapack_int LAPACKE_zsyswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
-                            lapack_int i2);
-lapack_int LAPACKE_zsyswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
-                                 lapack_int i2);
-lapack_int LAPACKE_zsytri2(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                           const lapack_int* ipiv);
-lapack_int LAPACKE_zsytri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
-lapack_int LAPACKE_zsytri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                            const lapack_int* ipiv, lapack_int nb);
-lapack_int LAPACKE_zsytri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                 const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb);
-lapack_int LAPACKE_zsytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
-                           lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
-lapack_int LAPACKE_zsytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
-                                const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
-                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work);
-lapack_int LAPACKE_zunbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                          lapack_complex_double* x11, lapack_int ldx11, lapack_complex_double* x12, lapack_int ldx12,
-                          lapack_complex_double* x21, lapack_int ldx21, lapack_complex_double* x22, lapack_int ldx22,
-                          double* theta, double* phi, lapack_complex_double* taup1, lapack_complex_double* taup2,
-                          lapack_complex_double* tauq1, lapack_complex_double* tauq2);
-lapack_int LAPACKE_zunbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
-                               lapack_complex_double* x11, lapack_int ldx11, lapack_complex_double* x12,
-                               lapack_int ldx12, lapack_complex_double* x21, lapack_int ldx21,
-                               lapack_complex_double* x22, lapack_int ldx22, double* theta, double* phi,
-                               lapack_complex_double* taup1, lapack_complex_double* taup2, lapack_complex_double* tauq1,
-                               lapack_complex_double* tauq2, lapack_complex_double* work, lapack_int lwork);
-lapack_int LAPACKE_zuncsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
-                          lapack_int m, lapack_int p, lapack_int q, lapack_complex_double* x11, lapack_int ldx11,
-                          lapack_complex_double* x12, lapack_int ldx12, lapack_complex_double* x21, lapack_int ldx21,
-                          lapack_complex_double* x22, lapack_int ldx22, double* theta, lapack_complex_double* u1,
-                          lapack_int ldu1, lapack_complex_double* u2, lapack_int ldu2, lapack_complex_double* v1t,
-                          lapack_int ldv1t, lapack_complex_double* v2t, lapack_int ldv2t);
-lapack_int LAPACKE_zuncsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
-                               char signs, lapack_int m, lapack_int p, lapack_int q, lapack_complex_double* x11,
-                               lapack_int ldx11, lapack_complex_double* x12, lapack_int ldx12,
-                               lapack_complex_double* x21, lapack_int ldx21, lapack_complex_double* x22,
-                               lapack_int ldx22, double* theta, lapack_complex_double* u1, lapack_int ldu1,
-                               lapack_complex_double* u2, lapack_int ldu2, lapack_complex_double* v1t, lapack_int ldv1t,
-                               lapack_complex_double* v2t, lapack_int ldv2t, lapack_complex_double* work,
-                               lapack_int lwork, double* rwork, lapack_int lrwork, lapack_int* iwork);
-// LAPACK 3.4.0
-lapack_int LAPACKE_sgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int nb, const float* v, lapack_int ldv, const float* t, lapack_int ldt, float* c,
-                           lapack_int ldc);
-lapack_int LAPACKE_dgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int nb, const double* v, lapack_int ldv, const double* t, lapack_int ldt, double* c,
-                           lapack_int ldc);
-lapack_int LAPACKE_cgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int nb, const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* t,
-                           lapack_int ldt, lapack_complex_float* c, lapack_int ldc);
-lapack_int LAPACKE_zgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
-                           const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* c, lapack_int ldc);
-
-lapack_int LAPACKE_sgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, float* a, lapack_int lda,
-                          float* t, lapack_int ldt);
-lapack_int LAPACKE_dgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, double* a, lapack_int lda,
-                          double* t, lapack_int ldt);
-lapack_int LAPACKE_cgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_float* a,
-                          lapack_int lda, lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_zgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_double* a,
-                          lapack_int lda, lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_sgeqrt2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
-                           lapack_int ldt);
-lapack_int LAPACKE_dgeqrt2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
-                           lapack_int ldt);
-lapack_int LAPACKE_cgeqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                           lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_zgeqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                           lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_sgeqrt3(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
-                           lapack_int ldt);
-lapack_int LAPACKE_dgeqrt3(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
-                           lapack_int ldt);
-lapack_int LAPACKE_cgeqrt3(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                           lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_zgeqrt3(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                           lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_stpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int l, lapack_int nb, const float* v, lapack_int ldv, const float* t, lapack_int ldt,
-                           float* a, lapack_int lda, float* b, lapack_int ldb);
-lapack_int LAPACKE_dtpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int l, lapack_int nb, const double* v, lapack_int ldv, const double* t,
-                           lapack_int ldt, double* a, lapack_int lda, double* b, lapack_int ldb);
-lapack_int LAPACKE_ctpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int l, lapack_int nb, const lapack_complex_float* v, lapack_int ldv,
-                           const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
-                           lapack_complex_float* b, lapack_int ldb);
-lapack_int LAPACKE_ztpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                           lapack_int l, lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
-                           const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a, lapack_int lda,
-                           lapack_complex_double* b, lapack_int ldb);
-
-lapack_int LAPACKE_dtpqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb, double* a,
-                          lapack_int lda, double* b, lapack_int ldb, double* t, lapack_int ldt);
-lapack_int LAPACKE_ctpqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
-                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* t, lapack_complex_float* b,
-                          lapack_int ldb, lapack_int ldt);
-lapack_int LAPACKE_ztpqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
-                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                          lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_stpqrt2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* b,
-                           lapack_int ldb, float* t, lapack_int ldt);
-lapack_int LAPACKE_dtpqrt2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* b,
-                           lapack_int ldb, double* t, lapack_int ldt);
-lapack_int LAPACKE_ctpqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                           lapack_complex_float* b, lapack_int ldb, lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_ztpqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                           lapack_complex_double* b, lapack_int ldb, lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_stprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, lapack_int l, const float* v, lapack_int ldv, const float* t, lapack_int ldt,
-                          float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int myldwork);
-lapack_int LAPACKE_dtprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, lapack_int l, const double* v, lapack_int ldv, const double* t, lapack_int ldt,
-                          double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int myldwork);
-lapack_int LAPACKE_ctprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, lapack_int l, const lapack_complex_float* v, lapack_int ldv,
-                          const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
-                          lapack_complex_float* b, lapack_int ldb, lapack_int myldwork);
-lapack_int LAPACKE_ztprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
-                          lapack_int k, lapack_int l, const lapack_complex_double* v, lapack_int ldv,
-                          const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* b, lapack_int ldb, lapack_int myldwork);
-
-lapack_int LAPACKE_sgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int nb, const float* v, lapack_int ldv, const float* t, lapack_int ldt, float* c,
-                                lapack_int ldc, float* work);
-lapack_int LAPACKE_dgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int nb, const double* v, lapack_int ldv, const double* t, lapack_int ldt,
-                                double* c, lapack_int ldc, double* work);
-lapack_int LAPACKE_cgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int nb, const lapack_complex_float* v, lapack_int ldv,
-                                const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* c, lapack_int ldc,
-                                lapack_complex_float* work);
-lapack_int LAPACKE_zgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
-                                const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* c,
-                                lapack_int ldc, lapack_complex_double* work);
-
-lapack_int LAPACKE_sgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, float* a, lapack_int lda,
-                               float* t, lapack_int ldt, float* work);
-lapack_int LAPACKE_dgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, double* a, lapack_int lda,
-                               double* t, lapack_int ldt, double* work);
-lapack_int LAPACKE_cgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_float* a,
-                               lapack_int lda, lapack_complex_float* t, lapack_int ldt, lapack_complex_float* work);
-lapack_int LAPACKE_zgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_double* a,
-                               lapack_int lda, lapack_complex_double* t, lapack_int ldt, lapack_complex_double* work);
-
-lapack_int LAPACKE_sgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
-                                lapack_int ldt);
-lapack_int LAPACKE_dgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
-                                lapack_int ldt);
-lapack_int LAPACKE_cgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_zgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_sgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
-                                lapack_int ldt);
-lapack_int LAPACKE_dgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
-                                lapack_int ldt);
-lapack_int LAPACKE_cgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_zgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_stpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int l, lapack_int nb, const float* v, lapack_int ldv, const float* t,
-                                lapack_int ldt, float* a, lapack_int lda, float* b, lapack_int ldb, float* work);
-lapack_int LAPACKE_dtpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int l, lapack_int nb, const double* v, lapack_int ldv, const double* t,
-                                lapack_int ldt, double* a, lapack_int lda, double* b, lapack_int ldb, double* work);
-lapack_int LAPACKE_ctpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int l, lapack_int nb, const lapack_complex_float* v, lapack_int ldv,
-                                const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
-                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work);
-lapack_int LAPACKE_ztpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
-                                lapack_int l, lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
-                                const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a,
-                                lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work);
-
-lapack_int LAPACKE_dtpqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb, double* a,
-                               lapack_int lda, double* b, lapack_int ldb, double* t, lapack_int ldt, double* work);
-lapack_int LAPACKE_ctpqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
-                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* t,
-                               lapack_complex_float* b, lapack_int ldb, lapack_int ldt, lapack_complex_float* work);
-lapack_int LAPACKE_ztpqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
-                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
-                               lapack_complex_double* t, lapack_int ldt, lapack_complex_double* work);
-
-lapack_int LAPACKE_stpqrt2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* b,
-                                lapack_int ldb, float* t, lapack_int ldt);
-lapack_int LAPACKE_dtpqrt2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* b,
-                                lapack_int ldb, double* t, lapack_int ldt);
-lapack_int LAPACKE_ctpqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
-                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* t, lapack_int ldt);
-lapack_int LAPACKE_ztpqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
-                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* t, lapack_int ldt);
-
-lapack_int LAPACKE_stprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, lapack_int l, const float* v, lapack_int ldv, const float* t,
-                               lapack_int ldt, float* a, lapack_int lda, float* b, lapack_int ldb, const float* mywork,
-                               lapack_int myldwork);
-lapack_int LAPACKE_dtprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, lapack_int l, const double* v, lapack_int ldv,
-                               const double* t, lapack_int ldt, double* a, lapack_int lda, double* b, lapack_int ldb,
-                               const double* mywork, lapack_int myldwork);
-lapack_int LAPACKE_ctprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, lapack_int l, const lapack_complex_float* v, lapack_int ldv,
-                               const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
-                               lapack_complex_float* b, lapack_int ldb, const float* mywork, lapack_int myldwork);
-lapack_int LAPACKE_ztprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
-                               lapack_int n, lapack_int k, lapack_int l, const lapack_complex_double* v, lapack_int ldv,
-                               const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a, lapack_int lda,
-                               lapack_complex_double* b, lapack_int ldb, const double* mywork, lapack_int myldwork);
-// LAPACK 3.X.X
-lapack_int LAPACKE_csyr(int matrix_order, char uplo, lapack_int n, lapack_complex_float alpha,
-                        const lapack_complex_float* x, lapack_int incx, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zsyr(int matrix_order, char uplo, lapack_int n, lapack_complex_double alpha,
-                        const lapack_complex_double* x, lapack_int incx, lapack_complex_double* a, lapack_int lda);
-
-lapack_int LAPACKE_csyr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float alpha,
-                             const lapack_complex_float* x, lapack_int incx, lapack_complex_float* a, lapack_int lda);
-lapack_int LAPACKE_zsyr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double alpha,
-                             const lapack_complex_double* x, lapack_int incx, lapack_complex_double* a, lapack_int lda);
-
-#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf, SGETRF)
-#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf, DGETRF)
-#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf, CGETRF)
-#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf, ZGETRF)
-#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf, SGBTRF)
-#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf, DGBTRF)
-#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf, CGBTRF)
-#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf, ZGBTRF)
-#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf, SGTTRF)
-#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf, DGTTRF)
-#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf, CGTTRF)
-#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf, ZGTTRF)
-#define LAPACK_spotrf LAPACK_GLOBAL(spotrf, SPOTRF)
-#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf, DPOTRF)
-#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf, CPOTRF)
-#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf, ZPOTRF)
-#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf, DPSTRF)
-#define LAPACK_spstrf LAPACK_GLOBAL(spstrf, SPSTRF)
-#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf, ZPSTRF)
-#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf, CPSTRF)
-#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf, DPFTRF)
-#define LAPACK_spftrf LAPACK_GLOBAL(spftrf, SPFTRF)
-#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf, ZPFTRF)
-#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf, CPFTRF)
-#define LAPACK_spptrf LAPACK_GLOBAL(spptrf, SPPTRF)
-#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf, DPPTRF)
-#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf, CPPTRF)
-#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf, ZPPTRF)
-#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf, SPBTRF)
-#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf, DPBTRF)
-#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf, CPBTRF)
-#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf, ZPBTRF)
-#define LAPACK_spttrf LAPACK_GLOBAL(spttrf, SPTTRF)
-#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf, DPTTRF)
-#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf, CPTTRF)
-#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf, ZPTTRF)
-#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf, SSYTRF)
-#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf, DSYTRF)
-#define LAPACK_csytrf LAPACK_GLOBAL(csytrf, CSYTRF)
-#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf, ZSYTRF)
-#define LAPACK_chetrf LAPACK_GLOBAL(chetrf, CHETRF)
-#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf, ZHETRF)
-#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf, SSPTRF)
-#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf, DSPTRF)
-#define LAPACK_csptrf LAPACK_GLOBAL(csptrf, CSPTRF)
-#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf, ZSPTRF)
-#define LAPACK_chptrf LAPACK_GLOBAL(chptrf, CHPTRF)
-#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf, ZHPTRF)
-#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs, SGETRS)
-#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs, DGETRS)
-#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs, CGETRS)
-#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs, ZGETRS)
-#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs, SGBTRS)
-#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs, DGBTRS)
-#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs, CGBTRS)
-#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs, ZGBTRS)
-#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs, SGTTRS)
-#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs, DGTTRS)
-#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs, CGTTRS)
-#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs, ZGTTRS)
-#define LAPACK_spotrs LAPACK_GLOBAL(spotrs, SPOTRS)
-#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs, DPOTRS)
-#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs, CPOTRS)
-#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs, ZPOTRS)
-#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs, DPFTRS)
-#define LAPACK_spftrs LAPACK_GLOBAL(spftrs, SPFTRS)
-#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs, ZPFTRS)
-#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs, CPFTRS)
-#define LAPACK_spptrs LAPACK_GLOBAL(spptrs, SPPTRS)
-#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs, DPPTRS)
-#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs, CPPTRS)
-#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs, ZPPTRS)
-#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs, SPBTRS)
-#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs, DPBTRS)
-#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs, CPBTRS)
-#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs, ZPBTRS)
-#define LAPACK_spttrs LAPACK_GLOBAL(spttrs, SPTTRS)
-#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs, DPTTRS)
-#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs, CPTTRS)
-#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs, ZPTTRS)
-#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs, SSYTRS)
-#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs, DSYTRS)
-#define LAPACK_csytrs LAPACK_GLOBAL(csytrs, CSYTRS)
-#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs, ZSYTRS)
-#define LAPACK_chetrs LAPACK_GLOBAL(chetrs, CHETRS)
-#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs, ZHETRS)
-#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs, SSPTRS)
-#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs, DSPTRS)
-#define LAPACK_csptrs LAPACK_GLOBAL(csptrs, CSPTRS)
-#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs, ZSPTRS)
-#define LAPACK_chptrs LAPACK_GLOBAL(chptrs, CHPTRS)
-#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs, ZHPTRS)
-#define LAPACK_strtrs LAPACK_GLOBAL(strtrs, STRTRS)
-#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs, DTRTRS)
-#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs, CTRTRS)
-#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs, ZTRTRS)
-#define LAPACK_stptrs LAPACK_GLOBAL(stptrs, STPTRS)
-#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs, DTPTRS)
-#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs, CTPTRS)
-#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs, ZTPTRS)
-#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs, STBTRS)
-#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs, DTBTRS)
-#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs, CTBTRS)
-#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs, ZTBTRS)
-#define LAPACK_sgecon LAPACK_GLOBAL(sgecon, SGECON)
-#define LAPACK_dgecon LAPACK_GLOBAL(dgecon, DGECON)
-#define LAPACK_cgecon LAPACK_GLOBAL(cgecon, CGECON)
-#define LAPACK_zgecon LAPACK_GLOBAL(zgecon, ZGECON)
-#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon, SGBCON)
-#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon, DGBCON)
-#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon, CGBCON)
-#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon, ZGBCON)
-#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon, SGTCON)
-#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon, DGTCON)
-#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon, CGTCON)
-#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon, ZGTCON)
-#define LAPACK_spocon LAPACK_GLOBAL(spocon, SPOCON)
-#define LAPACK_dpocon LAPACK_GLOBAL(dpocon, DPOCON)
-#define LAPACK_cpocon LAPACK_GLOBAL(cpocon, CPOCON)
-#define LAPACK_zpocon LAPACK_GLOBAL(zpocon, ZPOCON)
-#define LAPACK_sppcon LAPACK_GLOBAL(sppcon, SPPCON)
-#define LAPACK_dppcon LAPACK_GLOBAL(dppcon, DPPCON)
-#define LAPACK_cppcon LAPACK_GLOBAL(cppcon, CPPCON)
-#define LAPACK_zppcon LAPACK_GLOBAL(zppcon, ZPPCON)
-#define LAPACK_spbcon LAPACK_GLOBAL(spbcon, SPBCON)
-#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon, DPBCON)
-#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon, CPBCON)
-#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon, ZPBCON)
-#define LAPACK_sptcon LAPACK_GLOBAL(sptcon, SPTCON)
-#define LAPACK_dptcon LAPACK_GLOBAL(dptcon, DPTCON)
-#define LAPACK_cptcon LAPACK_GLOBAL(cptcon, CPTCON)
-#define LAPACK_zptcon LAPACK_GLOBAL(zptcon, ZPTCON)
-#define LAPACK_ssycon LAPACK_GLOBAL(ssycon, SSYCON)
-#define LAPACK_dsycon LAPACK_GLOBAL(dsycon, DSYCON)
-#define LAPACK_csycon LAPACK_GLOBAL(csycon, CSYCON)
-#define LAPACK_zsycon LAPACK_GLOBAL(zsycon, ZSYCON)
-#define LAPACK_checon LAPACK_GLOBAL(checon, CHECON)
-#define LAPACK_zhecon LAPACK_GLOBAL(zhecon, ZHECON)
-#define LAPACK_sspcon LAPACK_GLOBAL(sspcon, SSPCON)
-#define LAPACK_dspcon LAPACK_GLOBAL(dspcon, DSPCON)
-#define LAPACK_cspcon LAPACK_GLOBAL(cspcon, CSPCON)
-#define LAPACK_zspcon LAPACK_GLOBAL(zspcon, ZSPCON)
-#define LAPACK_chpcon LAPACK_GLOBAL(chpcon, CHPCON)
-#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon, ZHPCON)
-#define LAPACK_strcon LAPACK_GLOBAL(strcon, STRCON)
-#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon, DTRCON)
-#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon, CTRCON)
-#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon, ZTRCON)
-#define LAPACK_stpcon LAPACK_GLOBAL(stpcon, STPCON)
-#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon, DTPCON)
-#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon, CTPCON)
-#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon, ZTPCON)
-#define LAPACK_stbcon LAPACK_GLOBAL(stbcon, STBCON)
-#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon, DTBCON)
-#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon, CTBCON)
-#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon, ZTBCON)
-#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs, SGERFS)
-#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs, DGERFS)
-#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs, CGERFS)
-#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs, ZGERFS)
-#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx, DGERFSX)
-#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx, SGERFSX)
-#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx, ZGERFSX)
-#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx, CGERFSX)
-#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs, SGBRFS)
-#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs, DGBRFS)
-#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs, CGBRFS)
-#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs, ZGBRFS)
-#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx, DGBRFSX)
-#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx, SGBRFSX)
-#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx, ZGBRFSX)
-#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx, CGBRFSX)
-#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs, SGTRFS)
-#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs, DGTRFS)
-#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs, CGTRFS)
-#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs, ZGTRFS)
-#define LAPACK_sporfs LAPACK_GLOBAL(sporfs, SPORFS)
-#define LAPACK_dporfs LAPACK_GLOBAL(dporfs, DPORFS)
-#define LAPACK_cporfs LAPACK_GLOBAL(cporfs, CPORFS)
-#define LAPACK_zporfs LAPACK_GLOBAL(zporfs, ZPORFS)
-#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx, DPORFSX)
-#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx, SPORFSX)
-#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx, ZPORFSX)
-#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx, CPORFSX)
-#define LAPACK_spprfs LAPACK_GLOBAL(spprfs, SPPRFS)
-#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs, DPPRFS)
-#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs, CPPRFS)
-#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs, ZPPRFS)
-#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs, SPBRFS)
-#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs, DPBRFS)
-#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs, CPBRFS)
-#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs, ZPBRFS)
-#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs, SPTRFS)
-#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs, DPTRFS)
-#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs, CPTRFS)
-#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs, ZPTRFS)
-#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs, SSYRFS)
-#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs, DSYRFS)
-#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs, CSYRFS)
-#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs, ZSYRFS)
-#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx, DSYRFSX)
-#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx, SSYRFSX)
-#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx, ZSYRFSX)
-#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx, CSYRFSX)
-#define LAPACK_cherfs LAPACK_GLOBAL(cherfs, CHERFS)
-#define LAPACK_zherfs LAPACK_GLOBAL(zherfs, ZHERFS)
-#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx, ZHERFSX)
-#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx, CHERFSX)
-#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs, SSPRFS)
-#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs, DSPRFS)
-#define LAPACK_csprfs LAPACK_GLOBAL(csprfs, CSPRFS)
-#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs, ZSPRFS)
-#define LAPACK_chprfs LAPACK_GLOBAL(chprfs, CHPRFS)
-#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs, ZHPRFS)
-#define LAPACK_strrfs LAPACK_GLOBAL(strrfs, STRRFS)
-#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs, DTRRFS)
-#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs, CTRRFS)
-#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs, ZTRRFS)
-#define LAPACK_stprfs LAPACK_GLOBAL(stprfs, STPRFS)
-#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs, DTPRFS)
-#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs, CTPRFS)
-#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs, ZTPRFS)
-#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs, STBRFS)
-#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs, DTBRFS)
-#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs, CTBRFS)
-#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs, ZTBRFS)
-#define LAPACK_sgetri LAPACK_GLOBAL(sgetri, SGETRI)
-#define LAPACK_dgetri LAPACK_GLOBAL(dgetri, DGETRI)
-#define LAPACK_cgetri LAPACK_GLOBAL(cgetri, CGETRI)
-#define LAPACK_zgetri LAPACK_GLOBAL(zgetri, ZGETRI)
-#define LAPACK_spotri LAPACK_GLOBAL(spotri, SPOTRI)
-#define LAPACK_dpotri LAPACK_GLOBAL(dpotri, DPOTRI)
-#define LAPACK_cpotri LAPACK_GLOBAL(cpotri, CPOTRI)
-#define LAPACK_zpotri LAPACK_GLOBAL(zpotri, ZPOTRI)
-#define LAPACK_dpftri LAPACK_GLOBAL(dpftri, DPFTRI)
-#define LAPACK_spftri LAPACK_GLOBAL(spftri, SPFTRI)
-#define LAPACK_zpftri LAPACK_GLOBAL(zpftri, ZPFTRI)
-#define LAPACK_cpftri LAPACK_GLOBAL(cpftri, CPFTRI)
-#define LAPACK_spptri LAPACK_GLOBAL(spptri, SPPTRI)
-#define LAPACK_dpptri LAPACK_GLOBAL(dpptri, DPPTRI)
-#define LAPACK_cpptri LAPACK_GLOBAL(cpptri, CPPTRI)
-#define LAPACK_zpptri LAPACK_GLOBAL(zpptri, ZPPTRI)
-#define LAPACK_ssytri LAPACK_GLOBAL(ssytri, SSYTRI)
-#define LAPACK_dsytri LAPACK_GLOBAL(dsytri, DSYTRI)
-#define LAPACK_csytri LAPACK_GLOBAL(csytri, CSYTRI)
-#define LAPACK_zsytri LAPACK_GLOBAL(zsytri, ZSYTRI)
-#define LAPACK_chetri LAPACK_GLOBAL(chetri, CHETRI)
-#define LAPACK_zhetri LAPACK_GLOBAL(zhetri, ZHETRI)
-#define LAPACK_ssptri LAPACK_GLOBAL(ssptri, SSPTRI)
-#define LAPACK_dsptri LAPACK_GLOBAL(dsptri, DSPTRI)
-#define LAPACK_csptri LAPACK_GLOBAL(csptri, CSPTRI)
-#define LAPACK_zsptri LAPACK_GLOBAL(zsptri, ZSPTRI)
-#define LAPACK_chptri LAPACK_GLOBAL(chptri, CHPTRI)
-#define LAPACK_zhptri LAPACK_GLOBAL(zhptri, ZHPTRI)
-#define LAPACK_strtri LAPACK_GLOBAL(strtri, STRTRI)
-#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri, DTRTRI)
-#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri, CTRTRI)
-#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri, ZTRTRI)
-#define LAPACK_dtftri LAPACK_GLOBAL(dtftri, DTFTRI)
-#define LAPACK_stftri LAPACK_GLOBAL(stftri, STFTRI)
-#define LAPACK_ztftri LAPACK_GLOBAL(ztftri, ZTFTRI)
-#define LAPACK_ctftri LAPACK_GLOBAL(ctftri, CTFTRI)
-#define LAPACK_stptri LAPACK_GLOBAL(stptri, STPTRI)
-#define LAPACK_dtptri LAPACK_GLOBAL(dtptri, DTPTRI)
-#define LAPACK_ctptri LAPACK_GLOBAL(ctptri, CTPTRI)
-#define LAPACK_ztptri LAPACK_GLOBAL(ztptri, ZTPTRI)
-#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ, SGEEQU)
-#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ, DGEEQU)
-#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ, CGEEQU)
-#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ, ZGEEQU)
-#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb, DGEEQUB)
-#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb, SGEEQUB)
-#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb, ZGEEQUB)
-#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb, CGEEQUB)
-#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ, SGBEQU)
-#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ, DGBEQU)
-#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ, CGBEQU)
-#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ, ZGBEQU)
-#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb, DGBEQUB)
-#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb, SGBEQUB)
-#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb, ZGBEQUB)
-#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb, CGBEQUB)
-#define LAPACK_spoequ LAPACK_GLOBAL(spoequ, SPOEQU)
-#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ, DPOEQU)
-#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ, CPOEQU)
-#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ, ZPOEQU)
-#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb, DPOEQUB)
-#define LAPACK_spoequb LAPACK_GLOBAL(spoequb, SPOEQUB)
-#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb, ZPOEQUB)
-#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb, CPOEQUB)
-#define LAPACK_sppequ LAPACK_GLOBAL(sppequ, SPPEQU)
-#define LAPACK_dppequ LAPACK_GLOBAL(dppequ, DPPEQU)
-#define LAPACK_cppequ LAPACK_GLOBAL(cppequ, CPPEQU)
-#define LAPACK_zppequ LAPACK_GLOBAL(zppequ, ZPPEQU)
-#define LAPACK_spbequ LAPACK_GLOBAL(spbequ, SPBEQU)
-#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ, DPBEQU)
-#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ, CPBEQU)
-#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ, ZPBEQU)
-#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb, DSYEQUB)
-#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb, SSYEQUB)
-#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb, ZSYEQUB)
-#define LAPACK_csyequb LAPACK_GLOBAL(csyequb, CSYEQUB)
-#define LAPACK_zheequb LAPACK_GLOBAL(zheequb, ZHEEQUB)
-#define LAPACK_cheequb LAPACK_GLOBAL(cheequb, CHEEQUB)
-#define LAPACK_sgesv LAPACK_GLOBAL(sgesv, SGESV)
-#define LAPACK_dgesv LAPACK_GLOBAL(dgesv, DGESV)
-#define LAPACK_cgesv LAPACK_GLOBAL(cgesv, CGESV)
-#define LAPACK_zgesv LAPACK_GLOBAL(zgesv, ZGESV)
-#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv, DSGESV)
-#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv, ZCGESV)
-#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx, SGESVX)
-#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx, DGESVX)
-#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx, CGESVX)
-#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx, ZGESVX)
-#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx, DGESVXX)
-#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx, SGESVXX)
-#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx, ZGESVXX)
-#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx, CGESVXX)
-#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv, SGBSV)
-#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv, DGBSV)
-#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv, CGBSV)
-#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv, ZGBSV)
-#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx, SGBSVX)
-#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx, DGBSVX)
-#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx, CGBSVX)
-#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx, ZGBSVX)
-#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx, DGBSVXX)
-#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx, SGBSVXX)
-#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx, ZGBSVXX)
-#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx, CGBSVXX)
-#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv, SGTSV)
-#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv, DGTSV)
-#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv, CGTSV)
-#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv, ZGTSV)
-#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx, SGTSVX)
-#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx, DGTSVX)
-#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx, CGTSVX)
-#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx, ZGTSVX)
-#define LAPACK_sposv LAPACK_GLOBAL(sposv, SPOSV)
-#define LAPACK_dposv LAPACK_GLOBAL(dposv, DPOSV)
-#define LAPACK_cposv LAPACK_GLOBAL(cposv, CPOSV)
-#define LAPACK_zposv LAPACK_GLOBAL(zposv, ZPOSV)
-#define LAPACK_dsposv LAPACK_GLOBAL(dsposv, DSPOSV)
-#define LAPACK_zcposv LAPACK_GLOBAL(zcposv, ZCPOSV)
-#define LAPACK_sposvx LAPACK_GLOBAL(sposvx, SPOSVX)
-#define LAPACK_dposvx LAPACK_GLOBAL(dposvx, DPOSVX)
-#define LAPACK_cposvx LAPACK_GLOBAL(cposvx, CPOSVX)
-#define LAPACK_zposvx LAPACK_GLOBAL(zposvx, ZPOSVX)
-#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx, DPOSVXX)
-#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx, SPOSVXX)
-#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx, ZPOSVXX)
-#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx, CPOSVXX)
-#define LAPACK_sppsv LAPACK_GLOBAL(sppsv, SPPSV)
-#define LAPACK_dppsv LAPACK_GLOBAL(dppsv, DPPSV)
-#define LAPACK_cppsv LAPACK_GLOBAL(cppsv, CPPSV)
-#define LAPACK_zppsv LAPACK_GLOBAL(zppsv, ZPPSV)
-#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx, SPPSVX)
-#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx, DPPSVX)
-#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx, CPPSVX)
-#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx, ZPPSVX)
-#define LAPACK_spbsv LAPACK_GLOBAL(spbsv, SPBSV)
-#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv, DPBSV)
-#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv, CPBSV)
-#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv, ZPBSV)
-#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx, SPBSVX)
-#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx, DPBSVX)
-#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx, CPBSVX)
-#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx, ZPBSVX)
-#define LAPACK_sptsv LAPACK_GLOBAL(sptsv, SPTSV)
-#define LAPACK_dptsv LAPACK_GLOBAL(dptsv, DPTSV)
-#define LAPACK_cptsv LAPACK_GLOBAL(cptsv, CPTSV)
-#define LAPACK_zptsv LAPACK_GLOBAL(zptsv, ZPTSV)
-#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx, SPTSVX)
-#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx, DPTSVX)
-#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx, CPTSVX)
-#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx, ZPTSVX)
-#define LAPACK_ssysv LAPACK_GLOBAL(ssysv, SSYSV)
-#define LAPACK_dsysv LAPACK_GLOBAL(dsysv, DSYSV)
-#define LAPACK_csysv LAPACK_GLOBAL(csysv, CSYSV)
-#define LAPACK_zsysv LAPACK_GLOBAL(zsysv, ZSYSV)
-#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx, SSYSVX)
-#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx, DSYSVX)
-#define LAPACK_csysvx LAPACK_GLOBAL(csysvx, CSYSVX)
-#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx, ZSYSVX)
-#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx, DSYSVXX)
-#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx, SSYSVXX)
-#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx, ZSYSVXX)
-#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx, CSYSVXX)
-#define LAPACK_chesv LAPACK_GLOBAL(chesv, CHESV)
-#define LAPACK_zhesv LAPACK_GLOBAL(zhesv, ZHESV)
-#define LAPACK_chesvx LAPACK_GLOBAL(chesvx, CHESVX)
-#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx, ZHESVX)
-#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx, ZHESVXX)
-#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx, CHESVXX)
-#define LAPACK_sspsv LAPACK_GLOBAL(sspsv, SSPSV)
-#define LAPACK_dspsv LAPACK_GLOBAL(dspsv, DSPSV)
-#define LAPACK_cspsv LAPACK_GLOBAL(cspsv, CSPSV)
-#define LAPACK_zspsv LAPACK_GLOBAL(zspsv, ZSPSV)
-#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx, SSPSVX)
-#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx, DSPSVX)
-#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx, CSPSVX)
-#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx, ZSPSVX)
-#define LAPACK_chpsv LAPACK_GLOBAL(chpsv, CHPSV)
-#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv, ZHPSV)
-#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx, CHPSVX)
-#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx, ZHPSVX)
-#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf, SGEQRF)
-#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf, DGEQRF)
-#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf, CGEQRF)
-#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf, ZGEQRF)
-#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf, SGEQPF)
-#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf, DGEQPF)
-#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf, CGEQPF)
-#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf, ZGEQPF)
-#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3, SGEQP3)
-#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3, DGEQP3)
-#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3, CGEQP3)
-#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3, ZGEQP3)
-#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr, SORGQR)
-#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr, DORGQR)
-#define LAPACK_sormqr LAPACK_GLOBAL(sormqr, SORMQR)
-#define LAPACK_dormqr LAPACK_GLOBAL(dormqr, DORMQR)
-#define LAPACK_cungqr LAPACK_GLOBAL(cungqr, CUNGQR)
-#define LAPACK_zungqr LAPACK_GLOBAL(zungqr, ZUNGQR)
-#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr, CUNMQR)
-#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr, ZUNMQR)
-#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf, SGELQF)
-#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf, DGELQF)
-#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf, CGELQF)
-#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf, ZGELQF)
-#define LAPACK_sorglq LAPACK_GLOBAL(sorglq, SORGLQ)
-#define LAPACK_dorglq LAPACK_GLOBAL(dorglq, DORGLQ)
-#define LAPACK_sormlq LAPACK_GLOBAL(sormlq, SORMLQ)
-#define LAPACK_dormlq LAPACK_GLOBAL(dormlq, DORMLQ)
-#define LAPACK_cunglq LAPACK_GLOBAL(cunglq, CUNGLQ)
-#define LAPACK_zunglq LAPACK_GLOBAL(zunglq, ZUNGLQ)
-#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq, CUNMLQ)
-#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq, ZUNMLQ)
-#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf, SGEQLF)
-#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf, DGEQLF)
-#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf, CGEQLF)
-#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf, ZGEQLF)
-#define LAPACK_sorgql LAPACK_GLOBAL(sorgql, SORGQL)
-#define LAPACK_dorgql LAPACK_GLOBAL(dorgql, DORGQL)
-#define LAPACK_cungql LAPACK_GLOBAL(cungql, CUNGQL)
-#define LAPACK_zungql LAPACK_GLOBAL(zungql, ZUNGQL)
-#define LAPACK_sormql LAPACK_GLOBAL(sormql, SORMQL)
-#define LAPACK_dormql LAPACK_GLOBAL(dormql, DORMQL)
-#define LAPACK_cunmql LAPACK_GLOBAL(cunmql, CUNMQL)
-#define LAPACK_zunmql LAPACK_GLOBAL(zunmql, ZUNMQL)
-#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf, SGERQF)
-#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf, DGERQF)
-#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf, CGERQF)
-#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf, ZGERQF)
-#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq, SORGRQ)
-#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq, DORGRQ)
-#define LAPACK_cungrq LAPACK_GLOBAL(cungrq, CUNGRQ)
-#define LAPACK_zungrq LAPACK_GLOBAL(zungrq, ZUNGRQ)
-#define LAPACK_sormrq LAPACK_GLOBAL(sormrq, SORMRQ)
-#define LAPACK_dormrq LAPACK_GLOBAL(dormrq, DORMRQ)
-#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq, CUNMRQ)
-#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq, ZUNMRQ)
-#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf, STZRZF)
-#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf, DTZRZF)
-#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf, CTZRZF)
-#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf, ZTZRZF)
-#define LAPACK_sormrz LAPACK_GLOBAL(sormrz, SORMRZ)
-#define LAPACK_dormrz LAPACK_GLOBAL(dormrz, DORMRZ)
-#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz, CUNMRZ)
-#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz, ZUNMRZ)
-#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf, SGGQRF)
-#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf, DGGQRF)
-#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf, CGGQRF)
-#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf, ZGGQRF)
-#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf, SGGRQF)
-#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf, DGGRQF)
-#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf, CGGRQF)
-#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf, ZGGRQF)
-#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd, SGEBRD)
-#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd, DGEBRD)
-#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd, CGEBRD)
-#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd, ZGEBRD)
-#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd, SGBBRD)
-#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd, DGBBRD)
-#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd, CGBBRD)
-#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd, ZGBBRD)
-#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr, SORGBR)
-#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr, DORGBR)
-#define LAPACK_sormbr LAPACK_GLOBAL(sormbr, SORMBR)
-#define LAPACK_dormbr LAPACK_GLOBAL(dormbr, DORMBR)
-#define LAPACK_cungbr LAPACK_GLOBAL(cungbr, CUNGBR)
-#define LAPACK_zungbr LAPACK_GLOBAL(zungbr, ZUNGBR)
-#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr, CUNMBR)
-#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr, ZUNMBR)
-#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr, SBDSQR)
-#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr, DBDSQR)
-#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr, CBDSQR)
-#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr, ZBDSQR)
-#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc, SBDSDC)
-#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc, DBDSDC)
-#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd, SSYTRD)
-#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd, DSYTRD)
-#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr, SORGTR)
-#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr, DORGTR)
-#define LAPACK_sormtr LAPACK_GLOBAL(sormtr, SORMTR)
-#define LAPACK_dormtr LAPACK_GLOBAL(dormtr, DORMTR)
-#define LAPACK_chetrd LAPACK_GLOBAL(chetrd, CHETRD)
-#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd, ZHETRD)
-#define LAPACK_cungtr LAPACK_GLOBAL(cungtr, CUNGTR)
-#define LAPACK_zungtr LAPACK_GLOBAL(zungtr, ZUNGTR)
-#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr, CUNMTR)
-#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr, ZUNMTR)
-#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd, SSPTRD)
-#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd, DSPTRD)
-#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr, SOPGTR)
-#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr, DOPGTR)
-#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr, SOPMTR)
-#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr, DOPMTR)
-#define LAPACK_chptrd LAPACK_GLOBAL(chptrd, CHPTRD)
-#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd, ZHPTRD)
-#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr, CUPGTR)
-#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr, ZUPGTR)
-#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr, CUPMTR)
-#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr, ZUPMTR)
-#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd, SSBTRD)
-#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd, DSBTRD)
-#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd, CHBTRD)
-#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd, ZHBTRD)
-#define LAPACK_ssterf LAPACK_GLOBAL(ssterf, SSTERF)
-#define LAPACK_dsterf LAPACK_GLOBAL(dsterf, DSTERF)
-#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr, SSTEQR)
-#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr, DSTEQR)
-#define LAPACK_csteqr LAPACK_GLOBAL(csteqr, CSTEQR)
-#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr, ZSTEQR)
-#define LAPACK_sstemr LAPACK_GLOBAL(sstemr, SSTEMR)
-#define LAPACK_dstemr LAPACK_GLOBAL(dstemr, DSTEMR)
-#define LAPACK_cstemr LAPACK_GLOBAL(cstemr, CSTEMR)
-#define LAPACK_zstemr LAPACK_GLOBAL(zstemr, ZSTEMR)
-#define LAPACK_sstedc LAPACK_GLOBAL(sstedc, SSTEDC)
-#define LAPACK_dstedc LAPACK_GLOBAL(dstedc, DSTEDC)
-#define LAPACK_cstedc LAPACK_GLOBAL(cstedc, CSTEDC)
-#define LAPACK_zstedc LAPACK_GLOBAL(zstedc, ZSTEDC)
-#define LAPACK_sstegr LAPACK_GLOBAL(sstegr, SSTEGR)
-#define LAPACK_dstegr LAPACK_GLOBAL(dstegr, DSTEGR)
-#define LAPACK_cstegr LAPACK_GLOBAL(cstegr, CSTEGR)
-#define LAPACK_zstegr LAPACK_GLOBAL(zstegr, ZSTEGR)
-#define LAPACK_spteqr LAPACK_GLOBAL(spteqr, SPTEQR)
-#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr, DPTEQR)
-#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr, CPTEQR)
-#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr, ZPTEQR)
-#define LAPACK_sstebz LAPACK_GLOBAL(sstebz, SSTEBZ)
-#define LAPACK_dstebz LAPACK_GLOBAL(dstebz, DSTEBZ)
-#define LAPACK_sstein LAPACK_GLOBAL(sstein, SSTEIN)
-#define LAPACK_dstein LAPACK_GLOBAL(dstein, DSTEIN)
-#define LAPACK_cstein LAPACK_GLOBAL(cstein, CSTEIN)
-#define LAPACK_zstein LAPACK_GLOBAL(zstein, ZSTEIN)
-#define LAPACK_sdisna LAPACK_GLOBAL(sdisna, SDISNA)
-#define LAPACK_ddisna LAPACK_GLOBAL(ddisna, DDISNA)
-#define LAPACK_ssygst LAPACK_GLOBAL(ssygst, SSYGST)
-#define LAPACK_dsygst LAPACK_GLOBAL(dsygst, DSYGST)
-#define LAPACK_chegst LAPACK_GLOBAL(chegst, CHEGST)
-#define LAPACK_zhegst LAPACK_GLOBAL(zhegst, ZHEGST)
-#define LAPACK_sspgst LAPACK_GLOBAL(sspgst, SSPGST)
-#define LAPACK_dspgst LAPACK_GLOBAL(dspgst, DSPGST)
-#define LAPACK_chpgst LAPACK_GLOBAL(chpgst, CHPGST)
-#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst, ZHPGST)
-#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst, SSBGST)
-#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst, DSBGST)
-#define LAPACK_chbgst LAPACK_GLOBAL(chbgst, CHBGST)
-#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst, ZHBGST)
-#define LAPACK_spbstf LAPACK_GLOBAL(spbstf, SPBSTF)
-#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf, DPBSTF)
-#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf, CPBSTF)
-#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf, ZPBSTF)
-#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd, SGEHRD)
-#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd, DGEHRD)
-#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd, CGEHRD)
-#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd, ZGEHRD)
-#define LAPACK_sorghr LAPACK_GLOBAL(sorghr, SORGHR)
-#define LAPACK_dorghr LAPACK_GLOBAL(dorghr, DORGHR)
-#define LAPACK_sormhr LAPACK_GLOBAL(sormhr, SORMHR)
-#define LAPACK_dormhr LAPACK_GLOBAL(dormhr, DORMHR)
-#define LAPACK_cunghr LAPACK_GLOBAL(cunghr, CUNGHR)
-#define LAPACK_zunghr LAPACK_GLOBAL(zunghr, ZUNGHR)
-#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr, CUNMHR)
-#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr, ZUNMHR)
-#define LAPACK_sgebal LAPACK_GLOBAL(sgebal, SGEBAL)
-#define LAPACK_dgebal LAPACK_GLOBAL(dgebal, DGEBAL)
-#define LAPACK_cgebal LAPACK_GLOBAL(cgebal, CGEBAL)
-#define LAPACK_zgebal LAPACK_GLOBAL(zgebal, ZGEBAL)
-#define LAPACK_sgebak LAPACK_GLOBAL(sgebak, SGEBAK)
-#define LAPACK_dgebak LAPACK_GLOBAL(dgebak, DGEBAK)
-#define LAPACK_cgebak LAPACK_GLOBAL(cgebak, CGEBAK)
-#define LAPACK_zgebak LAPACK_GLOBAL(zgebak, ZGEBAK)
-#define LAPACK_shseqr LAPACK_GLOBAL(shseqr, SHSEQR)
-#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr, DHSEQR)
-#define LAPACK_chseqr LAPACK_GLOBAL(chseqr, CHSEQR)
-#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr, ZHSEQR)
-#define LAPACK_shsein LAPACK_GLOBAL(shsein, SHSEIN)
-#define LAPACK_dhsein LAPACK_GLOBAL(dhsein, DHSEIN)
-#define LAPACK_chsein LAPACK_GLOBAL(chsein, CHSEIN)
-#define LAPACK_zhsein LAPACK_GLOBAL(zhsein, ZHSEIN)
-#define LAPACK_strevc LAPACK_GLOBAL(strevc, STREVC)
-#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc, DTREVC)
-#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc, CTREVC)
-#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc, ZTREVC)
-#define LAPACK_strsna LAPACK_GLOBAL(strsna, STRSNA)
-#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna, DTRSNA)
-#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna, CTRSNA)
-#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna, ZTRSNA)
-#define LAPACK_strexc LAPACK_GLOBAL(strexc, STREXC)
-#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc, DTREXC)
-#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc, CTREXC)
-#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc, ZTREXC)
-#define LAPACK_strsen LAPACK_GLOBAL(strsen, STRSEN)
-#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen, DTRSEN)
-#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen, CTRSEN)
-#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen, ZTRSEN)
-#define LAPACK_strsyl LAPACK_GLOBAL(strsyl, STRSYL)
-#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl, DTRSYL)
-#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl, CTRSYL)
-#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl, ZTRSYL)
-#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd, SGGHRD)
-#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd, DGGHRD)
-#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd, CGGHRD)
-#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd, ZGGHRD)
-#define LAPACK_sggbal LAPACK_GLOBAL(sggbal, SGGBAL)
-#define LAPACK_dggbal LAPACK_GLOBAL(dggbal, DGGBAL)
-#define LAPACK_cggbal LAPACK_GLOBAL(cggbal, CGGBAL)
-#define LAPACK_zggbal LAPACK_GLOBAL(zggbal, ZGGBAL)
-#define LAPACK_sggbak LAPACK_GLOBAL(sggbak, SGGBAK)
-#define LAPACK_dggbak LAPACK_GLOBAL(dggbak, DGGBAK)
-#define LAPACK_cggbak LAPACK_GLOBAL(cggbak, CGGBAK)
-#define LAPACK_zggbak LAPACK_GLOBAL(zggbak, ZGGBAK)
-#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz, SHGEQZ)
-#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz, DHGEQZ)
-#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz, CHGEQZ)
-#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz, ZHGEQZ)
-#define LAPACK_stgevc LAPACK_GLOBAL(stgevc, STGEVC)
-#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc, DTGEVC)
-#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc, CTGEVC)
-#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc, ZTGEVC)
-#define LAPACK_stgexc LAPACK_GLOBAL(stgexc, STGEXC)
-#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc, DTGEXC)
-#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc, CTGEXC)
-#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc, ZTGEXC)
-#define LAPACK_stgsen LAPACK_GLOBAL(stgsen, STGSEN)
-#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen, DTGSEN)
-#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen, CTGSEN)
-#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen, ZTGSEN)
-#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl, STGSYL)
-#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl, DTGSYL)
-#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl, CTGSYL)
-#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl, ZTGSYL)
-#define LAPACK_stgsna LAPACK_GLOBAL(stgsna, STGSNA)
-#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna, DTGSNA)
-#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna, CTGSNA)
-#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna, ZTGSNA)
-#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp, SGGSVP)
-#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp, DGGSVP)
-#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp, CGGSVP)
-#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp, ZGGSVP)
-#define LAPACK_stgsja LAPACK_GLOBAL(stgsja, STGSJA)
-#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja, DTGSJA)
-#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja, CTGSJA)
-#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja, ZTGSJA)
-#define LAPACK_sgels LAPACK_GLOBAL(sgels, SGELS)
-#define LAPACK_dgels LAPACK_GLOBAL(dgels, DGELS)
-#define LAPACK_cgels LAPACK_GLOBAL(cgels, CGELS)
-#define LAPACK_zgels LAPACK_GLOBAL(zgels, ZGELS)
-#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy, SGELSY)
-#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy, DGELSY)
-#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy, CGELSY)
-#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy, ZGELSY)
-#define LAPACK_sgelss LAPACK_GLOBAL(sgelss, SGELSS)
-#define LAPACK_dgelss LAPACK_GLOBAL(dgelss, DGELSS)
-#define LAPACK_cgelss LAPACK_GLOBAL(cgelss, CGELSS)
-#define LAPACK_zgelss LAPACK_GLOBAL(zgelss, ZGELSS)
-#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd, SGELSD)
-#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd, DGELSD)
-#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd, CGELSD)
-#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd, ZGELSD)
-#define LAPACK_sgglse LAPACK_GLOBAL(sgglse, SGGLSE)
-#define LAPACK_dgglse LAPACK_GLOBAL(dgglse, DGGLSE)
-#define LAPACK_cgglse LAPACK_GLOBAL(cgglse, CGGLSE)
-#define LAPACK_zgglse LAPACK_GLOBAL(zgglse, ZGGLSE)
-#define LAPACK_sggglm LAPACK_GLOBAL(sggglm, SGGGLM)
-#define LAPACK_dggglm LAPACK_GLOBAL(dggglm, DGGGLM)
-#define LAPACK_cggglm LAPACK_GLOBAL(cggglm, CGGGLM)
-#define LAPACK_zggglm LAPACK_GLOBAL(zggglm, ZGGGLM)
-#define LAPACK_ssyev LAPACK_GLOBAL(ssyev, SSYEV)
-#define LAPACK_dsyev LAPACK_GLOBAL(dsyev, DSYEV)
-#define LAPACK_cheev LAPACK_GLOBAL(cheev, CHEEV)
-#define LAPACK_zheev LAPACK_GLOBAL(zheev, ZHEEV)
-#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd, SSYEVD)
-#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd, DSYEVD)
-#define LAPACK_cheevd LAPACK_GLOBAL(cheevd, CHEEVD)
-#define LAPACK_zheevd LAPACK_GLOBAL(zheevd, ZHEEVD)
-#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx, SSYEVX)
-#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx, DSYEVX)
-#define LAPACK_cheevx LAPACK_GLOBAL(cheevx, CHEEVX)
-#define LAPACK_zheevx LAPACK_GLOBAL(zheevx, ZHEEVX)
-#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr, SSYEVR)
-#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr, DSYEVR)
-#define LAPACK_cheevr LAPACK_GLOBAL(cheevr, CHEEVR)
-#define LAPACK_zheevr LAPACK_GLOBAL(zheevr, ZHEEVR)
-#define LAPACK_sspev LAPACK_GLOBAL(sspev, SSPEV)
-#define LAPACK_dspev LAPACK_GLOBAL(dspev, DSPEV)
-#define LAPACK_chpev LAPACK_GLOBAL(chpev, CHPEV)
-#define LAPACK_zhpev LAPACK_GLOBAL(zhpev, ZHPEV)
-#define LAPACK_sspevd LAPACK_GLOBAL(sspevd, SSPEVD)
-#define LAPACK_dspevd LAPACK_GLOBAL(dspevd, DSPEVD)
-#define LAPACK_chpevd LAPACK_GLOBAL(chpevd, CHPEVD)
-#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd, ZHPEVD)
-#define LAPACK_sspevx LAPACK_GLOBAL(sspevx, SSPEVX)
-#define LAPACK_dspevx LAPACK_GLOBAL(dspevx, DSPEVX)
-#define LAPACK_chpevx LAPACK_GLOBAL(chpevx, CHPEVX)
-#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx, ZHPEVX)
-#define LAPACK_ssbev LAPACK_GLOBAL(ssbev, SSBEV)
-#define LAPACK_dsbev LAPACK_GLOBAL(dsbev, DSBEV)
-#define LAPACK_chbev LAPACK_GLOBAL(chbev, CHBEV)
-#define LAPACK_zhbev LAPACK_GLOBAL(zhbev, ZHBEV)
-#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd, SSBEVD)
-#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd, DSBEVD)
-#define LAPACK_chbevd LAPACK_GLOBAL(chbevd, CHBEVD)
-#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd, ZHBEVD)
-#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx, SSBEVX)
-#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx, DSBEVX)
-#define LAPACK_chbevx LAPACK_GLOBAL(chbevx, CHBEVX)
-#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx, ZHBEVX)
-#define LAPACK_sstev LAPACK_GLOBAL(sstev, SSTEV)
-#define LAPACK_dstev LAPACK_GLOBAL(dstev, DSTEV)
-#define LAPACK_sstevd LAPACK_GLOBAL(sstevd, SSTEVD)
-#define LAPACK_dstevd LAPACK_GLOBAL(dstevd, DSTEVD)
-#define LAPACK_sstevx LAPACK_GLOBAL(sstevx, SSTEVX)
-#define LAPACK_dstevx LAPACK_GLOBAL(dstevx, DSTEVX)
-#define LAPACK_sstevr LAPACK_GLOBAL(sstevr, SSTEVR)
-#define LAPACK_dstevr LAPACK_GLOBAL(dstevr, DSTEVR)
-#define LAPACK_sgees LAPACK_GLOBAL(sgees, SGEES)
-#define LAPACK_dgees LAPACK_GLOBAL(dgees, DGEES)
-#define LAPACK_cgees LAPACK_GLOBAL(cgees, CGEES)
-#define LAPACK_zgees LAPACK_GLOBAL(zgees, ZGEES)
-#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx, SGEESX)
-#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx, DGEESX)
-#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx, CGEESX)
-#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx, ZGEESX)
-#define LAPACK_sgeev LAPACK_GLOBAL(sgeev, SGEEV)
-#define LAPACK_dgeev LAPACK_GLOBAL(dgeev, DGEEV)
-#define LAPACK_cgeev LAPACK_GLOBAL(cgeev, CGEEV)
-#define LAPACK_zgeev LAPACK_GLOBAL(zgeev, ZGEEV)
-#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx, SGEEVX)
-#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx, DGEEVX)
-#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx, CGEEVX)
-#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx, ZGEEVX)
-#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd, SGESVD)
-#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd, DGESVD)
-#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd, CGESVD)
-#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd, ZGESVD)
-#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd, SGESDD)
-#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd, DGESDD)
-#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd, CGESDD)
-#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd, ZGESDD)
-#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv, DGEJSV)
-#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv, SGEJSV)
-#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj, DGESVJ)
-#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj, SGESVJ)
-#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd, SGGSVD)
-#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd, DGGSVD)
-#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd, CGGSVD)
-#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd, ZGGSVD)
-#define LAPACK_ssygv LAPACK_GLOBAL(ssygv, SSYGV)
-#define LAPACK_dsygv LAPACK_GLOBAL(dsygv, DSYGV)
-#define LAPACK_chegv LAPACK_GLOBAL(chegv, CHEGV)
-#define LAPACK_zhegv LAPACK_GLOBAL(zhegv, ZHEGV)
-#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd, SSYGVD)
-#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd, DSYGVD)
-#define LAPACK_chegvd LAPACK_GLOBAL(chegvd, CHEGVD)
-#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd, ZHEGVD)
-#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx, SSYGVX)
-#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx, DSYGVX)
-#define LAPACK_chegvx LAPACK_GLOBAL(chegvx, CHEGVX)
-#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx, ZHEGVX)
-#define LAPACK_sspgv LAPACK_GLOBAL(sspgv, SSPGV)
-#define LAPACK_dspgv LAPACK_GLOBAL(dspgv, DSPGV)
-#define LAPACK_chpgv LAPACK_GLOBAL(chpgv, CHPGV)
-#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv, ZHPGV)
-#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd, SSPGVD)
-#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd, DSPGVD)
-#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd, CHPGVD)
-#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd, ZHPGVD)
-#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx, SSPGVX)
-#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx, DSPGVX)
-#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx, CHPGVX)
-#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx, ZHPGVX)
-#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv, SSBGV)
-#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv, DSBGV)
-#define LAPACK_chbgv LAPACK_GLOBAL(chbgv, CHBGV)
-#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv, ZHBGV)
-#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd, SSBGVD)
-#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd, DSBGVD)
-#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd, CHBGVD)
-#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd, ZHBGVD)
-#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx, SSBGVX)
-#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx, DSBGVX)
-#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx, CHBGVX)
-#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx, ZHBGVX)
-#define LAPACK_sgges LAPACK_GLOBAL(sgges, SGGES)
-#define LAPACK_dgges LAPACK_GLOBAL(dgges, DGGES)
-#define LAPACK_cgges LAPACK_GLOBAL(cgges, CGGES)
-#define LAPACK_zgges LAPACK_GLOBAL(zgges, ZGGES)
-#define LAPACK_sggesx LAPACK_GLOBAL(sggesx, SGGESX)
-#define LAPACK_dggesx LAPACK_GLOBAL(dggesx, DGGESX)
-#define LAPACK_cggesx LAPACK_GLOBAL(cggesx, CGGESX)
-#define LAPACK_zggesx LAPACK_GLOBAL(zggesx, ZGGESX)
-#define LAPACK_sggev LAPACK_GLOBAL(sggev, SGGEV)
-#define LAPACK_dggev LAPACK_GLOBAL(dggev, DGGEV)
-#define LAPACK_cggev LAPACK_GLOBAL(cggev, CGGEV)
-#define LAPACK_zggev LAPACK_GLOBAL(zggev, ZGGEV)
-#define LAPACK_sggevx LAPACK_GLOBAL(sggevx, SGGEVX)
-#define LAPACK_dggevx LAPACK_GLOBAL(dggevx, DGGEVX)
-#define LAPACK_cggevx LAPACK_GLOBAL(cggevx, CGGEVX)
-#define LAPACK_zggevx LAPACK_GLOBAL(zggevx, ZGGEVX)
-#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk, DSFRK)
-#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk, SSFRK)
-#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk, ZHFRK)
-#define LAPACK_chfrk LAPACK_GLOBAL(chfrk, CHFRK)
-#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm, DTFSM)
-#define LAPACK_stfsm LAPACK_GLOBAL(stfsm, STFSM)
-#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm, ZTFSM)
-#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm, CTFSM)
-#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp, DTFTTP)
-#define LAPACK_stfttp LAPACK_GLOBAL(stfttp, STFTTP)
-#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp, ZTFTTP)
-#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp, CTFTTP)
-#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr, DTFTTR)
-#define LAPACK_stfttr LAPACK_GLOBAL(stfttr, STFTTR)
-#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr, ZTFTTR)
-#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr, CTFTTR)
-#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf, DTPTTF)
-#define LAPACK_stpttf LAPACK_GLOBAL(stpttf, STPTTF)
-#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf, ZTPTTF)
-#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf, CTPTTF)
-#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr, DTPTTR)
-#define LAPACK_stpttr LAPACK_GLOBAL(stpttr, STPTTR)
-#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr, ZTPTTR)
-#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr, CTPTTR)
-#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf, DTRTTF)
-#define LAPACK_strttf LAPACK_GLOBAL(strttf, STRTTF)
-#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf, ZTRTTF)
-#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf, CTRTTF)
-#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp, DTRTTP)
-#define LAPACK_strttp LAPACK_GLOBAL(strttp, STRTTP)
-#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp, ZTRTTP)
-#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp, CTRTTP)
-#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp, SGEQRFP)
-#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp, DGEQRFP)
-#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp, CGEQRFP)
-#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp, ZGEQRFP)
-#define LAPACK_clacgv LAPACK_GLOBAL(clacgv, CLACGV)
-#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv, ZLACGV)
-#define LAPACK_slarnv LAPACK_GLOBAL(slarnv, SLARNV)
-#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv, DLARNV)
-#define LAPACK_clarnv LAPACK_GLOBAL(clarnv, CLARNV)
-#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv, ZLARNV)
-#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2, SGEQR2)
-#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2, DGEQR2)
-#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2, CGEQR2)
-#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2, ZGEQR2)
-#define LAPACK_slacpy LAPACK_GLOBAL(slacpy, SLACPY)
-#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy, DLACPY)
-#define LAPACK_clacpy LAPACK_GLOBAL(clacpy, CLACPY)
-#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy, ZLACPY)
-#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2, SGETF2)
-#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2, DGETF2)
-#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2, CGETF2)
-#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2, ZGETF2)
-#define LAPACK_slaswp LAPACK_GLOBAL(slaswp, SLASWP)
-#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp, DLASWP)
-#define LAPACK_claswp LAPACK_GLOBAL(claswp, CLASWP)
-#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp, ZLASWP)
-#define LAPACK_slange LAPACK_GLOBAL(slange, SLANGE)
-#define LAPACK_dlange LAPACK_GLOBAL(dlange, DLANGE)
-#define LAPACK_clange LAPACK_GLOBAL(clange, CLANGE)
-#define LAPACK_zlange LAPACK_GLOBAL(zlange, ZLANGE)
-#define LAPACK_clanhe LAPACK_GLOBAL(clanhe, CLANHE)
-#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe, ZLANHE)
-#define LAPACK_slansy LAPACK_GLOBAL(slansy, SLANSY)
-#define LAPACK_dlansy LAPACK_GLOBAL(dlansy, DLANSY)
-#define LAPACK_clansy LAPACK_GLOBAL(clansy, CLANSY)
-#define LAPACK_zlansy LAPACK_GLOBAL(zlansy, ZLANSY)
-#define LAPACK_slantr LAPACK_GLOBAL(slantr, SLANTR)
-#define LAPACK_dlantr LAPACK_GLOBAL(dlantr, DLANTR)
-#define LAPACK_clantr LAPACK_GLOBAL(clantr, CLANTR)
-#define LAPACK_zlantr LAPACK_GLOBAL(zlantr, ZLANTR)
-#define LAPACK_slamch LAPACK_GLOBAL(slamch, SLAMCH)
-#define LAPACK_dlamch LAPACK_GLOBAL(dlamch, DLAMCH)
-#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2, SGELQ2)
-#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2, DGELQ2)
-#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2, CGELQ2)
-#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2, ZGELQ2)
-#define LAPACK_slarfb LAPACK_GLOBAL(slarfb, SLARFB)
-#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb, DLARFB)
-#define LAPACK_clarfb LAPACK_GLOBAL(clarfb, CLARFB)
-#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb, ZLARFB)
-#define LAPACK_slarfg LAPACK_GLOBAL(slarfg, SLARFG)
-#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg, DLARFG)
-#define LAPACK_clarfg LAPACK_GLOBAL(clarfg, CLARFG)
-#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg, ZLARFG)
-#define LAPACK_slarft LAPACK_GLOBAL(slarft, SLARFT)
-#define LAPACK_dlarft LAPACK_GLOBAL(dlarft, DLARFT)
-#define LAPACK_clarft LAPACK_GLOBAL(clarft, CLARFT)
-#define LAPACK_zlarft LAPACK_GLOBAL(zlarft, ZLARFT)
-#define LAPACK_slarfx LAPACK_GLOBAL(slarfx, SLARFX)
-#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx, DLARFX)
-#define LAPACK_clarfx LAPACK_GLOBAL(clarfx, CLARFX)
-#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx, ZLARFX)
-#define LAPACK_slatms LAPACK_GLOBAL(slatms, SLATMS)
-#define LAPACK_dlatms LAPACK_GLOBAL(dlatms, DLATMS)
-#define LAPACK_clatms LAPACK_GLOBAL(clatms, CLATMS)
-#define LAPACK_zlatms LAPACK_GLOBAL(zlatms, ZLATMS)
-#define LAPACK_slag2d LAPACK_GLOBAL(slag2d, SLAG2D)
-#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s, DLAG2S)
-#define LAPACK_clag2z LAPACK_GLOBAL(clag2z, CLAG2Z)
-#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c, ZLAG2C)
-#define LAPACK_slauum LAPACK_GLOBAL(slauum, SLAUUM)
-#define LAPACK_dlauum LAPACK_GLOBAL(dlauum, DLAUUM)
-#define LAPACK_clauum LAPACK_GLOBAL(clauum, CLAUUM)
-#define LAPACK_zlauum LAPACK_GLOBAL(zlauum, ZLAUUM)
-#define LAPACK_slagge LAPACK_GLOBAL(slagge, SLAGGE)
-#define LAPACK_dlagge LAPACK_GLOBAL(dlagge, DLAGGE)
-#define LAPACK_clagge LAPACK_GLOBAL(clagge, CLAGGE)
-#define LAPACK_zlagge LAPACK_GLOBAL(zlagge, ZLAGGE)
-#define LAPACK_slaset LAPACK_GLOBAL(slaset, SLASET)
-#define LAPACK_dlaset LAPACK_GLOBAL(dlaset, DLASET)
-#define LAPACK_claset LAPACK_GLOBAL(claset, CLASET)
-#define LAPACK_zlaset LAPACK_GLOBAL(zlaset, ZLASET)
-#define LAPACK_slasrt LAPACK_GLOBAL(slasrt, SLASRT)
-#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt, DLASRT)
-#define LAPACK_slagsy LAPACK_GLOBAL(slagsy, SLAGSY)
-#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy, DLAGSY)
-#define LAPACK_clagsy LAPACK_GLOBAL(clagsy, CLAGSY)
-#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy, ZLAGSY)
-#define LAPACK_claghe LAPACK_GLOBAL(claghe, CLAGHE)
-#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe, ZLAGHE)
-#define LAPACK_slapmr LAPACK_GLOBAL(slapmr, SLAPMR)
-#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr, DLAPMR)
-#define LAPACK_clapmr LAPACK_GLOBAL(clapmr, CLAPMR)
-#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr, ZLAPMR)
-#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2, SLAPY2)
-#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2, DLAPY2)
-#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3, SLAPY3)
-#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3, DLAPY3)
-#define LAPACK_slartgp LAPACK_GLOBAL(slartgp, SLARTGP)
-#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp, DLARTGP)
-#define LAPACK_slartgs LAPACK_GLOBAL(slartgs, SLARTGS)
-#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs, DLARTGS)
-// LAPACK 3.3.0
-#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd, CBBCSD)
-#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr, CHESWAPR)
-#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2, CHETRI2)
-#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x, CHETRI2X)
-#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2, CHETRS2)
-#define LAPACK_csyconv LAPACK_GLOBAL(csyconv, CSYCONV)
-#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr, CSYSWAPR)
-#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2, CSYTRI2)
-#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x, CSYTRI2X)
-#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2, CSYTRS2)
-#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb, CUNBDB)
-#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd, CUNCSD)
-#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd, DBBCSD)
-#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb, DORBDB)
-#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd, DORCSD)
-#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv, DSYCONV)
-#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr, DSYSWAPR)
-#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2, DSYTRI2)
-#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x, DSYTRI2X)
-#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2, DSYTRS2)
-#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd, SBBCSD)
-#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb, SORBDB)
-#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd, SORCSD)
-#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv, SSYCONV)
-#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr, SSYSWAPR)
-#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2, SSYTRI2)
-#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x, SSYTRI2X)
-#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2, SSYTRS2)
-#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd, ZBBCSD)
-#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr, ZHESWAPR)
-#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2, ZHETRI2)
-#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x, ZHETRI2X)
-#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2, ZHETRS2)
-#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv, ZSYCONV)
-#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr, ZSYSWAPR)
-#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2, ZSYTRI2)
-#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x, ZSYTRI2X)
-#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2, ZSYTRS2)
-#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb, ZUNBDB)
-#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd, ZUNCSD)
-// LAPACK 3.4.0
-#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt, SGEMQRT)
-#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt, DGEMQRT)
-#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt, CGEMQRT)
-#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt, ZGEMQRT)
-#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt, SGEQRT)
-#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt, DGEQRT)
-#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt, CGEQRT)
-#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt, ZGEQRT)
-#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2, SGEQRT2)
-#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2, DGEQRT2)
-#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2, CGEQRT2)
-#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2, ZGEQRT2)
-#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3, SGEQRT3)
-#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3, DGEQRT3)
-#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3, CGEQRT3)
-#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3, ZGEQRT3)
-#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt, STPMQRT)
-#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt, DTPMQRT)
-#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt, CTPMQRT)
-#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt, ZTPMQRT)
-#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt, DTPQRT)
-#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt, CTPQRT)
-#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt, ZTPQRT)
-#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2, STPQRT2)
-#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2, DTPQRT2)
-#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2, CTPQRT2)
-#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2, ZTPQRT2)
-#define LAPACK_stprfb LAPACK_GLOBAL(stprfb, STPRFB)
-#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb, DTPRFB)
-#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb, CTPRFB)
-#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb, ZTPRFB)
-// LAPACK 3.X.X
-#define LAPACK_csyr LAPACK_GLOBAL(csyr, CSYR)
-#define LAPACK_zsyr LAPACK_GLOBAL(zsyr, ZSYR)
-
-void LAPACK_sgetrf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
-void LAPACK_dgetrf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
-void LAPACK_cgetrf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_int* info);
-void LAPACK_zgetrf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_int* info);
-void LAPACK_sgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, float* ab, lapack_int* ldab,
-                   lapack_int* ipiv, lapack_int* info);
-void LAPACK_dgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, double* ab, lapack_int* ldab,
-                   lapack_int* ipiv, lapack_int* info);
-void LAPACK_cgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_complex_float* ab,
-                   lapack_int* ldab, lapack_int* ipiv, lapack_int* info);
-void LAPACK_zgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_complex_double* ab,
-                   lapack_int* ldab, lapack_int* ipiv, lapack_int* info);
-void LAPACK_sgttrf(lapack_int* n, float* dl, float* d, float* du, float* du2, lapack_int* ipiv, lapack_int* info);
-void LAPACK_dgttrf(lapack_int* n, double* dl, double* d, double* du, double* du2, lapack_int* ipiv, lapack_int* info);
-void LAPACK_cgttrf(lapack_int* n, lapack_complex_float* dl, lapack_complex_float* d, lapack_complex_float* du,
-                   lapack_complex_float* du2, lapack_int* ipiv, lapack_int* info);
-void LAPACK_zgttrf(lapack_int* n, lapack_complex_double* dl, lapack_complex_double* d, lapack_complex_double* du,
-                   lapack_complex_double* du2, lapack_int* ipiv, lapack_int* info);
-void LAPACK_spotrf(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_dpotrf(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_cpotrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_zpotrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_dpstrf(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* piv, lapack_int* rank,
-                   double* tol, double* work, lapack_int* info);
-void LAPACK_spstrf(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* piv, lapack_int* rank, float* tol,
-                   float* work, lapack_int* info);
-void LAPACK_zpstrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* piv,
-                   lapack_int* rank, double* tol, double* work, lapack_int* info);
-void LAPACK_cpstrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* piv,
-                   lapack_int* rank, float* tol, float* work, lapack_int* info);
-void LAPACK_dpftrf(char* transr, char* uplo, lapack_int* n, double* a, lapack_int* info);
-void LAPACK_spftrf(char* transr, char* uplo, lapack_int* n, float* a, lapack_int* info);
-void LAPACK_zpftrf(char* transr, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* info);
-void LAPACK_cpftrf(char* transr, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* info);
-void LAPACK_spptrf(char* uplo, lapack_int* n, float* ap, lapack_int* info);
-void LAPACK_dpptrf(char* uplo, lapack_int* n, double* ap, lapack_int* info);
-void LAPACK_cpptrf(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* info);
-void LAPACK_zpptrf(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* info);
-void LAPACK_spbtrf(char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, lapack_int* info);
-void LAPACK_dpbtrf(char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, lapack_int* info);
-void LAPACK_cpbtrf(char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
-                   lapack_int* info);
-void LAPACK_zpbtrf(char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
-                   lapack_int* info);
-void LAPACK_spttrf(lapack_int* n, float* d, float* e, lapack_int* info);
-void LAPACK_dpttrf(lapack_int* n, double* d, double* e, lapack_int* info);
-void LAPACK_cpttrf(lapack_int* n, float* d, lapack_complex_float* e, lapack_int* info);
-void LAPACK_zpttrf(lapack_int* n, double* d, lapack_complex_double* e, lapack_int* info);
-void LAPACK_ssytrf(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* ipiv, float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_dsytrf(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* ipiv, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_csytrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zsytrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_chetrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zhetrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_ssptrf(char* uplo, lapack_int* n, float* ap, lapack_int* ipiv, lapack_int* info);
-void LAPACK_dsptrf(char* uplo, lapack_int* n, double* ap, lapack_int* ipiv, lapack_int* info);
-void LAPACK_csptrf(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* ipiv, lapack_int* info);
-void LAPACK_zsptrf(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* ipiv, lapack_int* info);
-void LAPACK_chptrf(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* ipiv, lapack_int* info);
-void LAPACK_zhptrf(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* ipiv, lapack_int* info);
-void LAPACK_sgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
-                   const lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
-                   const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_cgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_sgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const float* ab,
-                   lapack_int* ldab, const lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const double* ab,
-                   lapack_int* ldab, const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_cgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                   const lapack_complex_float* ab, lapack_int* ldab, const lapack_int* ipiv, lapack_complex_float* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_zgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                   const lapack_complex_double* ab, lapack_int* ldab, const lapack_int* ipiv, lapack_complex_double* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_sgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const float* dl, const float* d, const float* du,
-                   const float* du2, const lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const double* dl, const double* d, const double* du,
-                   const double* du2, const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_cgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* dl,
-                   const lapack_complex_float* d, const lapack_complex_float* du, const lapack_complex_float* du2,
-                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* dl,
-                   const lapack_complex_double* d, const lapack_complex_double* du, const lapack_complex_double* du2,
-                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_spotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, float* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_dpotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_cpotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zpotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dpftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, double* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_spftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, float* b, lapack_int* ldb,
-                   lapack_int* info);
-void LAPACK_zpftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_cpftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_spptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, float* b, lapack_int* ldb,
-                   lapack_int* info);
-void LAPACK_dpptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, double* b, lapack_int* ldb,
-                   lapack_int* info);
-void LAPACK_cpptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap, lapack_complex_float* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_zpptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_spbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const float* ab, lapack_int* ldab,
-                   float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dpbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const double* ab, lapack_int* ldab,
-                   double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_cpbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_float* ab,
-                   lapack_int* ldab, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zpbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_double* ab,
-                   lapack_int* ldab, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_spttrs(lapack_int* n, lapack_int* nrhs, const float* d, const float* e, float* b, lapack_int* ldb,
-                   lapack_int* info);
-void LAPACK_dpttrs(lapack_int* n, lapack_int* nrhs, const double* d, const double* e, double* b, lapack_int* ldb,
-                   lapack_int* info);
-void LAPACK_cpttrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* d, const lapack_complex_float* e,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zpttrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* d, const lapack_complex_double* e,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_ssytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const lapack_int* ipiv,
-                   float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dsytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
-                   const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_csytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zsytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_chetrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zhetrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_ssptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, const lapack_int* ipiv, float* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_dsptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, const lapack_int* ipiv, double* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_csptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap, const lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zsptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap, const lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_chptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap, const lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zhptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap, const lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_strtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dtrtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_ctrtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_ztrtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_stptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* ap, float* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_dtptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* ap, double* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_ctptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_ztptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs,
-                   const lapack_complex_double* ap, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_stbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const float* ab, lapack_int* ldab, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dtbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const double* ab, lapack_int* ldab, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_ctbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* b, lapack_int* ldb,
-                   lapack_int* info);
-void LAPACK_ztbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int* info);
-void LAPACK_sgecon(char* norm, lapack_int* n, const float* a, lapack_int* lda, float* anorm, float* rcond, float* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_dgecon(char* norm, lapack_int* n, const double* a, lapack_int* lda, double* anorm, double* rcond,
-                   double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cgecon(char* norm, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* anorm,
-                   float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zgecon(char* norm, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* anorm,
-                   double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* ab, lapack_int* ldab,
-                   const lapack_int* ipiv, float* anorm, float* rcond, float* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* ab, lapack_int* ldab,
-                   const lapack_int* ipiv, double* anorm, double* rcond, double* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_cgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_float* ab,
-                   lapack_int* ldab, const lapack_int* ipiv, float* anorm, float* rcond, lapack_complex_float* work,
-                   float* rwork, lapack_int* info);
-void LAPACK_zgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_double* ab,
-                   lapack_int* ldab, const lapack_int* ipiv, double* anorm, double* rcond, lapack_complex_double* work,
-                   double* rwork, lapack_int* info);
-void LAPACK_sgtcon(char* norm, lapack_int* n, const float* dl, const float* d, const float* du, const float* du2,
-                   const lapack_int* ipiv, float* anorm, float* rcond, float* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dgtcon(char* norm, lapack_int* n, const double* dl, const double* d, const double* du, const double* du2,
-                   const lapack_int* ipiv, double* anorm, double* rcond, double* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_cgtcon(char* norm, lapack_int* n, const lapack_complex_float* dl, const lapack_complex_float* d,
-                   const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
-                   float* anorm, float* rcond, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zgtcon(char* norm, lapack_int* n, const lapack_complex_double* dl, const lapack_complex_double* d,
-                   const lapack_complex_double* du, const lapack_complex_double* du2, const lapack_int* ipiv,
-                   double* anorm, double* rcond, lapack_complex_double* work, lapack_int* info);
-void LAPACK_spocon(char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* anorm, float* rcond, float* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_dpocon(char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* anorm, double* rcond,
-                   double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cpocon(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* anorm,
-                   float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zpocon(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* anorm,
-                   double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sppcon(char* uplo, lapack_int* n, const float* ap, float* anorm, float* rcond, float* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_dppcon(char* uplo, lapack_int* n, const double* ap, double* anorm, double* rcond, double* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_cppcon(char* uplo, lapack_int* n, const lapack_complex_float* ap, float* anorm, float* rcond,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zppcon(char* uplo, lapack_int* n, const lapack_complex_double* ap, double* anorm, double* rcond,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_spbcon(char* uplo, lapack_int* n, lapack_int* kd, const float* ab, lapack_int* ldab, float* anorm,
-                   float* rcond, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dpbcon(char* uplo, lapack_int* n, lapack_int* kd, const double* ab, lapack_int* ldab, double* anorm,
-                   double* rcond, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cpbcon(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_float* ab, lapack_int* ldab,
-                   float* anorm, float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zpbcon(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_double* ab, lapack_int* ldab,
-                   double* anorm, double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sptcon(lapack_int* n, const float* d, const float* e, float* anorm, float* rcond, float* work,
-                   lapack_int* info);
-void LAPACK_dptcon(lapack_int* n, const double* d, const double* e, double* anorm, double* rcond, double* work,
-                   lapack_int* info);
-void LAPACK_cptcon(lapack_int* n, const float* d, const lapack_complex_float* e, float* anorm, float* rcond,
-                   float* work, lapack_int* info);
-void LAPACK_zptcon(lapack_int* n, const double* d, const lapack_complex_double* e, double* anorm, double* rcond,
-                   double* work, lapack_int* info);
-void LAPACK_ssycon(char* uplo, lapack_int* n, const float* a, lapack_int* lda, const lapack_int* ipiv, float* anorm,
-                   float* rcond, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dsycon(char* uplo, lapack_int* n, const double* a, lapack_int* lda, const lapack_int* ipiv, double* anorm,
-                   double* rcond, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_csycon(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                   float* anorm, float* rcond, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zsycon(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                   double* anorm, double* rcond, lapack_complex_double* work, lapack_int* info);
-void LAPACK_checon(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                   float* anorm, float* rcond, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zhecon(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                   double* anorm, double* rcond, lapack_complex_double* work, lapack_int* info);
-void LAPACK_sspcon(char* uplo, lapack_int* n, const float* ap, const lapack_int* ipiv, float* anorm, float* rcond,
-                   float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dspcon(char* uplo, lapack_int* n, const double* ap, const lapack_int* ipiv, double* anorm, double* rcond,
-                   double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cspcon(char* uplo, lapack_int* n, const lapack_complex_float* ap, const lapack_int* ipiv, float* anorm,
-                   float* rcond, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zspcon(char* uplo, lapack_int* n, const lapack_complex_double* ap, const lapack_int* ipiv, double* anorm,
-                   double* rcond, lapack_complex_double* work, lapack_int* info);
-void LAPACK_chpcon(char* uplo, lapack_int* n, const lapack_complex_float* ap, const lapack_int* ipiv, float* anorm,
-                   float* rcond, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zhpcon(char* uplo, lapack_int* n, const lapack_complex_double* ap, const lapack_int* ipiv, double* anorm,
-                   double* rcond, lapack_complex_double* work, lapack_int* info);
-void LAPACK_strcon(char* norm, char* uplo, char* diag, lapack_int* n, const float* a, lapack_int* lda, float* rcond,
-                   float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dtrcon(char* norm, char* uplo, char* diag, lapack_int* n, const double* a, lapack_int* lda, double* rcond,
-                   double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_ctrcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
-                   float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_ztrcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                   double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_stpcon(char* norm, char* uplo, char* diag, lapack_int* n, const float* ap, float* rcond, float* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_dtpcon(char* norm, char* uplo, char* diag, lapack_int* n, const double* ap, double* rcond, double* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_ctpcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_float* ap, float* rcond,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_ztpcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_double* ap, double* rcond,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_stbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const float* ab, lapack_int* ldab,
-                   float* rcond, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dtbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const double* ab,
-                   lapack_int* ldab, double* rcond, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_ctbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const lapack_complex_float* ab,
-                   lapack_int* ldab, float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_ztbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const lapack_complex_double* ab,
-                   lapack_int* ldab, double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const float* af,
-                   lapack_int* ldaf, const lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, const double* af,
-                   lapack_int* ldaf, const lapack_int* ipiv, const double* b, lapack_int* ldb, double* x,
-                   lapack_int* ldx, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
-                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_dgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
-                    const double* af, lapack_int* ldaf, const lapack_int* ipiv, const double* r, const double* c,
-                    const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* berr,
-                    lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams,
-                    double* params, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_sgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
-                    const float* af, lapack_int* ldaf, const lapack_int* ipiv, const float* r, const float* c,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* berr,
-                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
-                    float* params, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_zgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
-                    const double* r, const double* c, const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params,
-                    lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_cgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
-                    const float* r, const float* c, const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds,
-                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params,
-                    lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_sgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const float* ab,
-                   lapack_int* ldab, const float* afb, lapack_int* ldafb, const lapack_int* ipiv, const float* b,
-                   lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const double* ab,
-                   lapack_int* ldab, const double* afb, lapack_int* ldafb, const lapack_int* ipiv, const double* b,
-                   lapack_int* ldb, double* x, lapack_int* ldx, double* ferr, double* berr, double* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_cgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                   const lapack_complex_float* ab, lapack_int* ldab, const lapack_complex_float* afb, lapack_int* ldafb,
-                   const lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
-                   lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork,
-                   lapack_int* info);
-void LAPACK_zgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                   const lapack_complex_double* ab, lapack_int* ldab, const lapack_complex_double* afb,
-                   lapack_int* ldafb, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
-                   double* rwork, lapack_int* info);
-void LAPACK_dgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                    const double* ab, lapack_int* ldab, const double* afb, lapack_int* ldafb, const lapack_int* ipiv,
-                    const double* r, const double* c, const double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                    double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_sgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                    const float* ab, lapack_int* ldab, const float* afb, lapack_int* ldafb, const lapack_int* ipiv,
-                    const float* r, const float* c, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_zgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab, const lapack_complex_double* afb,
-                    lapack_int* ldafb, const lapack_int* ipiv, const double* r, const double* c,
-                    const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_cgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab, const lapack_complex_float* afb,
-                    lapack_int* ldafb, const lapack_int* ipiv, const float* r, const float* c,
-                    const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_sgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const float* dl, const float* d, const float* du,
-                   const float* dlf, const float* df, const float* duf, const float* du2, const lapack_int* ipiv,
-                   const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_dgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const double* dl, const double* d, const double* du,
-                   const double* dlf, const double* df, const double* duf, const double* du2, const lapack_int* ipiv,
-                   const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr, double* berr,
-                   double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* dl,
-                   const lapack_complex_float* d, const lapack_complex_float* du, const lapack_complex_float* dlf,
-                   const lapack_complex_float* df, const lapack_complex_float* duf, const lapack_complex_float* du2,
-                   const lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
-                   lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork,
-                   lapack_int* info);
-void LAPACK_zgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* dl,
-                   const lapack_complex_double* d, const lapack_complex_double* du, const lapack_complex_double* dlf,
-                   const lapack_complex_double* df, const lapack_complex_double* duf, const lapack_complex_double* du2,
-                   const lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x,
-                   lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work, double* rwork,
-                   lapack_int* info);
-void LAPACK_sporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const float* af,
-                   lapack_int* ldaf, const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, const double* af,
-                   lapack_int* ldaf, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
-                   float* rwork, lapack_int* info);
-void LAPACK_zporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
-                   double* rwork, lapack_int* info);
-void LAPACK_dporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
-                    const double* af, lapack_int* ldaf, const double* s, const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
-                    double* err_bnds_comp, lapack_int* nparams, double* params, double* work, lapack_int* iwork,
-                    lapack_int* info);
-void LAPACK_sporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
-                    const float* af, lapack_int* ldaf, const float* s, const float* b, lapack_int* ldb, float* x,
-                    lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params, float* work, lapack_int* iwork,
-                    lapack_int* info);
-void LAPACK_zporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const double* s,
-                    const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_cporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const float* s,
-                    const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_spprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, const float* afp, const float* b,
-                   lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dpprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, const double* afp, const double* b,
-                   lapack_int* ldb, double* x, lapack_int* ldx, double* ferr, double* berr, double* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_cpprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
-                   const lapack_complex_float* afp, const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
-                   float* rwork, lapack_int* info);
-void LAPACK_zpprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
-                   const lapack_complex_double* afp, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
-                   double* rwork, lapack_int* info);
-void LAPACK_spbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const float* ab, lapack_int* ldab,
-                   const float* afb, lapack_int* ldafb, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dpbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const double* ab, lapack_int* ldab,
-                   const double* afb, lapack_int* ldafb, const double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                   double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cpbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_float* ab,
-                   lapack_int* ldab, const lapack_complex_float* afb, lapack_int* ldafb, const lapack_complex_float* b,
-                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zpbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_double* ab,
-                   lapack_int* ldab, const lapack_complex_double* afb, lapack_int* ldafb,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sptrfs(lapack_int* n, lapack_int* nrhs, const float* d, const float* e, const float* df, const float* ef,
-                   const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work,
-                   lapack_int* info);
-void LAPACK_dptrfs(lapack_int* n, lapack_int* nrhs, const double* d, const double* e, const double* df,
-                   const double* ef, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                   double* berr, double* work, lapack_int* info);
-void LAPACK_cptrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* d, const lapack_complex_float* e,
-                   const float* df, const lapack_complex_float* ef, const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
-                   float* rwork, lapack_int* info);
-void LAPACK_zptrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* d, const lapack_complex_double* e,
-                   const double* df, const lapack_complex_double* ef, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
-                   double* rwork, lapack_int* info);
-void LAPACK_ssyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const float* af,
-                   lapack_int* ldaf, const lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dsyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, const double* af,
-                   lapack_int* ldaf, const lapack_int* ipiv, const double* b, lapack_int* ldb, double* x,
-                   lapack_int* ldx, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_csyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
-                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zsyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_dsyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
-                    const double* af, lapack_int* ldaf, const lapack_int* ipiv, const double* s, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params, double* work,
-                    lapack_int* iwork, lapack_int* info);
-void LAPACK_ssyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
-                    const float* af, lapack_int* ldaf, const lapack_int* ipiv, const float* s, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds,
-                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params, float* work,
-                    lapack_int* iwork, lapack_int* info);
-void LAPACK_zsyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
-                    const double* s, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x,
-                    lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
-                    double* err_bnds_comp, lapack_int* nparams, double* params, lapack_complex_double* work,
-                    double* rwork, lapack_int* info);
-void LAPACK_csyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
-                    const float* s, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
-                    lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork,
-                    lapack_int* info);
-void LAPACK_cherfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
-                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zherfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_zherfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
-                    const double* s, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x,
-                    lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
-                    double* err_bnds_comp, lapack_int* nparams, double* params, lapack_complex_double* work,
-                    double* rwork, lapack_int* info);
-void LAPACK_cherfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
-                    const float* s, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
-                    lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork,
-                    lapack_int* info);
-void LAPACK_ssprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, const float* afp,
-                   const lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dsprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, const double* afp,
-                   const lapack_int* ipiv, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_csprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
-                   const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
-                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zsprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
-                   const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
-                   lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_chprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
-                   const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
-                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zhprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
-                   const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
-                   lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_strrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* a,
-                   lapack_int* lda, const float* b, lapack_int* ldb, const float* x, lapack_int* ldx, float* ferr,
-                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dtrrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* a,
-                   lapack_int* lda, const double* b, lapack_int* ldb, const double* x, lapack_int* ldx, double* ferr,
-                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_ctrrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb, const lapack_complex_float* x,
-                   lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork,
-                   lapack_int* info);
-void LAPACK_ztrrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb, const lapack_complex_double* x,
-                   lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work, double* rwork,
-                   lapack_int* info);
-void LAPACK_stprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* ap,
-                   const float* b, lapack_int* ldb, const float* x, lapack_int* ldx, float* ferr, float* berr,
-                   float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dtprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* ap,
-                   const double* b, lapack_int* ldb, const double* x, lapack_int* ldx, double* ferr, double* berr,
-                   double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_ctprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
-                   const lapack_complex_float* b, lapack_int* ldb, const lapack_complex_float* x, lapack_int* ldx,
-                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_ztprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs,
-                   const lapack_complex_double* ap, const lapack_complex_double* b, lapack_int* ldb,
-                   const lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_stbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const float* ab, lapack_int* ldab, const float* b, lapack_int* ldb, const float* x, lapack_int* ldx,
-                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dtbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const double* ab, lapack_int* ldab, const double* b, lapack_int* ldb, const double* x,
-                   lapack_int* ldx, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_ctbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const lapack_complex_float* ab, lapack_int* ldab, const lapack_complex_float* b, lapack_int* ldb,
-                   const lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
-                   float* rwork, lapack_int* info);
-void LAPACK_ztbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   const lapack_complex_double* ab, lapack_int* ldab, const lapack_complex_double* b, lapack_int* ldb,
-                   const lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sgetri(lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dgetri(lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cgetri(lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgetri(lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_spotri(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_dpotri(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_cpotri(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_zpotri(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_dpftri(char* transr, char* uplo, lapack_int* n, double* a, lapack_int* info);
-void LAPACK_spftri(char* transr, char* uplo, lapack_int* n, float* a, lapack_int* info);
-void LAPACK_zpftri(char* transr, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* info);
-void LAPACK_cpftri(char* transr, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* info);
-void LAPACK_spptri(char* uplo, lapack_int* n, float* ap, lapack_int* info);
-void LAPACK_dpptri(char* uplo, lapack_int* n, double* ap, lapack_int* info);
-void LAPACK_cpptri(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* info);
-void LAPACK_zpptri(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* info);
-void LAPACK_ssytri(char* uplo, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv, float* work,
-                   lapack_int* info);
-void LAPACK_dsytri(char* uplo, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv, double* work,
-                   lapack_int* info);
-void LAPACK_csytri(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_zsytri(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                   lapack_complex_double* work, lapack_int* info);
-void LAPACK_chetri(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_zhetri(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                   lapack_complex_double* work, lapack_int* info);
-void LAPACK_ssptri(char* uplo, lapack_int* n, float* ap, const lapack_int* ipiv, float* work, lapack_int* info);
-void LAPACK_dsptri(char* uplo, lapack_int* n, double* ap, const lapack_int* ipiv, double* work, lapack_int* info);
-void LAPACK_csptri(char* uplo, lapack_int* n, lapack_complex_float* ap, const lapack_int* ipiv,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_zsptri(char* uplo, lapack_int* n, lapack_complex_double* ap, const lapack_int* ipiv,
-                   lapack_complex_double* work, lapack_int* info);
-void LAPACK_chptri(char* uplo, lapack_int* n, lapack_complex_float* ap, const lapack_int* ipiv,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_zhptri(char* uplo, lapack_int* n, lapack_complex_double* ap, const lapack_int* ipiv,
-                   lapack_complex_double* work, lapack_int* info);
-void LAPACK_strtri(char* uplo, char* diag, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_dtrtri(char* uplo, char* diag, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_ctrtri(char* uplo, char* diag, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_ztrtri(char* uplo, char* diag, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_dtftri(char* transr, char* uplo, char* diag, lapack_int* n, double* a, lapack_int* info);
-void LAPACK_stftri(char* transr, char* uplo, char* diag, lapack_int* n, float* a, lapack_int* info);
-void LAPACK_ztftri(char* transr, char* uplo, char* diag, lapack_int* n, lapack_complex_double* a, lapack_int* info);
-void LAPACK_ctftri(char* transr, char* uplo, char* diag, lapack_int* n, lapack_complex_float* a, lapack_int* info);
-void LAPACK_stptri(char* uplo, char* diag, lapack_int* n, float* ap, lapack_int* info);
-void LAPACK_dtptri(char* uplo, char* diag, lapack_int* n, double* ap, lapack_int* info);
-void LAPACK_ctptri(char* uplo, char* diag, lapack_int* n, lapack_complex_float* ap, lapack_int* info);
-void LAPACK_ztptri(char* uplo, char* diag, lapack_int* n, lapack_complex_double* ap, lapack_int* info);
-void LAPACK_sgeequ(lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* r, float* c, float* rowcnd,
-                   float* colcnd, float* amax, lapack_int* info);
-void LAPACK_dgeequ(lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* r, double* c, double* rowcnd,
-                   double* colcnd, double* amax, lapack_int* info);
-void LAPACK_cgeequ(lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* r, float* c,
-                   float* rowcnd, float* colcnd, float* amax, lapack_int* info);
-void LAPACK_zgeequ(lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* r, double* c,
-                   double* rowcnd, double* colcnd, double* amax, lapack_int* info);
-void LAPACK_dgeequb(lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* r, double* c,
-                    double* rowcnd, double* colcnd, double* amax, lapack_int* info);
-void LAPACK_sgeequb(lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* r, float* c, float* rowcnd,
-                    float* colcnd, float* amax, lapack_int* info);
-void LAPACK_zgeequb(lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* r, double* c,
-                    double* rowcnd, double* colcnd, double* amax, lapack_int* info);
-void LAPACK_cgeequb(lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* r, float* c,
-                    float* rowcnd, float* colcnd, float* amax, lapack_int* info);
-void LAPACK_sgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* ab, lapack_int* ldab,
-                   float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
-void LAPACK_dgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* ab, lapack_int* ldab,
-                   double* r, double* c, double* rowcnd, double* colcnd, double* amax, lapack_int* info);
-void LAPACK_cgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_float* ab,
-                   lapack_int* ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
-void LAPACK_zgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_double* ab,
-                   lapack_int* ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax,
-                   lapack_int* info);
-void LAPACK_dgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* ab, lapack_int* ldab,
-                    double* r, double* c, double* rowcnd, double* colcnd, double* amax, lapack_int* info);
-void LAPACK_sgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* ab, lapack_int* ldab,
-                    float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
-void LAPACK_zgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_double* ab,
-                    lapack_int* ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax,
-                    lapack_int* info);
-void LAPACK_cgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_float* ab,
-                    lapack_int* ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
-void LAPACK_spoequ(lapack_int* n, const float* a, lapack_int* lda, float* s, float* scond, float* amax,
-                   lapack_int* info);
-void LAPACK_dpoequ(lapack_int* n, const double* a, lapack_int* lda, double* s, double* scond, double* amax,
-                   lapack_int* info);
-void LAPACK_cpoequ(lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond, float* amax,
-                   lapack_int* info);
-void LAPACK_zpoequ(lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s, double* scond,
-                   double* amax, lapack_int* info);
-void LAPACK_dpoequb(lapack_int* n, const double* a, lapack_int* lda, double* s, double* scond, double* amax,
-                    lapack_int* info);
-void LAPACK_spoequb(lapack_int* n, const float* a, lapack_int* lda, float* s, float* scond, float* amax,
-                    lapack_int* info);
-void LAPACK_zpoequb(lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s, double* scond,
-                    double* amax, lapack_int* info);
-void LAPACK_cpoequb(lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond, float* amax,
-                    lapack_int* info);
-void LAPACK_sppequ(char* uplo, lapack_int* n, const float* ap, float* s, float* scond, float* amax, lapack_int* info);
-void LAPACK_dppequ(char* uplo, lapack_int* n, const double* ap, double* s, double* scond, double* amax,
-                   lapack_int* info);
-void LAPACK_cppequ(char* uplo, lapack_int* n, const lapack_complex_float* ap, float* s, float* scond, float* amax,
-                   lapack_int* info);
-void LAPACK_zppequ(char* uplo, lapack_int* n, const lapack_complex_double* ap, double* s, double* scond, double* amax,
-                   lapack_int* info);
-void LAPACK_spbequ(char* uplo, lapack_int* n, lapack_int* kd, const float* ab, lapack_int* ldab, float* s, float* scond,
-                   float* amax, lapack_int* info);
-void LAPACK_dpbequ(char* uplo, lapack_int* n, lapack_int* kd, const double* ab, lapack_int* ldab, double* s,
-                   double* scond, double* amax, lapack_int* info);
-void LAPACK_cpbequ(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_float* ab, lapack_int* ldab,
-                   float* s, float* scond, float* amax, lapack_int* info);
-void LAPACK_zpbequ(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_double* ab, lapack_int* ldab,
-                   double* s, double* scond, double* amax, lapack_int* info);
-void LAPACK_dsyequb(char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* s, double* scond, double* amax,
-                    double* work, lapack_int* info);
-void LAPACK_ssyequb(char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* s, float* scond, float* amax,
-                    float* work, lapack_int* info);
-void LAPACK_zsyequb(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s,
-                    double* scond, double* amax, lapack_complex_double* work, lapack_int* info);
-void LAPACK_csyequb(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond,
-                    float* amax, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zheequb(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s,
-                    double* scond, double* amax, lapack_complex_double* work, lapack_int* info);
-void LAPACK_cheequb(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond,
-                    float* amax, lapack_complex_float* work, lapack_int* info);
-void LAPACK_sgesv(lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, lapack_int* ipiv, float* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_dgesv(lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, lapack_int* ipiv, double* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_cgesv(lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zgesv(lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dsgesv(lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, lapack_int* ipiv, double* b,
-                   lapack_int* ldb, double* x, lapack_int* ldx, double* work, float* swork, lapack_int* iter,
-                   lapack_int* info);
-void LAPACK_zcgesv(lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   lapack_complex_double* work, lapack_complex_float* swork, double* rwork, lapack_int* iter,
-                   lapack_int* info);
-void LAPACK_sgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
-                   lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int* ldb,
-                   float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
-                   lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int* ldb,
-                   double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr, double* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_cgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_dgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
-                    lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* rcond, double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params, double* work,
-                    lapack_int* iwork, lapack_int* info);
-void LAPACK_sgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
-                    lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int* ldb,
-                    float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params, float* work,
-                    lapack_int* iwork, lapack_int* info);
-void LAPACK_zgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c,
-                    lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_cgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c,
-                    lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_sgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, float* ab, lapack_int* ldab,
-                  lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, double* ab, lapack_int* ldab,
-                  lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_cgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
-                  lapack_int* ldab, lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, lapack_complex_double* ab,
-                  lapack_int* ldab, lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_sgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, float* ab,
-                   lapack_int* ldab, float* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
-                   float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, double* ab,
-                   lapack_int* ldab, double* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed, double* r,
-                   double* c, double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* ferr,
-                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                   lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* afb, lapack_int* ldafb,
-                   lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                   lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* afb, lapack_int* ldafb,
-                   lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_dgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                    double* ab, lapack_int* ldab, double* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed,
-                    double* r, double* c, double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_sgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, float* ab,
-                    lapack_int* ldab, float* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
-                    float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
-                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
-                    float* params, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_zgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                    lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* afb, lapack_int* ldafb,
-                    lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond, double* rpvgrw, double* berr,
-                    lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams,
-                    double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_cgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
-                    lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* afb, lapack_int* ldafb,
-                    lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
-                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
-                    float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_sgtsv(lapack_int* n, lapack_int* nrhs, float* dl, float* d, float* du, float* b, lapack_int* ldb,
-                  lapack_int* info);
-void LAPACK_dgtsv(lapack_int* n, lapack_int* nrhs, double* dl, double* d, double* du, double* b, lapack_int* ldb,
-                  lapack_int* info);
-void LAPACK_cgtsv(lapack_int* n, lapack_int* nrhs, lapack_complex_float* dl, lapack_complex_float* d,
-                  lapack_complex_float* du, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zgtsv(lapack_int* n, lapack_int* nrhs, lapack_complex_double* dl, lapack_complex_double* d,
-                  lapack_complex_double* du, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_sgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const float* dl, const float* d,
-                   const float* du, float* dlf, float* df, float* duf, float* du2, lapack_int* ipiv, const float* b,
-                   lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr, float* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_dgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const double* dl, const double* d,
-                   const double* du, double* dlf, double* df, double* duf, double* du2, lapack_int* ipiv,
-                   const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* ferr,
-                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* dl,
-                   const lapack_complex_float* d, const lapack_complex_float* du, lapack_complex_float* dlf,
-                   lapack_complex_float* df, lapack_complex_float* duf, lapack_complex_float* du2, lapack_int* ipiv,
-                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                   float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* dl,
-                   const lapack_complex_double* d, const lapack_complex_double* du, lapack_complex_double* dlf,
-                   lapack_complex_double* df, lapack_complex_double* duf, lapack_complex_double* du2, lapack_int* ipiv,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork,
-                   lapack_int* info);
-void LAPACK_sposv(char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                  lapack_int* info);
-void LAPACK_dposv(char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                  lapack_int* info);
-void LAPACK_cposv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zposv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dsposv(char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* x, lapack_int* ldx, double* work, float* swork, lapack_int* iter, lapack_int* info);
-void LAPACK_zcposv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   lapack_complex_double* work, lapack_complex_float* swork, double* rwork, lapack_int* iter,
-                   lapack_int* info);
-void LAPACK_sposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
-                   lapack_int* ldaf, char* equed, float* s, float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                   float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
-                   lapack_int* ldaf, char* equed, double* s, double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                   double* rcond, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* af, lapack_int* ldaf, char* equed, float* s, lapack_complex_float* b,
-                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* af, lapack_int* ldaf, char* equed, double* s, lapack_complex_double* b,
-                   lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr,
-                   double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_dposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
-                    lapack_int* ldaf, char* equed, double* s, double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                    double* rcond, double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
-                    double* err_bnds_comp, lapack_int* nparams, double* params, double* work, lapack_int* iwork,
-                    lapack_int* info);
-void LAPACK_sposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
-                    lapack_int* ldaf, char* equed, float* s, float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* rcond, float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params, float* work, lapack_int* iwork,
-                    lapack_int* info);
-void LAPACK_zposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf, char* equed, double* s, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond, double* rpvgrw,
-                    double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_cposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf, char* equed, float* s, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
-                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
-                    float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_sppsv(char* uplo, lapack_int* n, lapack_int* nrhs, float* ap, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dppsv(char* uplo, lapack_int* n, lapack_int* nrhs, double* ap, double* b, lapack_int* ldb,
-                  lapack_int* info);
-void LAPACK_cppsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap, lapack_complex_float* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_zppsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap, lapack_complex_double* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_sppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* ap, float* afp, char* equed,
-                   float* s, float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr,
-                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* ap, double* afp, char* equed,
-                   double* s, double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* ferr,
-                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap,
-                   lapack_complex_float* afp, char* equed, float* s, lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap,
-                   lapack_complex_double* afp, char* equed, double* s, lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_spbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, float* ab, lapack_int* ldab, float* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_dpbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, double* ab, lapack_int* ldab, double* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_cpbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_float* ab,
-                  lapack_int* ldab, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zpbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_double* ab,
-                  lapack_int* ldab, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_spbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, float* ab, lapack_int* ldab,
-                   float* afb, lapack_int* ldafb, char* equed, float* s, float* b, lapack_int* ldb, float* x,
-                   lapack_int* ldx, float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dpbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, double* ab,
-                   lapack_int* ldab, double* afb, lapack_int* ldafb, char* equed, double* s, double* b, lapack_int* ldb,
-                   double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr, double* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_cpbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_float* ab,
-                   lapack_int* ldab, lapack_complex_float* afb, lapack_int* ldafb, char* equed, float* s,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zpbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_double* ab,
-                   lapack_int* ldab, lapack_complex_double* afb, lapack_int* ldafb, char* equed, double* s,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sptsv(lapack_int* n, lapack_int* nrhs, float* d, float* e, float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_dptsv(lapack_int* n, lapack_int* nrhs, double* d, double* e, double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_cptsv(lapack_int* n, lapack_int* nrhs, float* d, lapack_complex_float* e, lapack_complex_float* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_zptsv(lapack_int* n, lapack_int* nrhs, double* d, lapack_complex_double* e, lapack_complex_double* b,
-                  lapack_int* ldb, lapack_int* info);
-void LAPACK_sptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const float* d, const float* e, float* df, float* ef,
-                   const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   float* work, lapack_int* info);
-void LAPACK_dptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const double* d, const double* e, double* df,
-                   double* ef, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                   double* ferr, double* berr, double* work, lapack_int* info);
-void LAPACK_cptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const float* d, const lapack_complex_float* e,
-                   float* df, lapack_complex_float* ef, const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const double* d, const lapack_complex_double* e,
-                   double* df, lapack_complex_double* ef, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_ssysv(char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, lapack_int* ipiv, float* b,
-                  lapack_int* ldb, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dsysv(char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, lapack_int* ipiv, double* b,
-                  lapack_int* ldb, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_csysv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                  lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
-                  lapack_int* lwork, lapack_int* info);
-void LAPACK_zsysv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                  lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
-                  lapack_int* lwork, lapack_int* info);
-void LAPACK_ssysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, float* af,
-                   lapack_int* ldaf, lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                   float* rcond, float* ferr, float* berr, float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dsysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
-                   double* af, lapack_int* ldaf, lapack_int* ipiv, const double* b, lapack_int* ldb, double* x,
-                   lapack_int* ldx, double* rcond, double* ferr, double* berr, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_csysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv,
-                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                   float* rcond, float* ferr, float* berr, lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* info);
-void LAPACK_zsysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   double* rcond, double* ferr, double* berr, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* info);
-void LAPACK_dsysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
-                    lapack_int* ldaf, lapack_int* ipiv, char* equed, double* s, double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* rcond, double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params, double* work,
-                    lapack_int* iwork, lapack_int* info);
-void LAPACK_ssysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
-                    lapack_int* ldaf, lapack_int* ipiv, char* equed, float* s, float* b, lapack_int* ldb, float* x,
-                    lapack_int* ldx, float* rcond, float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params, float* work,
-                    lapack_int* iwork, lapack_int* info);
-void LAPACK_zsysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* s,
-                    lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_csysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* s,
-                    lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_chesv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                  lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
-                  lapack_int* lwork, lapack_int* info);
-void LAPACK_zhesv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                  lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
-                  lapack_int* lwork, lapack_int* info);
-void LAPACK_chesvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv,
-                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                   float* rcond, float* ferr, float* berr, lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* info);
-void LAPACK_zhesvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                   double* rcond, double* ferr, double* berr, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* info);
-void LAPACK_zhesvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* s,
-                    lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_chesvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* s,
-                    lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_sspsv(char* uplo, lapack_int* n, lapack_int* nrhs, float* ap, lapack_int* ipiv, float* b, lapack_int* ldb,
-                  lapack_int* info);
-void LAPACK_dspsv(char* uplo, lapack_int* n, lapack_int* nrhs, double* ap, lapack_int* ipiv, double* b, lapack_int* ldb,
-                  lapack_int* info);
-void LAPACK_cspsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap, lapack_int* ipiv,
-                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zspsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap, lapack_int* ipiv,
-                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_sspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, float* afp,
-                   lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, double* afp,
-                   lapack_int* ipiv, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                   double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_cspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
-                   lapack_complex_float* afp, lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
-                   lapack_complex_double* afp, lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_chpsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap, lapack_int* ipiv,
-                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zhpsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap, lapack_int* ipiv,
-                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_chpsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
-                   lapack_complex_float* afp, lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zhpsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
-                   lapack_complex_double* afp, lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
-                   lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sgeqrf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dgeqrf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_cgeqrf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgeqrf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sgeqpf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* jpvt, float* tau, float* work,
-                   lapack_int* info);
-void LAPACK_dgeqpf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* jpvt, double* tau,
-                   double* work, lapack_int* info);
-void LAPACK_cgeqpf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* jpvt,
-                   lapack_complex_float* tau, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zgeqpf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* jpvt,
-                   lapack_complex_double* tau, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sgeqp3(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* jpvt, float* tau, float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_dgeqp3(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* jpvt, double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cgeqp3(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* jpvt,
-                   lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* info);
-void LAPACK_zgeqp3(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* jpvt,
-                   lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, double* rwork,
-                   lapack_int* info);
-void LAPACK_sorgqr(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
-                   float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorgqr(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sormqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
-                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dormqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
-                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cungqr(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zungqr(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cunmqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zunmqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sgelqf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dgelqf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_cgelqf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgelqf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sorglq(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
-                   float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorglq(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sormlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
-                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dormlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
-                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cunglq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zunglq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cunmlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zunmlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sgeqlf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dgeqlf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_cgeqlf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgeqlf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sorgql(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
-                   float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorgql(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cungql(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zungql(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sormql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
-                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dormql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
-                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cunmql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zunmql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sgerqf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dgerqf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_cgerqf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgerqf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sorgrq(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
-                   float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorgrq(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cungrq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zungrq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sormrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
-                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dormrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
-                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cunmrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zunmrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_stzrzf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dtzrzf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_ctzrzf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_ztzrzf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sormrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, const float* a,
-                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dormrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, const double* a,
-                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cunmrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
-                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
-                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_zunmrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
-                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* tau,
-                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_sggqrf(lapack_int* n, lapack_int* m, lapack_int* p, float* a, lapack_int* lda, float* taua, float* b,
-                   lapack_int* ldb, float* taub, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dggqrf(lapack_int* n, lapack_int* m, lapack_int* p, double* a, lapack_int* lda, double* taua, double* b,
-                   lapack_int* ldb, double* taub, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cggqrf(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* taua, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* taub,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zggqrf(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* taua, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* taub,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sggrqf(lapack_int* m, lapack_int* p, lapack_int* n, float* a, lapack_int* lda, float* taua, float* b,
-                   lapack_int* ldb, float* taub, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dggrqf(lapack_int* m, lapack_int* p, lapack_int* n, double* a, lapack_int* lda, double* taua, double* b,
-                   lapack_int* ldb, double* taub, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cggrqf(lapack_int* m, lapack_int* p, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* taua, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* taub,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zggrqf(lapack_int* m, lapack_int* p, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* taua, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* taub,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sgebrd(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* d, float* e, float* tauq,
-                   float* taup, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dgebrd(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* d, double* e, double* tauq,
-                   double* taup, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cgebrd(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* d, float* e,
-                   lapack_complex_float* tauq, lapack_complex_float* taup, lapack_complex_float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_zgebrd(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* d, double* e,
-                   lapack_complex_double* tauq, lapack_complex_double* taup, lapack_complex_double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_sgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku, float* ab,
-                   lapack_int* ldab, float* d, float* e, float* q, lapack_int* ldq, float* pt, lapack_int* ldpt,
-                   float* c, lapack_int* ldc, float* work, lapack_int* info);
-void LAPACK_dgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku,
-                   double* ab, lapack_int* ldab, double* d, double* e, double* q, lapack_int* ldq, double* pt,
-                   lapack_int* ldpt, double* c, lapack_int* ldc, double* work, lapack_int* info);
-void LAPACK_cgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku,
-                   lapack_complex_float* ab, lapack_int* ldab, float* d, float* e, lapack_complex_float* q,
-                   lapack_int* ldq, lapack_complex_float* pt, lapack_int* ldpt, lapack_complex_float* c,
-                   lapack_int* ldc, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku,
-                   lapack_complex_double* ab, lapack_int* ldab, double* d, double* e, lapack_complex_double* q,
-                   lapack_int* ldq, lapack_complex_double* pt, lapack_int* ldpt, lapack_complex_double* c,
-                   lapack_int* ldc, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sorgbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
-                   float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorgbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda,
-                   const double* tau, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sormbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
-                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_dormbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
-                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cungbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zungbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cunmbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
-                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_zunmbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* tau,
-                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_sbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, float* d, float* e,
-                   float* vt, lapack_int* ldvt, float* u, lapack_int* ldu, float* c, lapack_int* ldc, float* work,
-                   lapack_int* info);
-void LAPACK_dbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, double* d, double* e,
-                   double* vt, lapack_int* ldvt, double* u, lapack_int* ldu, double* c, lapack_int* ldc, double* work,
-                   lapack_int* info);
-void LAPACK_cbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, float* d, float* e,
-                   lapack_complex_float* vt, lapack_int* ldvt, lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* c, lapack_int* ldc, float* work, lapack_int* info);
-void LAPACK_zbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, double* d, double* e,
-                   lapack_complex_double* vt, lapack_int* ldvt, lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* c, lapack_int* ldc, double* work, lapack_int* info);
-void LAPACK_sbdsdc(char* uplo, char* compq, lapack_int* n, float* d, float* e, float* u, lapack_int* ldu, float* vt,
-                   lapack_int* ldvt, float* q, lapack_int* iq, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dbdsdc(char* uplo, char* compq, lapack_int* n, double* d, double* e, double* u, lapack_int* ldu, double* vt,
-                   lapack_int* ldvt, double* q, lapack_int* iq, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_ssytrd(char* uplo, lapack_int* n, float* a, lapack_int* lda, float* d, float* e, float* tau, float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_dsytrd(char* uplo, lapack_int* n, double* a, lapack_int* lda, double* d, double* e, double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sorgtr(char* uplo, lapack_int* n, float* a, lapack_int* lda, const float* tau, float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_dorgtr(char* uplo, lapack_int* n, double* a, lapack_int* lda, const double* tau, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_sormtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda,
-                   const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dormtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda,
-                   const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_chetrd(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* d, float* e,
-                   lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zhetrd(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* d, double* e,
-                   lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cungtr(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zungtr(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cunmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zunmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_ssptrd(char* uplo, lapack_int* n, float* ap, float* d, float* e, float* tau, lapack_int* info);
-void LAPACK_dsptrd(char* uplo, lapack_int* n, double* ap, double* d, double* e, double* tau, lapack_int* info);
-void LAPACK_sopgtr(char* uplo, lapack_int* n, const float* ap, const float* tau, float* q, lapack_int* ldq, float* work,
-                   lapack_int* info);
-void LAPACK_dopgtr(char* uplo, lapack_int* n, const double* ap, const double* tau, double* q, lapack_int* ldq,
-                   double* work, lapack_int* info);
-void LAPACK_sopmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const float* ap, const float* tau,
-                   float* c, lapack_int* ldc, float* work, lapack_int* info);
-void LAPACK_dopmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const double* ap,
-                   const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* info);
-void LAPACK_chptrd(char* uplo, lapack_int* n, lapack_complex_float* ap, float* d, float* e, lapack_complex_float* tau,
-                   lapack_int* info);
-void LAPACK_zhptrd(char* uplo, lapack_int* n, lapack_complex_double* ap, double* d, double* e,
-                   lapack_complex_double* tau, lapack_int* info);
-void LAPACK_cupgtr(char* uplo, lapack_int* n, const lapack_complex_float* ap, const lapack_complex_float* tau,
-                   lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zupgtr(char* uplo, lapack_int* n, const lapack_complex_double* ap, const lapack_complex_double* tau,
-                   lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* work, lapack_int* info);
-void LAPACK_cupmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_float* ap,
-                   const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_zupmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_double* ap,
-                   const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
-                   lapack_complex_double* work, lapack_int* info);
-void LAPACK_ssbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, float* d,
-                   float* e, float* q, lapack_int* ldq, float* work, lapack_int* info);
-void LAPACK_dsbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, double* d,
-                   double* e, double* q, lapack_int* ldq, double* work, lapack_int* info);
-void LAPACK_chbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
-                   float* d, float* e, lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* work,
-                   lapack_int* info);
-void LAPACK_zhbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
-                   double* d, double* e, lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* work,
-                   lapack_int* info);
-void LAPACK_ssterf(lapack_int* n, float* d, float* e, lapack_int* info);
-void LAPACK_dsterf(lapack_int* n, double* d, double* e, lapack_int* info);
-void LAPACK_ssteqr(char* compz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
-                   lapack_int* info);
-void LAPACK_dsteqr(char* compz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
-                   lapack_int* info);
-void LAPACK_csteqr(char* compz, lapack_int* n, float* d, float* e, lapack_complex_float* z, lapack_int* ldz,
-                   float* work, lapack_int* info);
-void LAPACK_zsteqr(char* compz, lapack_int* n, double* d, double* e, lapack_complex_double* z, lapack_int* ldz,
-                   double* work, lapack_int* info);
-void LAPACK_sstemr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, lapack_int* m, float* w, float* z, lapack_int* ldz, lapack_int* nzc,
-                   lapack_int* isuppz, lapack_logical* tryrac, float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_dstemr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, lapack_int* m, double* w, double* z, lapack_int* ldz, lapack_int* nzc,
-                   lapack_int* isuppz, lapack_logical* tryrac, double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_cstemr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz, lapack_int* nzc,
-                   lapack_int* isuppz, lapack_logical* tryrac, float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_zstemr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz, lapack_int* nzc,
-                   lapack_int* isuppz, lapack_logical* tryrac, double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_sstedc(char* compz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_dstedc(char* compz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_cstedc(char* compz, lapack_int* n, float* d, float* e, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_zstedc(char* compz, lapack_int* n, double* d, double* e, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_sstegr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_dstegr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
-                   lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_cstegr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_zstegr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_spteqr(char* compz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
-                   lapack_int* info);
-void LAPACK_dpteqr(char* compz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
-                   lapack_int* info);
-void LAPACK_cpteqr(char* compz, lapack_int* n, float* d, float* e, lapack_complex_float* z, lapack_int* ldz,
-                   float* work, lapack_int* info);
-void LAPACK_zpteqr(char* compz, lapack_int* n, double* d, double* e, lapack_complex_double* z, lapack_int* ldz,
-                   double* work, lapack_int* info);
-void LAPACK_sstebz(char* range, char* order, lapack_int* n, float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                   float* abstol, const float* d, const float* e, lapack_int* m, lapack_int* nsplit, float* w,
-                   lapack_int* iblock, lapack_int* isplit, float* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_dstebz(char* range, char* order, lapack_int* n, double* vl, double* vu, lapack_int* il, lapack_int* iu,
-                   double* abstol, const double* d, const double* e, lapack_int* m, lapack_int* nsplit, double* w,
-                   lapack_int* iblock, lapack_int* isplit, double* work, lapack_int* iwork, lapack_int* info);
-void LAPACK_sstein(lapack_int* n, const float* d, const float* e, lapack_int* m, const float* w,
-                   const lapack_int* iblock, const lapack_int* isplit, float* z, lapack_int* ldz, float* work,
-                   lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
-void LAPACK_dstein(lapack_int* n, const double* d, const double* e, lapack_int* m, const double* w,
-                   const lapack_int* iblock, const lapack_int* isplit, double* z, lapack_int* ldz, double* work,
-                   lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
-void LAPACK_cstein(lapack_int* n, const float* d, const float* e, lapack_int* m, const float* w,
-                   const lapack_int* iblock, const lapack_int* isplit, lapack_complex_float* z, lapack_int* ldz,
-                   float* work, lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
-void LAPACK_zstein(lapack_int* n, const double* d, const double* e, lapack_int* m, const double* w,
-                   const lapack_int* iblock, const lapack_int* isplit, lapack_complex_double* z, lapack_int* ldz,
-                   double* work, lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
-void LAPACK_sdisna(char* job, lapack_int* m, lapack_int* n, const float* d, float* sep, lapack_int* info);
-void LAPACK_ddisna(char* job, lapack_int* m, lapack_int* n, const double* d, double* sep, lapack_int* info);
-void LAPACK_ssygst(lapack_int* itype, char* uplo, lapack_int* n, float* a, lapack_int* lda, const float* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_dsygst(lapack_int* itype, char* uplo, lapack_int* n, double* a, lapack_int* lda, const double* b,
-                   lapack_int* ldb, lapack_int* info);
-void LAPACK_chegst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_zhegst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
-void LAPACK_sspgst(lapack_int* itype, char* uplo, lapack_int* n, float* ap, const float* bp, lapack_int* info);
-void LAPACK_dspgst(lapack_int* itype, char* uplo, lapack_int* n, double* ap, const double* bp, lapack_int* info);
-void LAPACK_chpgst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_float* ap,
-                   const lapack_complex_float* bp, lapack_int* info);
-void LAPACK_zhpgst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_double* ap,
-                   const lapack_complex_double* bp, lapack_int* info);
-void LAPACK_ssbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
-                   const float* bb, lapack_int* ldbb, float* x, lapack_int* ldx, float* work, lapack_int* info);
-void LAPACK_dsbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab, lapack_int* ldab,
-                   const double* bb, lapack_int* ldbb, double* x, lapack_int* ldx, double* work, lapack_int* info);
-void LAPACK_chbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
-                   lapack_int* ldab, const lapack_complex_float* bb, lapack_int* ldbb, lapack_complex_float* x,
-                   lapack_int* ldx, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zhbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
-                   lapack_int* ldab, const lapack_complex_double* bb, lapack_int* ldbb, lapack_complex_double* x,
-                   lapack_int* ldx, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_spbstf(char* uplo, lapack_int* n, lapack_int* kb, float* bb, lapack_int* ldbb, lapack_int* info);
-void LAPACK_dpbstf(char* uplo, lapack_int* n, lapack_int* kb, double* bb, lapack_int* ldbb, lapack_int* info);
-void LAPACK_cpbstf(char* uplo, lapack_int* n, lapack_int* kb, lapack_complex_float* bb, lapack_int* ldbb,
-                   lapack_int* info);
-void LAPACK_zpbstf(char* uplo, lapack_int* n, lapack_int* kb, lapack_complex_double* bb, lapack_int* ldbb,
-                   lapack_int* info);
-void LAPACK_sgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, lapack_int* lda, float* tau, float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_dgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a, lapack_int* lda, double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sorghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, lapack_int* lda, const float* tau,
-                   float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a, lapack_int* lda, const double* tau,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sormhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                   const float* a, lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_dormhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                   const double* a, lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_cunghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zunghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cunmhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
-                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_zunmhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* tau,
-                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_sgebal(char* job, lapack_int* n, float* a, lapack_int* lda, lapack_int* ilo, lapack_int* ihi, float* scale,
-                   lapack_int* info);
-void LAPACK_dgebal(char* job, lapack_int* n, double* a, lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
-                   double* scale, lapack_int* info);
-void LAPACK_cgebal(char* job, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
-                   float* scale, lapack_int* info);
-void LAPACK_zgebal(char* job, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ilo,
-                   lapack_int* ihi, double* scale, lapack_int* info);
-void LAPACK_sgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* scale,
-                   lapack_int* m, float* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_dgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* scale,
-                   lapack_int* m, double* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_cgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* scale,
-                   lapack_int* m, lapack_complex_float* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_zgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* scale,
-                   lapack_int* m, lapack_complex_double* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_shseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* h, lapack_int* ldh,
-                   float* wr, float* wi, float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dhseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* h, lapack_int* ldh,
-                   double* wr, double* wi, double* z, lapack_int* ldz, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_chseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* h,
-                   lapack_int* ldh, lapack_complex_float* w, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zhseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* h,
-                   lapack_int* ldh, lapack_complex_double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_shsein(char* job, char* eigsrc, char* initv, lapack_logical* select, lapack_int* n, const float* h,
-                   lapack_int* ldh, float* wr, const float* wi, float* vl, lapack_int* ldvl, float* vr,
-                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, float* work, lapack_int* ifaill, lapack_int* ifailr,
-                   lapack_int* info);
-void LAPACK_dhsein(char* job, char* eigsrc, char* initv, lapack_logical* select, lapack_int* n, const double* h,
-                   lapack_int* ldh, double* wr, const double* wi, double* vl, lapack_int* ldvl, double* vr,
-                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, double* work, lapack_int* ifaill,
-                   lapack_int* ifailr, lapack_int* info);
-void LAPACK_chsein(char* job, char* eigsrc, char* initv, const lapack_logical* select, lapack_int* n,
-                   const lapack_complex_float* h, lapack_int* ldh, lapack_complex_float* w, lapack_complex_float* vl,
-                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
-                   lapack_complex_float* work, float* rwork, lapack_int* ifaill, lapack_int* ifailr, lapack_int* info);
-void LAPACK_zhsein(char* job, char* eigsrc, char* initv, const lapack_logical* select, lapack_int* n,
-                   const lapack_complex_double* h, lapack_int* ldh, lapack_complex_double* w, lapack_complex_double* vl,
-                   lapack_int* ldvl, lapack_complex_double* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
-                   lapack_complex_double* work, double* rwork, lapack_int* ifaill, lapack_int* ifailr,
-                   lapack_int* info);
-void LAPACK_strevc(char* side, char* howmny, lapack_logical* select, lapack_int* n, const float* t, lapack_int* ldt,
-                   float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m, float* work,
-                   lapack_int* info);
-void LAPACK_dtrevc(char* side, char* howmny, lapack_logical* select, lapack_int* n, const double* t, lapack_int* ldt,
-                   double* vl, lapack_int* ldvl, double* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
-                   double* work, lapack_int* info);
-void LAPACK_ctrevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, lapack_complex_float* t,
-                   lapack_int* ldt, lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr,
-                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, lapack_complex_float* work, float* rwork,
-                   lapack_int* info);
-void LAPACK_ztrevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, lapack_complex_double* t,
-                   lapack_int* ldt, lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr,
-                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, lapack_complex_double* work, double* rwork,
-                   lapack_int* info);
-void LAPACK_strsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const float* t,
-                   lapack_int* ldt, const float* vl, lapack_int* ldvl, const float* vr, lapack_int* ldvr, float* s,
-                   float* sep, lapack_int* mm, lapack_int* m, float* work, lapack_int* ldwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dtrsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const double* t,
-                   lapack_int* ldt, const double* vl, lapack_int* ldvl, const double* vr, lapack_int* ldvr, double* s,
-                   double* sep, lapack_int* mm, lapack_int* m, double* work, lapack_int* ldwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_ctrsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_float* t,
-                   lapack_int* ldt, const lapack_complex_float* vl, lapack_int* ldvl, const lapack_complex_float* vr,
-                   lapack_int* ldvr, float* s, float* sep, lapack_int* mm, lapack_int* m, lapack_complex_float* work,
-                   lapack_int* ldwork, float* rwork, lapack_int* info);
-void LAPACK_ztrsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_double* t,
-                   lapack_int* ldt, const lapack_complex_double* vl, lapack_int* ldvl, const lapack_complex_double* vr,
-                   lapack_int* ldvr, double* s, double* sep, lapack_int* mm, lapack_int* m, lapack_complex_double* work,
-                   lapack_int* ldwork, double* rwork, lapack_int* info);
-void LAPACK_strexc(char* compq, lapack_int* n, float* t, lapack_int* ldt, float* q, lapack_int* ldq, lapack_int* ifst,
-                   lapack_int* ilst, float* work, lapack_int* info);
-void LAPACK_dtrexc(char* compq, lapack_int* n, double* t, lapack_int* ldt, double* q, lapack_int* ldq, lapack_int* ifst,
-                   lapack_int* ilst, double* work, lapack_int* info);
-void LAPACK_ctrexc(char* compq, lapack_int* n, lapack_complex_float* t, lapack_int* ldt, lapack_complex_float* q,
-                   lapack_int* ldq, lapack_int* ifst, lapack_int* ilst, lapack_int* info);
-void LAPACK_ztrexc(char* compq, lapack_int* n, lapack_complex_double* t, lapack_int* ldt, lapack_complex_double* q,
-                   lapack_int* ldq, lapack_int* ifst, lapack_int* ilst, lapack_int* info);
-void LAPACK_strsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, float* t, lapack_int* ldt,
-                   float* q, lapack_int* ldq, float* wr, float* wi, lapack_int* m, float* s, float* sep, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_dtrsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, double* t, lapack_int* ldt,
-                   double* q, lapack_int* ldq, double* wr, double* wi, lapack_int* m, double* s, double* sep,
-                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_ctrsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, lapack_complex_float* t,
-                   lapack_int* ldt, lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* w, lapack_int* m,
-                   float* s, float* sep, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_ztrsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, lapack_complex_double* t,
-                   lapack_int* ldt, lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* w, lapack_int* m,
-                   double* s, double* sep, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_strsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n, const float* a,
-                   lapack_int* lda, const float* b, lapack_int* ldb, float* c, lapack_int* ldc, float* scale,
-                   lapack_int* info);
-void LAPACK_dtrsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n, const double* a,
-                   lapack_int* lda, const double* b, lapack_int* ldb, double* c, lapack_int* ldc, double* scale,
-                   lapack_int* info);
-void LAPACK_ctrsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n,
-                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* c, lapack_int* ldc, float* scale, lapack_int* info);
-void LAPACK_ztrsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n,
-                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* c, lapack_int* ldc, double* scale, lapack_int* info);
-void LAPACK_sgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, lapack_int* lda,
-                   float* b, lapack_int* ldb, float* q, lapack_int* ldq, float* z, lapack_int* ldz, lapack_int* info);
-void LAPACK_dgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* q, lapack_int* ldq, double* z, lapack_int* ldz,
-                   lapack_int* info);
-void LAPACK_cgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* z, lapack_int* ldz, lapack_int* info);
-void LAPACK_zgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* q,
-                   lapack_int* ldq, lapack_complex_double* z, lapack_int* ldz, lapack_int* info);
-void LAPACK_sggbal(char* job, lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb, lapack_int* ilo,
-                   lapack_int* ihi, float* lscale, float* rscale, float* work, lapack_int* info);
-void LAPACK_dggbal(char* job, lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb, lapack_int* ilo,
-                   lapack_int* ihi, double* lscale, double* rscale, double* work, lapack_int* info);
-void LAPACK_cggbal(char* job, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b,
-                   lapack_int* ldb, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale, float* work,
-                   lapack_int* info);
-void LAPACK_zggbal(char* job, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b,
-                   lapack_int* ldb, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale, double* work,
-                   lapack_int* info);
-void LAPACK_sggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* lscale,
-                   const float* rscale, lapack_int* m, float* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_dggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* lscale,
-                   const double* rscale, lapack_int* m, double* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_cggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* lscale,
-                   const float* rscale, lapack_int* m, lapack_complex_float* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_zggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* lscale,
-                   const double* rscale, lapack_int* m, lapack_complex_double* v, lapack_int* ldv, lapack_int* info);
-void LAPACK_shgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* h,
-                   lapack_int* ldh, float* t, lapack_int* ldt, float* alphar, float* alphai, float* beta, float* q,
-                   lapack_int* ldq, float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dhgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* h,
-                   lapack_int* ldh, double* t, lapack_int* ldt, double* alphar, double* alphai, double* beta, double* q,
-                   lapack_int* ldq, double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_chgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                   lapack_complex_float* h, lapack_int* ldh, lapack_complex_float* t, lapack_int* ldt,
-                   lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork,
-                   float* rwork, lapack_int* info);
-void LAPACK_zhgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                   lapack_complex_double* h, lapack_int* ldh, lapack_complex_double* t, lapack_int* ldt,
-                   lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* q, lapack_int* ldq,
-                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* info);
-void LAPACK_stgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, const float* s,
-                   lapack_int* lds, const float* p, lapack_int* ldp, float* vl, lapack_int* ldvl, float* vr,
-                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, float* work, lapack_int* info);
-void LAPACK_dtgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, const double* s,
-                   lapack_int* lds, const double* p, lapack_int* ldp, double* vl, lapack_int* ldvl, double* vr,
-                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, double* work, lapack_int* info);
-void LAPACK_ctgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_float* s,
-                   lapack_int* lds, const lapack_complex_float* p, lapack_int* ldp, lapack_complex_float* vl,
-                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
-                   lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_ztgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n,
-                   const lapack_complex_double* s, lapack_int* lds, const lapack_complex_double* p, lapack_int* ldp,
-                   lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr, lapack_int* ldvr,
-                   lapack_int* mm, lapack_int* m, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_stgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, float* a, lapack_int* lda, float* b,
-                   lapack_int* ldb, float* q, lapack_int* ldq, float* z, lapack_int* ldz, lapack_int* ifst,
-                   lapack_int* ilst, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dtgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, double* q, lapack_int* ldq, double* z, lapack_int* ldz, lapack_int* ifst,
-                   lapack_int* ilst, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_ctgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* z, lapack_int* ldz, lapack_int* ifst, lapack_int* ilst, lapack_int* info);
-void LAPACK_ztgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* q,
-                   lapack_int* ldq, lapack_complex_double* z, lapack_int* ldz, lapack_int* ifst, lapack_int* ilst,
-                   lapack_int* info);
-void LAPACK_stgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
-                   lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* alphar, float* alphai,
-                   float* beta, float* q, lapack_int* ldq, float* z, lapack_int* ldz, lapack_int* m, float* pl,
-                   float* pr, float* dif, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_dtgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
-                   lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
-                   double* alphai, double* beta, double* q, lapack_int* ldq, double* z, lapack_int* ldz, lapack_int* m,
-                   double* pl, double* pr, double* dif, double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_ctgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
-                   lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* z, lapack_int* ldz, lapack_int* m, float* pl, float* pr, float* dif,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_ztgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* q, lapack_int* ldq,
-                   lapack_complex_double* z, lapack_int* ldz, lapack_int* m, double* pl, double* pr, double* dif,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_stgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda,
-                   const float* b, lapack_int* ldb, float* c, lapack_int* ldc, const float* d, lapack_int* ldd,
-                   const float* e, lapack_int* lde, float* f, lapack_int* ldf, float* scale, float* dif, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_dtgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda,
-                   const double* b, lapack_int* ldb, double* c, lapack_int* ldc, const double* d, lapack_int* ldd,
-                   const double* e, lapack_int* lde, double* f, lapack_int* ldf, double* scale, double* dif,
-                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_ctgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* c,
-                   lapack_int* ldc, const lapack_complex_float* d, lapack_int* ldd, const lapack_complex_float* e,
-                   lapack_int* lde, lapack_complex_float* f, lapack_int* ldf, float* scale, float* dif,
-                   lapack_complex_float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_ztgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* c,
-                   lapack_int* ldc, const lapack_complex_double* d, lapack_int* ldd, const lapack_complex_double* e,
-                   lapack_int* lde, lapack_complex_double* f, lapack_int* ldf, double* scale, double* dif,
-                   lapack_complex_double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_stgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const float* a,
-                   lapack_int* lda, const float* b, lapack_int* ldb, const float* vl, lapack_int* ldvl, const float* vr,
-                   lapack_int* ldvr, float* s, float* dif, lapack_int* mm, lapack_int* m, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_dtgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const double* a,
-                   lapack_int* lda, const double* b, lapack_int* ldb, const double* vl, lapack_int* ldvl,
-                   const double* vr, lapack_int* ldvr, double* s, double* dif, lapack_int* mm, lapack_int* m,
-                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_ctgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb, const lapack_complex_float* vl,
-                   lapack_int* ldvl, const lapack_complex_float* vr, lapack_int* ldvr, float* s, float* dif,
-                   lapack_int* mm, lapack_int* m, lapack_complex_float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_ztgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb, const lapack_complex_double* vl,
-                   lapack_int* ldvl, const lapack_complex_double* vr, lapack_int* ldvr, double* s, double* dif,
-                   lapack_int* mm, lapack_int* m, lapack_complex_double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_sggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* tola, float* tolb, lapack_int* k, lapack_int* l,
-                   float* u, lapack_int* ldu, float* v, lapack_int* ldv, float* q, lapack_int* ldq, lapack_int* iwork,
-                   float* tau, float* work, lapack_int* info);
-void LAPACK_dggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* tola, double* tolb, lapack_int* k,
-                   lapack_int* l, double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                   lapack_int* iwork, double* tau, double* work, lapack_int* info);
-void LAPACK_cggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, float* tola,
-                   float* tolb, lapack_int* k, lapack_int* l, lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq,
-                   lapack_int* iwork, float* rwork, lapack_complex_float* tau, lapack_complex_float* work,
-                   lapack_int* info);
-void LAPACK_zggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, double* tola,
-                   double* tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq,
-                   lapack_int* iwork, double* rwork, lapack_complex_double* tau, lapack_complex_double* work,
-                   lapack_int* info);
-void LAPACK_stgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
-                   lapack_int* l, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* tola, float* tolb,
-                   float* alpha, float* beta, float* u, lapack_int* ldu, float* v, lapack_int* ldv, float* q,
-                   lapack_int* ldq, float* work, lapack_int* ncycle, lapack_int* info);
-void LAPACK_dtgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
-                   lapack_int* l, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* tola, double* tolb,
-                   double* alpha, double* beta, double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q,
-                   lapack_int* ldq, double* work, lapack_int* ncycle, lapack_int* info);
-void LAPACK_ctgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
-                   lapack_int* l, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                   float* tola, float* tolb, float* alpha, float* beta, lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* work, lapack_int* ncycle, lapack_int* info);
-void LAPACK_ztgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
-                   lapack_int* l, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                   double* tola, double* tolb, double* alpha, double* beta, lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq,
-                   lapack_complex_double* work, lapack_int* ncycle, lapack_int* info);
-void LAPACK_sgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b,
-                  lapack_int* ldb, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
-                  lapack_int* ldb, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                  lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work, lapack_int* lwork,
-                  lapack_int* info);
-void LAPACK_zgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
-                  lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
-                  lapack_int* lwork, lapack_int* info);
-void LAPACK_sgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   lapack_int* jpvt, float* rcond, lapack_int* rank, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, lapack_int* jpvt, double* rcond, lapack_int* rank, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* jpvt, float* rcond, lapack_int* rank,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
-void LAPACK_zgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* jpvt, double* rcond, lapack_int* rank,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
-void LAPACK_sgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* s, float* rcond, lapack_int* rank, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, double* s, double* rcond, lapack_int* rank, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* s, float* rcond, lapack_int* rank,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
-void LAPACK_zgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* s, double* rcond, lapack_int* rank,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
-void LAPACK_sgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* s, float* rcond, lapack_int* rank, float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, double* s, double* rcond, lapack_int* rank, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_cgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* s, float* rcond, lapack_int* rank,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_zgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* s, double* rcond, lapack_int* rank,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_sgglse(lapack_int* m, lapack_int* n, lapack_int* p, float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* c, float* d, float* x, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dgglse(lapack_int* m, lapack_int* n, lapack_int* p, double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* c, double* d, double* x, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cgglse(lapack_int* m, lapack_int* n, lapack_int* p, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* c, lapack_complex_float* d,
-                   lapack_complex_float* x, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgglse(lapack_int* m, lapack_int* n, lapack_int* p, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* c, lapack_complex_double* d,
-                   lapack_complex_double* x, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_sggglm(lapack_int* n, lapack_int* m, lapack_int* p, float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* d, float* x, float* y, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dggglm(lapack_int* n, lapack_int* m, lapack_int* p, double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* d, double* x, double* y, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cggglm(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* d, lapack_complex_float* x,
-                   lapack_complex_float* y, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zggglm(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* d, lapack_complex_double* x,
-                   lapack_complex_double* y, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_ssyev(char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* w, float* work,
-                  lapack_int* lwork, lapack_int* info);
-void LAPACK_dsyev(char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* w, double* work,
-                  lapack_int* lwork, lapack_int* info);
-void LAPACK_cheev(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* w,
-                  lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
-void LAPACK_zheev(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* w,
-                  lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
-void LAPACK_ssyevd(char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* w, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_dsyevd(char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* w, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_cheevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_zheevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* w,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_ssyevx(char* jobz, char* range, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_dsyevx(char* jobz, char* range, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* ifail,
-                   lapack_int* info);
-void LAPACK_cheevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                   lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork,
-                   float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_zheevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_ssyevr(char* jobz, char* range, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_dsyevr(char* jobz, char* range, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z,
-                   lapack_int* ldz, lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_cheevr(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                   lapack_complex_float* z, lapack_int* ldz, lapack_int* isuppz, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_zheevr(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   lapack_complex_double* z, lapack_int* ldz, lapack_int* isuppz, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_sspev(char* jobz, char* uplo, lapack_int* n, float* ap, float* w, float* z, lapack_int* ldz, float* work,
-                  lapack_int* info);
-void LAPACK_dspev(char* jobz, char* uplo, lapack_int* n, double* ap, double* w, double* z, lapack_int* ldz,
-                  double* work, lapack_int* info);
-void LAPACK_chpev(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap, float* w, lapack_complex_float* z,
-                  lapack_int* ldz, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zhpev(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap, double* w, lapack_complex_double* z,
-                  lapack_int* ldz, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sspevd(char* jobz, char* uplo, lapack_int* n, float* ap, float* w, float* z, lapack_int* ldz, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_dspevd(char* jobz, char* uplo, lapack_int* n, double* ap, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_chpevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_zhpevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap, double* w,
-                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_sspevx(char* jobz, char* range, char* uplo, lapack_int* n, float* ap, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz, float* work,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_dspevx(char* jobz, char* range, char* uplo, lapack_int* n, double* ap, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_chpevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* ap, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail,
-                   lapack_int* info);
-void LAPACK_zhpevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* ap, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, double* rwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_ssbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, float* w,
-                  float* z, lapack_int* ldz, float* work, lapack_int* info);
-void LAPACK_dsbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, double* w,
-                  double* z, lapack_int* ldz, double* work, lapack_int* info);
-void LAPACK_chbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
-                  float* w, lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, float* rwork,
-                  lapack_int* info);
-void LAPACK_zhbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
-                  double* w, lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, double* rwork,
-                  lapack_int* info);
-void LAPACK_ssbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, float* w,
-                   float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_dsbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_chbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
-                   float* w, lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork,
-                   float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_zhbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
-                   double* w, lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_ssbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab,
-                   float* q, lapack_int* ldq, float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, float* z, lapack_int* ldz, float* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int* info);
-void LAPACK_dsbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab,
-                   double* q, lapack_int* ldq, double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, double* z, lapack_int* ldz, double* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int* info);
-void LAPACK_chbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab,
-                   lapack_int* ldab, lapack_complex_float* q, lapack_int* ldq, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_zhbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab,
-                   lapack_int* ldab, lapack_complex_double* q, lapack_int* ldq, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_sstev(char* jobz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
-                  lapack_int* info);
-void LAPACK_dstev(char* jobz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
-                  lapack_int* info);
-void LAPACK_sstevd(char* jobz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_dstevd(char* jobz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_sstevx(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz, float* work,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_dstevx(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz, double* work,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_sstevr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_dstevr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
-                   lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_sgees(char* jobvs, char* sort, LAPACK_S_SELECT2 select, lapack_int* n, float* a, lapack_int* lda,
-                  lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int* ldvs, float* work, lapack_int* lwork,
-                  lapack_logical* bwork, lapack_int* info);
-void LAPACK_dgees(char* jobvs, char* sort, LAPACK_D_SELECT2 select, lapack_int* n, double* a, lapack_int* lda,
-                  lapack_int* sdim, double* wr, double* wi, double* vs, lapack_int* ldvs, double* work,
-                  lapack_int* lwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_cgees(char* jobvs, char* sort, LAPACK_C_SELECT1 select, lapack_int* n, lapack_complex_float* a,
-                  lapack_int* lda, lapack_int* sdim, lapack_complex_float* w, lapack_complex_float* vs,
-                  lapack_int* ldvs, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_logical* bwork,
-                  lapack_int* info);
-void LAPACK_zgees(char* jobvs, char* sort, LAPACK_Z_SELECT1 select, lapack_int* n, lapack_complex_double* a,
-                  lapack_int* lda, lapack_int* sdim, lapack_complex_double* w, lapack_complex_double* vs,
-                  lapack_int* ldvs, lapack_complex_double* work, lapack_int* lwork, double* rwork,
-                  lapack_logical* bwork, lapack_int* info);
-void LAPACK_sgeesx(char* jobvs, char* sort, LAPACK_S_SELECT2 select, char* sense, lapack_int* n, float* a,
-                   lapack_int* lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int* ldvs, float* rconde,
-                   float* rcondv, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_logical* bwork, lapack_int* info);
-void LAPACK_dgeesx(char* jobvs, char* sort, LAPACK_D_SELECT2 select, char* sense, lapack_int* n, double* a,
-                   lapack_int* lda, lapack_int* sdim, double* wr, double* wi, double* vs, lapack_int* ldvs,
-                   double* rconde, double* rcondv, double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_cgeesx(char* jobvs, char* sort, LAPACK_C_SELECT1 select, char* sense, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, lapack_int* sdim, lapack_complex_float* w,
-                   lapack_complex_float* vs, lapack_int* ldvs, float* rconde, float* rcondv, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_zgeesx(char* jobvs, char* sort, LAPACK_Z_SELECT1 select, char* sense, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, lapack_int* sdim, lapack_complex_double* w,
-                   lapack_complex_double* vs, lapack_int* ldvs, double* rconde, double* rcondv,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_logical* bwork,
-                   lapack_int* info);
-void LAPACK_sgeev(char* jobvl, char* jobvr, lapack_int* n, float* a, lapack_int* lda, float* wr, float* wi, float* vl,
-                  lapack_int* ldvl, float* vr, lapack_int* ldvr, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dgeev(char* jobvl, char* jobvr, lapack_int* n, double* a, lapack_int* lda, double* wr, double* wi,
-                  double* vl, lapack_int* ldvl, double* vr, lapack_int* ldvr, double* work, lapack_int* lwork,
-                  lapack_int* info);
-void LAPACK_cgeev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                  lapack_complex_float* w, lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr,
-                  lapack_int* ldvr, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
-void LAPACK_zgeev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                  lapack_complex_double* w, lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr,
-                  lapack_int* ldvr, lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
-void LAPACK_sgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, float* a, lapack_int* lda,
-                   float* wr, float* wi, float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr, lapack_int* ilo,
-                   lapack_int* ihi, float* scale, float* abnrm, float* rconde, float* rcondv, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_dgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, double* a, lapack_int* lda,
-                   double* wr, double* wi, double* vl, lapack_int* ldvl, double* vr, lapack_int* ldvr, lapack_int* ilo,
-                   lapack_int* ihi, double* scale, double* abnrm, double* rconde, double* rcondv, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_cgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* w, lapack_complex_float* vl, lapack_int* ldvl,
-                   lapack_complex_float* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, float* scale,
-                   float* abnrm, float* rconde, float* rcondv, lapack_complex_float* work, lapack_int* lwork,
-                   float* rwork, lapack_int* info);
-void LAPACK_zgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* w, lapack_complex_double* vl, lapack_int* ldvl,
-                   lapack_complex_double* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, double* scale,
-                   double* abnrm, double* rconde, double* rcondv, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* info);
-void LAPACK_sgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* s, float* u,
-                   lapack_int* ldu, float* vt, lapack_int* ldvt, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* s,
-                   double* u, lapack_int* ldu, double* vt, lapack_int* ldvt, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_cgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   float* s, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* vt, lapack_int* ldvt,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
-void LAPACK_zgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   double* s, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* vt, lapack_int* ldvt,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
-void LAPACK_sgesdd(char* jobz, lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* s, float* u,
-                   lapack_int* ldu, float* vt, lapack_int* ldvt, float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_dgesdd(char* jobz, lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* s, double* u,
-                   lapack_int* ldu, double* vt, lapack_int* ldvt, double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* info);
-void LAPACK_cgesdd(char* jobz, lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* s,
-                   lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* vt, lapack_int* ldvt,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_zgesdd(char* jobz, lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* s,
-                   lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* vt, lapack_int* ldvt,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_dgejsv(char* joba, char* jobu, char* jobv, char* jobr, char* jobt, char* jobp, lapack_int* m, lapack_int* n,
-                   double* a, lapack_int* lda, double* sva, double* u, lapack_int* ldu, double* v, lapack_int* ldv,
-                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_sgejsv(char* joba, char* jobu, char* jobv, char* jobr, char* jobt, char* jobp, lapack_int* m, lapack_int* n,
-                   float* a, lapack_int* lda, float* sva, float* u, lapack_int* ldu, float* v, lapack_int* ldv,
-                   float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_dgesvj(char* joba, char* jobu, char* jobv, lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                   double* sva, lapack_int* mv, double* v, lapack_int* ldv, double* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_sgesvj(char* joba, char* jobu, char* jobv, lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                   float* sva, lapack_int* mv, float* v, lapack_int* ldv, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_sggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
-                   lapack_int* l, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* alpha, float* beta,
-                   float* u, lapack_int* ldu, float* v, lapack_int* ldv, float* q, lapack_int* ldq, float* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_dggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
-                   lapack_int* l, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* alpha, double* beta,
-                   double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q, lapack_int* ldq, double* work,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_cggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
-                   lapack_int* l, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                   float* alpha, float* beta, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* v,
-                   lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* work, float* rwork,
-                   lapack_int* iwork, lapack_int* info);
-void LAPACK_zggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
-                   lapack_int* l, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                   double* alpha, double* beta, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* v,
-                   lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* work,
-                   double* rwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_ssygv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* b,
-                  lapack_int* ldb, float* w, float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dsygv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* b,
-                  lapack_int* ldb, double* w, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_chegv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                  lapack_complex_float* b, lapack_int* ldb, float* w, lapack_complex_float* work, lapack_int* lwork,
-                  float* rwork, lapack_int* info);
-void LAPACK_zhegv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                  lapack_complex_double* b, lapack_int* ldb, double* w, lapack_complex_double* work, lapack_int* lwork,
-                  double* rwork, lapack_int* info);
-void LAPACK_ssygvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* b,
-                   lapack_int* ldb, float* w, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_dsygvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, double* w, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_chegvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* w, lapack_complex_float* work, lapack_int* lwork,
-                   float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_zhegvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* w, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_ssygvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                   float* b, lapack_int* ldb, float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, float* z, lapack_int* ldz, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_dsygvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                   double* b, lapack_int* ldb, double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, double* z, lapack_int* ldz, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_chegvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* ifail,
-                   lapack_int* info);
-void LAPACK_zhegvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* iwork, lapack_int* ifail,
-                   lapack_int* info);
-void LAPACK_sspgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* ap, float* bp, float* w, float* z,
-                  lapack_int* ldz, float* work, lapack_int* info);
-void LAPACK_dspgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* ap, double* bp, double* w,
-                  double* z, lapack_int* ldz, double* work, lapack_int* info);
-void LAPACK_chpgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap,
-                  lapack_complex_float* bp, float* w, lapack_complex_float* z, lapack_int* ldz,
-                  lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zhpgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap,
-                  lapack_complex_double* bp, double* w, lapack_complex_double* z, lapack_int* ldz,
-                  lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_sspgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* ap, float* bp, float* w, float* z,
-                   lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_dspgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* ap, double* bp, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int* info);
-void LAPACK_chpgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap,
-                   lapack_complex_float* bp, float* w, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_zhpgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap,
-                   lapack_complex_double* bp, double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int* info);
-void LAPACK_sspgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, float* ap, float* bp,
-                   float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                   float* z, lapack_int* ldz, float* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_dspgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, double* ap, double* bp,
-                   double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_chpgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* ap,
-                   lapack_complex_float* bp, float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work,
-                   float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_zhpgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* ap,
-                   lapack_complex_double* bp, double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work,
-                   double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_ssbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
-                  float* bb, lapack_int* ldbb, float* w, float* z, lapack_int* ldz, float* work, lapack_int* info);
-void LAPACK_dsbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab, lapack_int* ldab,
-                  double* bb, lapack_int* ldbb, double* w, double* z, lapack_int* ldz, double* work, lapack_int* info);
-void LAPACK_chbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
-                  lapack_int* ldab, lapack_complex_float* bb, lapack_int* ldbb, float* w, lapack_complex_float* z,
-                  lapack_int* ldz, lapack_complex_float* work, float* rwork, lapack_int* info);
-void LAPACK_zhbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
-                  lapack_int* ldab, lapack_complex_double* bb, lapack_int* ldbb, double* w, lapack_complex_double* z,
-                  lapack_int* ldz, lapack_complex_double* work, double* rwork, lapack_int* info);
-void LAPACK_ssbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
-                   float* bb, lapack_int* ldbb, float* w, float* z, lapack_int* ldz, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_dsbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab, lapack_int* ldab,
-                   double* bb, lapack_int* ldbb, double* w, double* z, lapack_int* ldz, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_chbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
-                   lapack_int* ldab, lapack_complex_float* bb, lapack_int* ldbb, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_zhbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
-                   lapack_int* ldab, lapack_complex_double* bb, lapack_int* ldbb, double* w, lapack_complex_double* z,
-                   lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
-void LAPACK_ssbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab,
-                   lapack_int* ldab, float* bb, lapack_int* ldbb, float* q, lapack_int* ldq, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_dsbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab,
-                   lapack_int* ldab, double* bb, lapack_int* ldbb, double* q, lapack_int* ldq, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_chbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb,
-                   lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* bb, lapack_int* ldbb,
-                   lapack_complex_float* q, lapack_int* ldq, float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                   float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_zhbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb,
-                   lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* bb, lapack_int* ldbb,
-                   lapack_complex_double* q, lapack_int* ldq, double* vl, double* vu, lapack_int* il, lapack_int* iu,
-                   double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
-void LAPACK_sgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_S_SELECT3 selctg, lapack_int* n, float* a,
-                  lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim, float* alphar, float* alphai,
-                  float* beta, float* vsl, lapack_int* ldvsl, float* vsr, lapack_int* ldvsr, float* work,
-                  lapack_int* lwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_dgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
-                  lapack_int* lda, double* b, lapack_int* ldb, lapack_int* sdim, double* alphar, double* alphai,
-                  double* beta, double* vsl, lapack_int* ldvsl, double* vsr, lapack_int* ldvsr, double* work,
-                  lapack_int* lwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_cgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_C_SELECT2 selctg, lapack_int* n,
-                  lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
-                  lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vsl, lapack_int* ldvsl,
-                  lapack_complex_float* vsr, lapack_int* ldvsr, lapack_complex_float* work, lapack_int* lwork,
-                  float* rwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_zgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_Z_SELECT2 selctg, lapack_int* n,
-                  lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                  lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
-                  lapack_complex_double* vsl, lapack_int* ldvsl, lapack_complex_double* vsr, lapack_int* ldvsr,
-                  lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_logical* bwork,
-                  lapack_int* info);
-void LAPACK_sggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_S_SELECT3 selctg, char* sense, lapack_int* n,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim, float* alphar, float* alphai,
-                   float* beta, float* vsl, lapack_int* ldvsl, float* vsr, lapack_int* ldvsr, float* rconde,
-                   float* rcondv, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_logical* bwork, lapack_int* info);
-void LAPACK_dggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_D_SELECT3 selctg, char* sense, lapack_int* n,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb, lapack_int* sdim, double* alphar,
-                   double* alphai, double* beta, double* vsl, lapack_int* ldvsl, double* vsr, lapack_int* ldvsr,
-                   double* rconde, double* rcondv, double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_cggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_C_SELECT2 selctg, char* sense, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vsl,
-                   lapack_int* ldvsl, lapack_complex_float* vsr, lapack_int* ldvsr, float* rconde, float* rcondv,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_logical* bwork, lapack_int* info);
-void LAPACK_zggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_Z_SELECT2 selctg, char* sense, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vsl, lapack_int* ldvsl, lapack_complex_double* vsr, lapack_int* ldvsr,
-                   double* rconde, double* rcondv, lapack_complex_double* work, lapack_int* lwork, double* rwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_sggev(char* jobvl, char* jobvr, lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                  float* alphar, float* alphai, float* beta, float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr,
-                  float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dggev(char* jobvl, char* jobvr, lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                  double* alphar, double* alphai, double* beta, double* vl, lapack_int* ldvl, double* vr,
-                  lapack_int* ldvr, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cggev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                  lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* alpha, lapack_complex_float* beta,
-                  lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr,
-                  lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
-void LAPACK_zggev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                  lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* alpha, lapack_complex_double* beta,
-                  lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr, lapack_int* ldvr,
-                  lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
-void LAPACK_sggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, float* a, lapack_int* lda,
-                   float* b, lapack_int* ldb, float* alphar, float* alphai, float* beta, float* vl, lapack_int* ldvl,
-                   float* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale,
-                   float* abnrm, float* bbnrm, float* rconde, float* rcondv, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_dggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, double* a, lapack_int* lda,
-                   double* b, lapack_int* ldb, double* alphar, double* alphai, double* beta, double* vl,
-                   lapack_int* ldvl, double* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, double* lscale,
-                   double* rscale, double* abnrm, double* bbnrm, double* rconde, double* rcondv, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_cggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* alpha,
-                   lapack_complex_float* beta, lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr,
-                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale, float* abnrm,
-                   float* bbnrm, float* rconde, float* rcondv, lapack_complex_float* work, lapack_int* lwork,
-                   float* rwork, lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_zggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* alpha,
-                   lapack_complex_double* beta, lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr,
-                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale, double* abnrm,
-                   double* bbnrm, double* rconde, double* rcondv, lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
-void LAPACK_dsfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, double* alpha, const double* a,
-                  lapack_int* lda, double* beta, double* c);
-void LAPACK_ssfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, float* alpha, const float* a,
-                  lapack_int* lda, float* beta, float* c);
-void LAPACK_zhfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, double* alpha,
-                  const lapack_complex_double* a, lapack_int* lda, double* beta, lapack_complex_double* c);
-void LAPACK_chfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, float* alpha,
-                  const lapack_complex_float* a, lapack_int* lda, float* beta, lapack_complex_float* c);
-void LAPACK_dtfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
-                  double* alpha, const double* a, double* b, lapack_int* ldb);
-void LAPACK_stfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
-                  float* alpha, const float* a, float* b, lapack_int* ldb);
-void LAPACK_ztfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
-                  lapack_complex_double* alpha, const lapack_complex_double* a, lapack_complex_double* b,
-                  lapack_int* ldb);
-void LAPACK_ctfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
-                  lapack_complex_float* alpha, const lapack_complex_float* a, lapack_complex_float* b, lapack_int* ldb);
-void LAPACK_dtfttp(char* transr, char* uplo, lapack_int* n, const double* arf, double* ap, lapack_int* info);
-void LAPACK_stfttp(char* transr, char* uplo, lapack_int* n, const float* arf, float* ap, lapack_int* info);
-void LAPACK_ztfttp(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* arf, lapack_complex_double* ap,
-                   lapack_int* info);
-void LAPACK_ctfttp(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* arf, lapack_complex_float* ap,
-                   lapack_int* info);
-void LAPACK_dtfttr(char* transr, char* uplo, lapack_int* n, const double* arf, double* a, lapack_int* lda,
-                   lapack_int* info);
-void LAPACK_stfttr(char* transr, char* uplo, lapack_int* n, const float* arf, float* a, lapack_int* lda,
-                   lapack_int* info);
-void LAPACK_ztfttr(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* arf, lapack_complex_double* a,
-                   lapack_int* lda, lapack_int* info);
-void LAPACK_ctfttr(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* arf, lapack_complex_float* a,
-                   lapack_int* lda, lapack_int* info);
-void LAPACK_dtpttf(char* transr, char* uplo, lapack_int* n, const double* ap, double* arf, lapack_int* info);
-void LAPACK_stpttf(char* transr, char* uplo, lapack_int* n, const float* ap, float* arf, lapack_int* info);
-void LAPACK_ztpttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* ap, lapack_complex_double* arf,
-                   lapack_int* info);
-void LAPACK_ctpttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* ap, lapack_complex_float* arf,
-                   lapack_int* info);
-void LAPACK_dtpttr(char* uplo, lapack_int* n, const double* ap, double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_stpttr(char* uplo, lapack_int* n, const float* ap, float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_ztpttr(char* uplo, lapack_int* n, const lapack_complex_double* ap, lapack_complex_double* a,
-                   lapack_int* lda, lapack_int* info);
-void LAPACK_ctpttr(char* uplo, lapack_int* n, const lapack_complex_float* ap, lapack_complex_float* a, lapack_int* lda,
-                   lapack_int* info);
-void LAPACK_dtrttf(char* transr, char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* arf,
-                   lapack_int* info);
-void LAPACK_strttf(char* transr, char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* arf,
-                   lapack_int* info);
-void LAPACK_ztrttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* arf, lapack_int* info);
-void LAPACK_ctrttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* arf, lapack_int* info);
-void LAPACK_dtrttp(char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* ap, lapack_int* info);
-void LAPACK_strttp(char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* ap, lapack_int* info);
-void LAPACK_ztrttp(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* ap, lapack_int* info);
-void LAPACK_ctrttp(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, lapack_complex_float* ap,
-                   lapack_int* info);
-void LAPACK_sgeqrfp(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
-                    lapack_int* info);
-void LAPACK_dgeqrfp(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                    lapack_int* lwork, lapack_int* info);
-void LAPACK_cgeqrfp(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zgeqrfp(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_clacgv(lapack_int* n, lapack_complex_float* x, lapack_int* incx);
-void LAPACK_zlacgv(lapack_int* n, lapack_complex_double* x, lapack_int* incx);
-void LAPACK_slarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, float* x);
-void LAPACK_dlarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, double* x);
-void LAPACK_clarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, lapack_complex_float* x);
-void LAPACK_zlarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, lapack_complex_double* x);
-void LAPACK_sgeqr2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* info);
-void LAPACK_dgeqr2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                   lapack_int* info);
-void LAPACK_cgeqr2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_zgeqr2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                   lapack_complex_double* work, lapack_int* info);
-void LAPACK_slacpy(char* uplo, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* b,
-                   lapack_int* ldb);
-void LAPACK_dlacpy(char* uplo, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb);
-void LAPACK_clacpy(char* uplo, lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb);
-void LAPACK_zlacpy(char* uplo, lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb);
-void LAPACK_sgetf2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
-void LAPACK_dgetf2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
-void LAPACK_cgetf2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_int* info);
-void LAPACK_zgetf2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_int* info);
-void LAPACK_slaswp(lapack_int* n, float* a, lapack_int* lda, lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
-                   lapack_int* incx);
-void LAPACK_dlaswp(lapack_int* n, double* a, lapack_int* lda, lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
-                   lapack_int* incx);
-void LAPACK_claswp(lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* k1, lapack_int* k2,
-                   const lapack_int* ipiv, lapack_int* incx);
-void LAPACK_zlaswp(lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* k1, lapack_int* k2,
-                   const lapack_int* ipiv, lapack_int* incx);
-float LAPACK_slange(char* norm, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* work);
-double LAPACK_dlange(char* norm, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* work);
-float LAPACK_clange(char* norm, lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
-                    float* work);
-double LAPACK_zlange(char* norm, lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                     double* work);
-float LAPACK_clanhe(char* norm, char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* work);
-double LAPACK_zlanhe(char* norm, char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                     double* work);
-float LAPACK_slansy(char* norm, char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* work);
-double LAPACK_dlansy(char* norm, char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* work);
-float LAPACK_clansy(char* norm, char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* work);
-double LAPACK_zlansy(char* norm, char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                     double* work);
-float LAPACK_slantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda,
-                    float* work);
-double LAPACK_dlantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda,
-                     double* work);
-float LAPACK_clantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, float* work);
-double LAPACK_zlantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const lapack_complex_double* a,
-                     lapack_int* lda, double* work);
-float LAPACK_slamch(char* cmach);
-double LAPACK_dlamch(char* cmach);
-void LAPACK_sgelq2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* info);
-void LAPACK_dgelq2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
-                   lapack_int* info);
-void LAPACK_cgelq2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_zgelq2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
-                   lapack_complex_double* work, lapack_int* info);
-void LAPACK_slarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* c, lapack_int* ldc,
-                   float* work, lapack_int* ldwork);
-void LAPACK_dlarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* c, lapack_int* ldc,
-                   double* work, lapack_int* ldwork);
-void LAPACK_clarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t, lapack_int* ldt,
-                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* ldwork);
-void LAPACK_zlarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t, lapack_int* ldt,
-                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* ldwork);
-void LAPACK_slarfg(lapack_int* n, float* alpha, float* x, lapack_int* incx, float* tau);
-void LAPACK_dlarfg(lapack_int* n, double* alpha, double* x, lapack_int* incx, double* tau);
-void LAPACK_clarfg(lapack_int* n, lapack_complex_float* alpha, lapack_complex_float* x, lapack_int* incx,
-                   lapack_complex_float* tau);
-void LAPACK_zlarfg(lapack_int* n, lapack_complex_double* alpha, lapack_complex_double* x, lapack_int* incx,
-                   lapack_complex_double* tau);
-void LAPACK_slarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const float* v, lapack_int* ldv,
-                   const float* tau, float* t, lapack_int* ldt);
-void LAPACK_dlarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const double* v, lapack_int* ldv,
-                   const double* tau, double* t, lapack_int* ldt);
-void LAPACK_clarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const lapack_complex_float* v,
-                   lapack_int* ldv, const lapack_complex_float* tau, lapack_complex_float* t, lapack_int* ldt);
-void LAPACK_zlarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const lapack_complex_double* v,
-                   lapack_int* ldv, const lapack_complex_double* tau, lapack_complex_double* t, lapack_int* ldt);
-void LAPACK_slarfx(char* side, lapack_int* m, lapack_int* n, const float* v, float* tau, float* c, lapack_int* ldc,
-                   float* work);
-void LAPACK_dlarfx(char* side, lapack_int* m, lapack_int* n, const double* v, double* tau, double* c, lapack_int* ldc,
-                   double* work);
-void LAPACK_clarfx(char* side, lapack_int* m, lapack_int* n, const lapack_complex_float* v, lapack_complex_float* tau,
-                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work);
-void LAPACK_zlarfx(char* side, lapack_int* m, lapack_int* n, const lapack_complex_double* v, lapack_complex_double* tau,
-                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work);
-void LAPACK_slatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, float* d, lapack_int* mode,
-                   float* cond, float* dmax, lapack_int* kl, lapack_int* ku, char* pack, float* a, lapack_int* lda,
-                   float* work, lapack_int* info);
-void LAPACK_dlatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, double* d, lapack_int* mode,
-                   double* cond, double* dmax, lapack_int* kl, lapack_int* ku, char* pack, double* a, lapack_int* lda,
-                   double* work, lapack_int* info);
-void LAPACK_clatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, float* d, lapack_int* mode,
-                   float* cond, float* dmax, lapack_int* kl, lapack_int* ku, char* pack, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zlatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, double* d, lapack_int* mode,
-                   double* cond, double* dmax, lapack_int* kl, lapack_int* ku, char* pack, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* work, lapack_int* info);
-void LAPACK_slag2d(lapack_int* m, lapack_int* n, const float* sa, lapack_int* ldsa, double* a, lapack_int* lda,
-                   lapack_int* info);
-void LAPACK_dlag2s(lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, float* sa, lapack_int* ldsa,
-                   lapack_int* info);
-void LAPACK_clag2z(lapack_int* m, lapack_int* n, const lapack_complex_float* sa, lapack_int* ldsa,
-                   lapack_complex_double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_zlag2c(lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_float* sa, lapack_int* ldsa, lapack_int* info);
-void LAPACK_slauum(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_dlauum(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_clauum(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
-void LAPACK_zlauum(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
-void LAPACK_slagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* d, float* a,
-                   lapack_int* lda, lapack_int* iseed, float* work, lapack_int* info);
-void LAPACK_dlagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* d, double* a,
-                   lapack_int* lda, lapack_int* iseed, double* work, lapack_int* info);
-void LAPACK_clagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* d,
-                   lapack_complex_float* a, lapack_int* lda, lapack_int* iseed, lapack_complex_float* work,
-                   lapack_int* info);
-void LAPACK_zlagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* d,
-                   lapack_complex_double* a, lapack_int* lda, lapack_int* iseed, lapack_complex_double* work,
-                   lapack_int* info);
-void LAPACK_slaset(char* uplo, lapack_int* m, lapack_int* n, float* alpha, float* beta, float* a, lapack_int* lda);
-void LAPACK_dlaset(char* uplo, lapack_int* m, lapack_int* n, double* alpha, double* beta, double* a, lapack_int* lda);
-void LAPACK_claset(char* uplo, lapack_int* m, lapack_int* n, lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* a, lapack_int* lda);
-void LAPACK_zlaset(char* uplo, lapack_int* m, lapack_int* n, lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* a, lapack_int* lda);
-void LAPACK_slasrt(char* id, lapack_int* n, float* d, lapack_int* info);
-void LAPACK_dlasrt(char* id, lapack_int* n, double* d, lapack_int* info);
-void LAPACK_claghe(lapack_int* n, lapack_int* k, const float* d, lapack_complex_float* a, lapack_int* lda,
-                   lapack_int* iseed, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zlaghe(lapack_int* n, lapack_int* k, const double* d, lapack_complex_double* a, lapack_int* lda,
-                   lapack_int* iseed, lapack_complex_double* work, lapack_int* info);
-void LAPACK_slagsy(lapack_int* n, lapack_int* k, const float* d, float* a, lapack_int* lda, lapack_int* iseed,
-                   float* work, lapack_int* info);
-void LAPACK_dlagsy(lapack_int* n, lapack_int* k, const double* d, double* a, lapack_int* lda, lapack_int* iseed,
-                   double* work, lapack_int* info);
-void LAPACK_clagsy(lapack_int* n, lapack_int* k, const float* d, lapack_complex_float* a, lapack_int* lda,
-                   lapack_int* iseed, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zlagsy(lapack_int* n, lapack_int* k, const double* d, lapack_complex_double* a, lapack_int* lda,
-                   lapack_int* iseed, lapack_complex_double* work, lapack_int* info);
-void LAPACK_slapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, float* x, lapack_int* ldx, lapack_int* k);
-void LAPACK_dlapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, double* x, lapack_int* ldx, lapack_int* k);
-void LAPACK_clapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, lapack_complex_float* x, lapack_int* ldx,
-                   lapack_int* k);
-void LAPACK_zlapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, lapack_complex_double* x, lapack_int* ldx,
-                   lapack_int* k);
-float LAPACK_slapy2(float* x, float* y);
-double LAPACK_dlapy2(double* x, double* y);
-float LAPACK_slapy3(float* x, float* y, float* z);
-double LAPACK_dlapy3(double* x, double* y, double* z);
-void LAPACK_slartgp(float* f, float* g, float* cs, float* sn, float* r);
-void LAPACK_dlartgp(double* f, double* g, double* cs, double* sn, double* r);
-void LAPACK_slartgs(float* x, float* y, float* sigma, float* cs, float* sn);
-void LAPACK_dlartgs(double* x, double* y, double* sigma, double* cs, double* sn);
-// LAPACK 3.3.0
-void LAPACK_cbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
-                   lapack_int* q, float* theta, float* phi, lapack_complex_float* u1, lapack_int* ldu1,
-                   lapack_complex_float* u2, lapack_int* ldu2, lapack_complex_float* v1t, lapack_int* ldv1t,
-                   lapack_complex_float* v2t, lapack_int* ldv2t, float* b11d, float* b11e, float* b12d, float* b12e,
-                   float* b21d, float* b21e, float* b22d, float* b22e, float* rwork, lapack_int* lrwork,
-                   lapack_int* info);
-void LAPACK_cheswapr(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* i1, lapack_int* i2);
-void LAPACK_chetri2(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_chetri2x(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_float* work, lapack_int* nb, lapack_int* info);
-void LAPACK_chetrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
-                    lapack_int* info);
-void LAPACK_csyconv(char* uplo, char* way, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* work, lapack_int* info);
-void LAPACK_csyswapr(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* i1, lapack_int* i2);
-void LAPACK_csytri2(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_csytri2x(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_float* work, lapack_int* nb, lapack_int* info);
-void LAPACK_csytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
-                    lapack_int* info);
-void LAPACK_cunbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, lapack_complex_float* x11,
-                   lapack_int* ldx11, lapack_complex_float* x12, lapack_int* ldx12, lapack_complex_float* x21,
-                   lapack_int* ldx21, lapack_complex_float* x22, lapack_int* ldx22, float* theta, float* phi,
-                   lapack_complex_float* taup1, lapack_complex_float* taup2, lapack_complex_float* tauq1,
-                   lapack_complex_float* tauq2, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_cuncsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
-                   lapack_int* p, lapack_int* q, lapack_complex_float* x11, lapack_int* ldx11,
-                   lapack_complex_float* x12, lapack_int* ldx12, lapack_complex_float* x21, lapack_int* ldx21,
-                   lapack_complex_float* x22, lapack_int* ldx22, float* theta, lapack_complex_float* u1,
-                   lapack_int* ldu1, lapack_complex_float* u2, lapack_int* ldu2, lapack_complex_float* v1t,
-                   lapack_int* ldv1t, lapack_complex_float* v2t, lapack_int* ldv2t, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_dbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
-                   lapack_int* q, double* theta, double* phi, double* u1, lapack_int* ldu1, double* u2,
-                   lapack_int* ldu2, double* v1t, lapack_int* ldv1t, double* v2t, lapack_int* ldv2t, double* b11d,
-                   double* b11e, double* b12d, double* b12e, double* b21d, double* b21e, double* b22d, double* b22e,
-                   double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, double* x11,
-                   lapack_int* ldx11, double* x12, lapack_int* ldx12, double* x21, lapack_int* ldx21, double* x22,
-                   lapack_int* ldx22, double* theta, double* phi, double* taup1, double* taup2, double* tauq1,
-                   double* tauq2, double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dorcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
-                   lapack_int* p, lapack_int* q, double* x11, lapack_int* ldx11, double* x12, lapack_int* ldx12,
-                   double* x21, lapack_int* ldx21, double* x22, lapack_int* ldx22, double* theta, double* u1,
-                   lapack_int* ldu1, double* u2, lapack_int* ldu2, double* v1t, lapack_int* ldv1t, double* v2t,
-                   lapack_int* ldv2t, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_dsyconv(char* uplo, char* way, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv,
-                    double* work, lapack_int* info);
-void LAPACK_dsyswapr(char* uplo, lapack_int* n, double* a, lapack_int* i1, lapack_int* i2);
-void LAPACK_dsytri2(char* uplo, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_dsytri2x(char* uplo, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv, double* work,
-                     lapack_int* nb, lapack_int* info);
-void LAPACK_dsytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
-                    const lapack_int* ipiv, double* b, lapack_int* ldb, double* work, lapack_int* info);
-void LAPACK_sbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
-                   lapack_int* q, float* theta, float* phi, float* u1, lapack_int* ldu1, float* u2, lapack_int* ldu2,
-                   float* v1t, lapack_int* ldv1t, float* v2t, lapack_int* ldv2t, float* b11d, float* b11e, float* b12d,
-                   float* b12e, float* b21d, float* b21e, float* b22d, float* b22e, float* work, lapack_int* lwork,
-                   lapack_int* info);
-void LAPACK_sorbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, float* x11, lapack_int* ldx11,
-                   float* x12, lapack_int* ldx12, float* x21, lapack_int* ldx21, float* x22, lapack_int* ldx22,
-                   float* theta, float* phi, float* taup1, float* taup2, float* tauq1, float* tauq2, float* work,
-                   lapack_int* lwork, lapack_int* info);
-void LAPACK_sorcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
-                   lapack_int* p, lapack_int* q, float* x11, lapack_int* ldx11, float* x12, lapack_int* ldx12,
-                   float* x21, lapack_int* ldx21, float* x22, lapack_int* ldx22, float* theta, float* u1,
-                   lapack_int* ldu1, float* u2, lapack_int* ldu2, float* v1t, lapack_int* ldv1t, float* v2t,
-                   lapack_int* ldv2t, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
-void LAPACK_ssyconv(char* uplo, char* way, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv,
-                    float* work, lapack_int* info);
-void LAPACK_ssyswapr(char* uplo, lapack_int* n, float* a, lapack_int* i1, lapack_int* i2);
-void LAPACK_ssytri2(char* uplo, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_ssytri2x(char* uplo, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv, float* work,
-                     lapack_int* nb, lapack_int* info);
-void LAPACK_ssytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
-                    const lapack_int* ipiv, float* b, lapack_int* ldb, float* work, lapack_int* info);
-void LAPACK_zbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
-                   lapack_int* q, double* theta, double* phi, lapack_complex_double* u1, lapack_int* ldu1,
-                   lapack_complex_double* u2, lapack_int* ldu2, lapack_complex_double* v1t, lapack_int* ldv1t,
-                   lapack_complex_double* v2t, lapack_int* ldv2t, double* b11d, double* b11e, double* b12d,
-                   double* b12e, double* b21d, double* b21e, double* b22d, double* b22e, double* rwork,
-                   lapack_int* lrwork, lapack_int* info);
-void LAPACK_zheswapr(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* i1, lapack_int* i2);
-void LAPACK_zhetri2(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zhetri2x(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_double* work, lapack_int* nb, lapack_int* info);
-void LAPACK_zhetrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
-                    lapack_int* info);
-void LAPACK_zsyconv(char* uplo, char* way, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* work, lapack_int* info);
-void LAPACK_zsyswapr(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* i1, lapack_int* i2);
-void LAPACK_zsytri2(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zsytri2x(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_double* work, lapack_int* nb, lapack_int* info);
-void LAPACK_zsytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
-                    lapack_int* info);
-void LAPACK_zunbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, lapack_complex_double* x11,
-                   lapack_int* ldx11, lapack_complex_double* x12, lapack_int* ldx12, lapack_complex_double* x21,
-                   lapack_int* ldx21, lapack_complex_double* x22, lapack_int* ldx22, double* theta, double* phi,
-                   lapack_complex_double* taup1, lapack_complex_double* taup2, lapack_complex_double* tauq1,
-                   lapack_complex_double* tauq2, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
-void LAPACK_zuncsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
-                   lapack_int* p, lapack_int* q, lapack_complex_double* x11, lapack_int* ldx11,
-                   lapack_complex_double* x12, lapack_int* ldx12, lapack_complex_double* x21, lapack_int* ldx21,
-                   lapack_complex_double* x22, lapack_int* ldx22, double* theta, lapack_complex_double* u1,
-                   lapack_int* ldu1, lapack_complex_double* u2, lapack_int* ldu2, lapack_complex_double* v1t,
-                   lapack_int* ldv1t, lapack_complex_double* v2t, lapack_int* ldv2t, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* info);
-// LAPACK 3.4.0
-void LAPACK_sgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
-                    const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* c, lapack_int* ldc,
-                    float* work, lapack_int* info);
-void LAPACK_dgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
-                    const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* c, lapack_int* ldc,
-                    double* work, lapack_int* info);
-void LAPACK_cgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
-                    const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
-                    const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* info);
-void LAPACK_sgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, float* a, lapack_int* lda, float* t, lapack_int* ldt,
-                   float* work, lapack_int* info);
-void LAPACK_dgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, double* a, lapack_int* lda, double* t, lapack_int* ldt,
-                   double* work, lapack_int* info);
-void LAPACK_cgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* t, lapack_int* ldt, lapack_complex_float* work, lapack_int* info);
-void LAPACK_zgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* t, lapack_int* ldt, lapack_complex_double* work, lapack_int* info);
-void LAPACK_sgeqrt2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* t, lapack_int* ldt,
-                    lapack_int* info);
-void LAPACK_dgeqrt2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* t, lapack_int* ldt,
-                    lapack_int* info);
-void LAPACK_cgeqrt2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* t,
-                    lapack_int* ldt, lapack_int* info);
-void LAPACK_zgeqrt2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* t,
-                    lapack_int* ldt, lapack_int* info);
-void LAPACK_sgeqrt3(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* t, lapack_int* ldt,
-                    lapack_int* info);
-void LAPACK_dgeqrt3(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* t, lapack_int* ldt,
-                    lapack_int* info);
-void LAPACK_cgeqrt3(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* t,
-                    lapack_int* ldt, lapack_int* info);
-void LAPACK_zgeqrt3(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* t,
-                    lapack_int* ldt, lapack_int* info);
-void LAPACK_stpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
-                    const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* a, lapack_int* lda,
-                    float* b, lapack_int* ldb, float* work, lapack_int* info);
-void LAPACK_dtpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
-                    const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* a, lapack_int* lda,
-                    double* b, lapack_int* ldb, double* work, lapack_int* info);
-void LAPACK_ctpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
-                    const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* work, lapack_int* info);
-void LAPACK_ztpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
-                    const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* work, lapack_int* info);
-void LAPACK_dtpqrt(lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, double* t, lapack_int* ldt, double* work, lapack_int* info);
-void LAPACK_ctpqrt(lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* t, lapack_complex_float* b, lapack_int* ldb, lapack_int* ldt,
-                   lapack_complex_float* work, lapack_int* info);
-void LAPACK_ztpqrt(lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* t,
-                   lapack_int* ldt, lapack_complex_double* work, lapack_int* info);
-void LAPACK_stpqrt2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* t,
-                    lapack_int* ldt, lapack_int* info);
-void LAPACK_dtpqrt2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* t,
-                    lapack_int* ldt, lapack_int* info);
-void LAPACK_ctpqrt2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* t, lapack_int* ldt, lapack_int* info);
-void LAPACK_ztpqrt2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* t, lapack_int* ldt, lapack_int* info);
-void LAPACK_stprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   lapack_int* l, const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, const float* mywork, lapack_int* myldwork);
-void LAPACK_dtprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   lapack_int* l, const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, const double* mywork, lapack_int* myldwork);
-void LAPACK_ctprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   lapack_int* l, const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t,
-                   lapack_int* ldt, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                   const float* mywork, lapack_int* myldwork);
-void LAPACK_ztprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
-                   lapack_int* l, const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t,
-                   lapack_int* ldt, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b,
-                   lapack_int* ldb, const double* mywork, lapack_int* myldwork);
-// LAPACK 3.X.X
-void LAPACK_csyr(char* uplo, lapack_int* n, lapack_complex_float* alpha, const lapack_complex_float* x,
-                 lapack_int* incx, lapack_complex_float* a, lapack_int* lda);
-void LAPACK_zsyr(char* uplo, lapack_int* n, lapack_complex_double* alpha, const lapack_complex_double* x,
-                 lapack_int* incx, lapack_complex_double* a, lapack_int* lda);
+lapack_int LAPACKE_cheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                         float* w);
+lapack_int LAPACKE_zheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                         double* w);
 
 #ifdef __cplusplus
 }
diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.inc b/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
index c8c2434d7c9..ae240f13c4b 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
@@ -4,8 +4,8 @@
  * \sa MatrixBase::cwiseProduct
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product) operator*(
-    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)
+operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
   return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)(derived(), other.derived());
 }
 
@@ -14,7 +14,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Deriv
  * \sa MatrixBase::cwiseQuotient
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryOp<
     internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>, const Derived, const OtherDerived>
 operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
   return CwiseBinaryOp<internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>, const Derived,
@@ -29,7 +29,7 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
  * \sa max()
  */
 template <int NaNPropagation = PropagateFast, typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
 #ifdef EIGEN_PARSED_BY_DOXYGEN
     min
@@ -46,7 +46,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
  * \sa max()
  */
 template <int NaNPropagation = PropagateFast>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived,
                   const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
@@ -66,7 +66,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
  * \sa min()
  */
 template <int NaNPropagation = PropagateFast, typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
 #ifdef EIGEN_PARSED_BY_DOXYGEN
     max
@@ -83,7 +83,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
  * \sa min()
  */
 template <int NaNPropagation = PropagateFast>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived,
                   const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
@@ -105,7 +105,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference, absolute_difference)
  *
  * \sa absolute_difference()
  */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_absolute_difference_op<Scalar, Scalar>, const Derived,
                   const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
@@ -133,10 +133,10 @@ EIGEN_MAKE_CWISE_BINARY_OP(pow, pow)
  */
 EIGEN_MAKE_CWISE_BINARY_OP(atan2, atan2)
 
-// TODO code generating macros could be moved to Macros.h and could include generation of documentation
+// TODO: code generating macros could be moved to Macros.h and could include generation of documentation
 #define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR)                                                                     \
   template <typename OtherDerived>                                                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const                                                                        \
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const                                                              \
       CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_##COMPARATOR>,      \
                     const Derived, const OtherDerived>                                                               \
       OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {                                        \
@@ -149,24 +149,24 @@ EIGEN_MAKE_CWISE_BINARY_OP(atan2, atan2)
   typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_##COMPARATOR>,                         \
                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived>      \
       RCmp##COMPARATOR##ReturnType;                                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp##COMPARATOR##ReturnType OP(const Scalar &s) const {                \
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const Cmp##COMPARATOR##ReturnType OP(const Scalar &s) const {      \
     return this->OP(Derived::PlainObject::Constant(rows(), cols(), s));                                              \
   }                                                                                                                  \
-  EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp##COMPARATOR##ReturnType OP(                                \
+  EIGEN_DEVICE_FUNC constexpr friend EIGEN_STRONG_INLINE const RCmp##COMPARATOR##ReturnType OP(                      \
       const Scalar &s, const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived> &d) {                                         \
     return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d);                                              \
   }
 
 #define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR)                                                             \
   template <typename OtherDerived>                                                                                    \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const                                                                         \
+  EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const                                                               \
       CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>,      \
                     const OtherDerived, const Derived>                                                                \
       OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {                                         \
     return CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, \
                          const OtherDerived, const Derived>(other.derived(), derived());                              \
   }                                                                                                                   \
-  EIGEN_DEVICE_FUNC inline const RCmp##RCOMPARATOR##ReturnType OP(const Scalar &s) const {                            \
+  EIGEN_DEVICE_FUNC constexpr inline const RCmp##RCOMPARATOR##ReturnType OP(const Scalar &s) const {                  \
     return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this);                                             \
   }                                                                                                                   \
   friend inline const Cmp##RCOMPARATOR##ReturnType OP(const Scalar &s, const Derived &d) {                            \
@@ -301,26 +301,6 @@ friend inline const CwiseBinaryOp<internal::scalar_quotient_op<T, Scalar>, Const
     const T &s, const StorageBaseType &a);
 #endif
 
-// NOTE disabled until we agree on argument order
-#if 0
-/** \cpp11 \returns an expression of the coefficient-wise polygamma function.
-  *
-  * \specialfunctions_module
-  *
-  * It returns the \a n -th derivative of the digamma(psi) evaluated at \c *this.
-  *
-  * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
-  *
-  * \sa Eigen::polygamma()
-  */
-template<typename DerivedN>
-inline const CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>
-polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
-{
-  return CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>(n.derived(), this->derived());
-}
-#endif
-
 /** \returns an expression of the coefficient-wise zeta function.
  *
  * \specialfunctions_module
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
index 753aeb4fb77..4bc999e5868 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
@@ -49,7 +49,7 @@ typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar, true>, const Derived>
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs">Math functions</a>, abs2()
  */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const AbsReturnType abs() const { return AbsReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const AbsReturnType abs() const { return AbsReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise phase angle of \c *this
  *
@@ -58,9 +58,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const AbsReturnType abs() const { return A
  *
  * \sa abs()
  */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArgReturnType arg() const { return ArgReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const ArgReturnType arg() const { return ArgReturnType(derived()); }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CArgReturnType carg() const { return CArgReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CArgReturnType carg() const { return CArgReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise squared absolute value of \c *this
  *
@@ -69,7 +69,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CArgReturnType carg() const { return
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs2">Math functions</a>, abs(), square()
  */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Abs2ReturnType abs2() const { return Abs2ReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const Abs2ReturnType abs2() const { return Abs2ReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise exponential of *this.
  *
@@ -82,7 +82,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Abs2ReturnType abs2() const { return
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, exp2(), pow(), log(), sin(),
  * cos()
  */
-EIGEN_DEVICE_FUNC inline const ExpReturnType exp() const { return ExpReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const ExpReturnType exp() const { return ExpReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise exponential of *this.
  *
@@ -91,7 +91,7 @@ EIGEN_DEVICE_FUNC inline const ExpReturnType exp() const { return ExpReturnType(
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, exp(), pow(), log(), sin(),
  * cos()
  */
-EIGEN_DEVICE_FUNC inline const Exp2ReturnType exp2() const { return Exp2ReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const Exp2ReturnType exp2() const { return Exp2ReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise exponential of *this minus 1.
  *
@@ -100,7 +100,7 @@ EIGEN_DEVICE_FUNC inline const Exp2ReturnType exp2() const { return Exp2ReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_expm1">Math functions</a>, exp()
  */
-EIGEN_DEVICE_FUNC inline const Expm1ReturnType expm1() const { return Expm1ReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const Expm1ReturnType expm1() const { return Expm1ReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise logarithm of *this.
  *
@@ -112,7 +112,7 @@ EIGEN_DEVICE_FUNC inline const Expm1ReturnType expm1() const { return Expm1Retur
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, log()
  */
-EIGEN_DEVICE_FUNC inline const LogReturnType log() const { return LogReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const LogReturnType log() const { return LogReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise logarithm of 1 plus \c *this.
  *
@@ -121,7 +121,7 @@ EIGEN_DEVICE_FUNC inline const LogReturnType log() const { return LogReturnType(
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log1p">Math functions</a>, log()
  */
-EIGEN_DEVICE_FUNC inline const Log1pReturnType log1p() const { return Log1pReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const Log1pReturnType log1p() const { return Log1pReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise base-10 logarithm of *this.
  *
@@ -132,14 +132,14 @@ EIGEN_DEVICE_FUNC inline const Log1pReturnType log1p() const { return Log1pRetur
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log10">Math functions</a>, log()
  */
-EIGEN_DEVICE_FUNC inline const Log10ReturnType log10() const { return Log10ReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const Log10ReturnType log10() const { return Log10ReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise base-2 logarithm of *this.
  *
  * This function computes the coefficient-wise base-2 logarithm.
  *
  */
-EIGEN_DEVICE_FUNC inline const Log2ReturnType log2() const { return Log2ReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const Log2ReturnType log2() const { return Log2ReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise square root of *this.
  *
@@ -151,7 +151,7 @@ EIGEN_DEVICE_FUNC inline const Log2ReturnType log2() const { return Log2ReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square(), cbrt()
  */
-EIGEN_DEVICE_FUNC inline const SqrtReturnType sqrt() const { return SqrtReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const SqrtReturnType sqrt() const { return SqrtReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise cube root of *this.
  *
@@ -162,7 +162,7 @@ EIGEN_DEVICE_FUNC inline const SqrtReturnType sqrt() const { return SqrtReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cbrt">Math functions</a>, sqrt(), pow(), square()
  */
-EIGEN_DEVICE_FUNC inline const CbrtReturnType cbrt() const { return CbrtReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CbrtReturnType cbrt() const { return CbrtReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise inverse square root of *this.
  *
@@ -173,7 +173,7 @@ EIGEN_DEVICE_FUNC inline const CbrtReturnType cbrt() const { return CbrtReturnTy
  *
  * \sa pow(), square()
  */
-EIGEN_DEVICE_FUNC inline const RsqrtReturnType rsqrt() const { return RsqrtReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const RsqrtReturnType rsqrt() const { return RsqrtReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise signum of *this.
  *
@@ -184,7 +184,7 @@ EIGEN_DEVICE_FUNC inline const RsqrtReturnType rsqrt() const { return RsqrtRetur
  *
  * \sa pow(), square()
  */
-EIGEN_DEVICE_FUNC inline const SignReturnType sign() const { return SignReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const SignReturnType sign() const { return SignReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise cosine of *this.
  *
@@ -196,7 +196,7 @@ EIGEN_DEVICE_FUNC inline const SignReturnType sign() const { return SignReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cos">Math functions</a>, sin(), acos()
  */
-EIGEN_DEVICE_FUNC inline const CosReturnType cos() const { return CosReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CosReturnType cos() const { return CosReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise sine of *this.
  *
@@ -208,7 +208,7 @@ EIGEN_DEVICE_FUNC inline const CosReturnType cos() const { return CosReturnType(
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sin">Math functions</a>, cos(), asin()
  */
-EIGEN_DEVICE_FUNC inline const SinReturnType sin() const { return SinReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const SinReturnType sin() const { return SinReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise tan of *this.
  *
@@ -217,7 +217,7 @@ EIGEN_DEVICE_FUNC inline const SinReturnType sin() const { return SinReturnType(
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tan">Math functions</a>, cos(), sin()
  */
-EIGEN_DEVICE_FUNC inline const TanReturnType tan() const { return TanReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const TanReturnType tan() const { return TanReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise arc tan of *this.
  *
@@ -226,7 +226,7 @@ EIGEN_DEVICE_FUNC inline const TanReturnType tan() const { return TanReturnType(
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atan">Math functions</a>, tan(), asin(), acos()
  */
-EIGEN_DEVICE_FUNC inline const AtanReturnType atan() const { return AtanReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const AtanReturnType atan() const { return AtanReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise arc cosine of *this.
  *
@@ -235,7 +235,7 @@ EIGEN_DEVICE_FUNC inline const AtanReturnType atan() const { return AtanReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acos">Math functions</a>, cos(), asin()
  */
-EIGEN_DEVICE_FUNC inline const AcosReturnType acos() const { return AcosReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const AcosReturnType acos() const { return AcosReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise arc sine of *this.
  *
@@ -244,7 +244,7 @@ EIGEN_DEVICE_FUNC inline const AcosReturnType acos() const { return AcosReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asin">Math functions</a>, sin(), acos()
  */
-EIGEN_DEVICE_FUNC inline const AsinReturnType asin() const { return AsinReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const AsinReturnType asin() const { return AsinReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise hyperbolic tan of *this.
  *
@@ -253,7 +253,7 @@ EIGEN_DEVICE_FUNC inline const AsinReturnType asin() const { return AsinReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tanh">Math functions</a>, tan(), sinh(), cosh()
  */
-EIGEN_DEVICE_FUNC inline const TanhReturnType tanh() const { return TanhReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const TanhReturnType tanh() const { return TanhReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise hyperbolic sin of *this.
  *
@@ -262,7 +262,7 @@ EIGEN_DEVICE_FUNC inline const TanhReturnType tanh() const { return TanhReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sinh">Math functions</a>, sin(), tanh(), cosh()
  */
-EIGEN_DEVICE_FUNC inline const SinhReturnType sinh() const { return SinhReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const SinhReturnType sinh() const { return SinhReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise hyperbolic cos of *this.
  *
@@ -271,29 +271,29 @@ EIGEN_DEVICE_FUNC inline const SinhReturnType sinh() const { return SinhReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
  */
-EIGEN_DEVICE_FUNC inline const CoshReturnType cosh() const { return CoshReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CoshReturnType cosh() const { return CoshReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this.
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh()
  */
-EIGEN_DEVICE_FUNC inline const AtanhReturnType atanh() const { return AtanhReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const AtanhReturnType atanh() const { return AtanhReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this.
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh()
  */
-EIGEN_DEVICE_FUNC inline const AsinhReturnType asinh() const { return AsinhReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const AsinhReturnType asinh() const { return AsinhReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this.
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh()
  */
-EIGEN_DEVICE_FUNC inline const AcoshReturnType acosh() const { return AcoshReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const AcoshReturnType acosh() const { return AcoshReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise logistic of *this.
  */
-EIGEN_DEVICE_FUNC inline const LogisticReturnType logistic() const { return LogisticReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const LogisticReturnType logistic() const { return LogisticReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise inverse of *this.
  *
@@ -302,7 +302,7 @@ EIGEN_DEVICE_FUNC inline const LogisticReturnType logistic() const { return Logi
  *
  * \sa operator/(), operator*()
  */
-EIGEN_DEVICE_FUNC inline const InverseReturnType inverse() const { return InverseReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const InverseReturnType inverse() const { return InverseReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise square of *this.
  *
@@ -311,7 +311,7 @@ EIGEN_DEVICE_FUNC inline const InverseReturnType inverse() const { return Invers
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_squareE">Math functions</a>, abs2(), cube(), pow()
  */
-EIGEN_DEVICE_FUNC inline const SquareReturnType square() const { return SquareReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const SquareReturnType square() const { return SquareReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise cube of *this.
  *
@@ -320,7 +320,7 @@ EIGEN_DEVICE_FUNC inline const SquareReturnType square() const { return SquareRe
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cube">Math functions</a>, square(), pow()
  */
-EIGEN_DEVICE_FUNC inline const CubeReturnType cube() const { return CubeReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CubeReturnType cube() const { return CubeReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise rint of *this.
  *
@@ -329,7 +329,7 @@ EIGEN_DEVICE_FUNC inline const CubeReturnType cube() const { return CubeReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_rint">Math functions</a>, ceil(), floor()
  */
-EIGEN_DEVICE_FUNC inline const RintReturnType rint() const { return RintReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const RintReturnType rint() const { return RintReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise round of *this.
  *
@@ -338,7 +338,7 @@ EIGEN_DEVICE_FUNC inline const RintReturnType rint() const { return RintReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_round">Math functions</a>, ceil(), floor()
  */
-EIGEN_DEVICE_FUNC inline const RoundReturnType round() const { return RoundReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const RoundReturnType round() const { return RoundReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise floor of *this.
  *
@@ -347,7 +347,7 @@ EIGEN_DEVICE_FUNC inline const RoundReturnType round() const { return RoundRetur
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_floor">Math functions</a>, ceil(), round()
  */
-EIGEN_DEVICE_FUNC inline const FloorReturnType floor() const { return FloorReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const FloorReturnType floor() const { return FloorReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise ceil of *this.
  *
@@ -356,7 +356,7 @@ EIGEN_DEVICE_FUNC inline const FloorReturnType floor() const { return FloorRetur
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ceil">Math functions</a>, floor(), round()
  */
-EIGEN_DEVICE_FUNC inline const CeilReturnType ceil() const { return CeilReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CeilReturnType ceil() const { return CeilReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise truncation of *this.
  *
@@ -365,7 +365,7 @@ EIGEN_DEVICE_FUNC inline const CeilReturnType ceil() const { return CeilReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_trunc">Math functions</a>, floor(), round()
  */
-EIGEN_DEVICE_FUNC inline const TruncReturnType trunc() const { return TruncReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const TruncReturnType trunc() const { return TruncReturnType(derived()); }
 
 template <int N>
 struct ShiftRightXpr {
@@ -380,7 +380,7 @@ struct ShiftRightXpr {
  * \sa shiftLeft()
  */
 template <int N>
-EIGEN_DEVICE_FUNC typename ShiftRightXpr<N>::Type shiftRight() const {
+EIGEN_DEVICE_FUNC constexpr typename ShiftRightXpr<N>::Type shiftRight() const {
   return typename ShiftRightXpr<N>::Type(derived());
 }
 
@@ -397,7 +397,7 @@ struct ShiftLeftXpr {
  * \sa shiftRight()
  */
 template <int N>
-EIGEN_DEVICE_FUNC typename ShiftLeftXpr<N>::Type shiftLeft() const {
+EIGEN_DEVICE_FUNC constexpr typename ShiftLeftXpr<N>::Type shiftLeft() const {
   return typename ShiftLeftXpr<N>::Type(derived());
 }
 
@@ -408,7 +408,7 @@ EIGEN_DEVICE_FUNC typename ShiftLeftXpr<N>::Type shiftLeft() const {
  *
  * \sa isfinite(), isinf()
  */
-EIGEN_DEVICE_FUNC inline const IsNaNReturnType isNaN() const { return IsNaNReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const IsNaNReturnType isNaN() const { return IsNaNReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise isinf of *this.
  *
@@ -417,7 +417,7 @@ EIGEN_DEVICE_FUNC inline const IsNaNReturnType isNaN() const { return IsNaNRetur
  *
  * \sa isnan(), isfinite()
  */
-EIGEN_DEVICE_FUNC inline const IsInfReturnType isInf() const { return IsInfReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const IsInfReturnType isInf() const { return IsInfReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise isfinite of *this.
  *
@@ -426,8 +426,8 @@ EIGEN_DEVICE_FUNC inline const IsInfReturnType isInf() const { return IsInfRetur
  *
  * \sa isnan(), isinf()
  */
-EIGEN_DEVICE_FUNC inline const IsFiniteReturnType isFinite() const { return IsFiniteReturnType(derived()); }
-EIGEN_DEVICE_FUNC inline const IsFiniteTypedReturnType isFiniteTyped() const {
+EIGEN_DEVICE_FUNC constexpr inline const IsFiniteReturnType isFinite() const { return IsFiniteReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const IsFiniteTypedReturnType isFiniteTyped() const {
   return IsFiniteTypedReturnType(derived());
 }
 
@@ -438,11 +438,15 @@ EIGEN_DEVICE_FUNC inline const IsFiniteTypedReturnType isFiniteTyped() const {
  *
  * \sa operator!=()
  */
-EIGEN_DEVICE_FUNC inline const BooleanNotReturnType operator!() const { return BooleanNotReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const BooleanNotReturnType operator!() const {
+  return BooleanNotReturnType(derived());
+}
 
 /** \returns an expression of the bitwise ~ operator of *this
  */
-EIGEN_DEVICE_FUNC inline const BitwiseNotReturnType operator~() const { return BitwiseNotReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const BitwiseNotReturnType operator~() const {
+  return BitwiseNotReturnType(derived());
+}
 
 // --- SpecialFunctions module ---
 
@@ -452,17 +456,16 @@ typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnTy
 typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
 typedef CwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> NdtriReturnType;
 
-/** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
+/** \returns an expression of the coefficient-wise ln(|gamma(*this)|).
  *
  * \specialfunctions_module
  *
- * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
- * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
- * type T to be supported.
+ * \note This function supports only float and double scalar types. To support other scalar types,
+ * the user has to provide implementations of lgamma(T) for any scalar type T to be supported.
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_lgamma">Math functions</a>, digamma()
  */
-EIGEN_DEVICE_FUNC inline const LgammaReturnType lgamma() const { return LgammaReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const LgammaReturnType lgamma() const { return LgammaReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
  *
@@ -475,33 +478,31 @@ EIGEN_DEVICE_FUNC inline const LgammaReturnType lgamma() const { return LgammaRe
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_digamma">Math functions</a>, Eigen::digamma(),
  * Eigen::polygamma(), lgamma()
  */
-EIGEN_DEVICE_FUNC inline const DigammaReturnType digamma() const { return DigammaReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const DigammaReturnType digamma() const { return DigammaReturnType(derived()); }
 
-/** \cpp11 \returns an expression of the coefficient-wise Gauss error
+/** \returns an expression of the coefficient-wise Gauss error
  * function of *this.
  *
  * \specialfunctions_module
  *
- * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
- * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
- * type T to be supported.
+ * \note This function supports only float and double scalar types. To support other scalar types,
+ * the user has to provide implementations of erf(T) for any scalar type T to be supported.
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erf">Math functions</a>, erfc()
  */
-EIGEN_DEVICE_FUNC inline const ErfReturnType erf() const { return ErfReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const ErfReturnType erf() const { return ErfReturnType(derived()); }
 
-/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+/** \returns an expression of the coefficient-wise Complementary error
  * function of *this.
  *
  * \specialfunctions_module
  *
- * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
- * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
- * type T to be supported.
+ * \note This function supports only float and double scalar types. To support other scalar types,
+ * the user has to provide implementations of erfc(T) for any scalar type T to be supported.
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erfc">Math functions</a>, erf()
  */
-EIGEN_DEVICE_FUNC inline const ErfcReturnType erfc() const { return ErfcReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const ErfcReturnType erfc() const { return ErfcReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function
  * function of *this.
@@ -516,7 +517,7 @@ EIGEN_DEVICE_FUNC inline const ErfcReturnType erfc() const { return ErfcReturnTy
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>
  */
-EIGEN_DEVICE_FUNC inline const NdtriReturnType ndtri() const { return NdtriReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const NdtriReturnType ndtri() const { return NdtriReturnType(derived()); }
 
 template <typename ScalarExponent>
 using UnaryPowReturnType =
@@ -538,7 +539,7 @@ using UnaryPowReturnType =
  * \sa ArrayBase::pow(ArrayBase), square(), cube(), exp(), log()
  */
 template <typename ScalarExponent>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> pow(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> pow(
     const ScalarExponent& exponent) const {
   return UnaryPowReturnType<ScalarExponent>(derived(), internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
 }
diff --git a/Eigen/src/plugins/BlockMethods.inc b/Eigen/src/plugins/BlockMethods.inc
index 0782aa39aff..2d85955135b 100644
--- a/Eigen/src/plugins/BlockMethods.inc
+++ b/Eigen/src/plugins/BlockMethods.inc
@@ -66,12 +66,12 @@ struct ConstFixedSegmentReturnType {
 };
 
 /// \internal inner-vector
-typedef Block<Derived, IsRowMajor ? 1 : Dynamic, IsRowMajor ? Dynamic : 1, true> InnerVectorReturnType;
-typedef Block<const Derived, IsRowMajor ? 1 : Dynamic, IsRowMajor ? Dynamic : 1, true> ConstInnerVectorReturnType;
+typedef std::conditional_t<IsRowMajor, RowXpr, ColXpr> InnerVectorReturnType;
+typedef std::conditional_t<IsRowMajor, ConstRowXpr, ConstColXpr> ConstInnerVectorReturnType;
 
 /// \internal set of inner-vectors
-typedef Block<Derived, Dynamic, Dynamic, true> InnerVectorsReturnType;
-typedef Block<const Derived, Dynamic, Dynamic, true> ConstInnerVectorsReturnType;
+typedef std::conditional_t<IsRowMajor, RowsBlockXpr, ColsBlockXpr> InnerVectorsReturnType;
+typedef std::conditional_t<IsRowMajor, ConstRowsBlockXpr, ConstColsBlockXpr> ConstInnerVectorsReturnType;
 
 #endif  // not EIGEN_PARSED_BY_DOXYGEN
 
@@ -111,7 +111,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block, fix, fix<N>(int)
 ///
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                            internal::get_fixed_value<NColsType>::value>::Type
@@ -128,7 +128,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of block(Index,Index,NRowsType,NColsType)
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                                       internal::get_fixed_value<NColsType>::value>::Type
@@ -163,7 +163,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                            internal::get_fixed_value<NColsType>::value>::Type
@@ -181,7 +181,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of topRightCorner(NRowsType, NColsType).
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                                       internal::get_fixed_value<NColsType>::value>::Type
@@ -213,13 +213,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block, block<int,int>(Index,Index)
 ///
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topRightCorner() {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topRightCorner() {
   return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - CCols);
 }
 
 /// This is the const version of topRightCorner<int, int>().
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topRightCorner() const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topRightCorner()
+    const {
   return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - CCols);
 }
 
@@ -243,14 +244,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topRightCorner(Index cRows,
-                                                                                                Index cCols) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topRightCorner(Index cRows,
+                                                                                                          Index cCols) {
   return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 /// This is the const version of topRightCorner<int, int>(Index, Index).
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topRightCorner(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topRightCorner(
     Index cRows, Index cCols) const {
   return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
@@ -274,7 +275,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                            internal::get_fixed_value<NColsType>::value>::Type
@@ -290,7 +291,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of topLeftCorner(Index, Index).
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                                       internal::get_fixed_value<NColsType>::value>::Type
@@ -318,13 +319,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topLeftCorner() {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topLeftCorner() {
   return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0);
 }
 
 /// This is the const version of topLeftCorner<int, int>().
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topLeftCorner() const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topLeftCorner()
+    const {
   return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0);
 }
 
@@ -348,14 +350,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topLeftCorner(Index cRows,
-                                                                                               Index cCols) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topLeftCorner(Index cRows,
+                                                                                                         Index cCols) {
   return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
 /// This is the const version of topLeftCorner<int, int>(Index, Index).
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topLeftCorner(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topLeftCorner(
     Index cRows, Index cCols) const {
   return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
@@ -379,7 +381,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                            internal::get_fixed_value<NColsType>::value>::Type
@@ -395,7 +397,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of bottomRightCorner(NRowsType, NColsType).
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                                       internal::get_fixed_value<NColsType>::value>::Type
@@ -423,13 +425,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomRightCorner() {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomRightCorner() {
   return typename FixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
 /// This is the const version of bottomRightCorner<int, int>().
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomRightCorner() const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type
+bottomRightCorner() const {
   return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
@@ -453,14 +456,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomRightCorner(Index cRows,
-                                                                                                   Index cCols) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomRightCorner(
+    Index cRows, Index cCols) {
   return typename FixedBlockXpr<CRows, CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 /// This is the const version of bottomRightCorner<int, int>(Index, Index).
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomRightCorner(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomRightCorner(
     Index cRows, Index cCols) const {
   return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
@@ -484,7 +487,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                            internal::get_fixed_value<NColsType>::value>::Type
@@ -503,7 +506,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of bottomLeftCorner(NRowsType, NColsType).
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
                                 internal::get_fixed_value<NColsType>::value>::Type
@@ -530,13 +533,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomLeftCorner() {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomLeftCorner() {
   return typename FixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, 0);
 }
 
 /// This is the const version of bottomLeftCorner<int, int>().
 template <int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomLeftCorner() const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomLeftCorner()
+    const {
   return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, 0);
 }
 
@@ -588,7 +592,7 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
 #else
@@ -601,7 +605,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of topRows(NRowsType).
 template <typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
 #else
@@ -628,13 +632,13 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type topRows(Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type topRows(Index n = N) {
   return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 /// This is the const version of topRows<int>().
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
@@ -655,7 +659,7 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
 #else
@@ -668,7 +672,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of bottomRows(NRowsType).
 template <typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
 #else
@@ -695,13 +699,13 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type bottomRows(Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type bottomRows(Index n = N) {
   return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 /// This is the const version of bottomRows<int>().
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
@@ -723,7 +727,7 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
 #else
@@ -736,7 +740,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of middleRows(Index,NRowsType).
 template <typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
 #else
@@ -764,14 +768,15 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type middleRows(Index startRow,
+                                                                                           Index n = N) {
   return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 /// This is the const version of middleRows<int>().
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow,
-                                                                                      Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow,
+                                                                                                Index n = N) const {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
@@ -792,7 +797,7 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
 #else
@@ -805,7 +810,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of leftCols(NColsType).
 template <typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
 #else
@@ -832,13 +837,13 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type leftCols(Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type leftCols(Index n = N) {
   return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 /// This is the const version of leftCols<int>().
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
@@ -859,7 +864,7 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
 #else
@@ -872,7 +877,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of rightCols(NColsType).
 template <typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
 #else
@@ -899,13 +904,13 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type rightCols(Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type rightCols(Index n = N) {
   return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 /// This is the const version of rightCols<int>().
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
@@ -927,7 +932,7 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
 #else
@@ -940,7 +945,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of middleCols(Index,NColsType).
 template <typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
 #else
@@ -968,14 +973,15 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type middleCols(Index startCol,
+                                                                                           Index n = N) {
   return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 /// This is the const version of middleCols<int>().
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol,
-                                                                                      Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol,
+                                                                                                Index n = N) const {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
@@ -990,7 +996,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type middl
 /// Example: \include MatrixBase_block_int_int.cpp
 /// Output: \verbinclude MatrixBase_block_int_int.out
 ///
-/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// \note The usage of this overload is discouraged from %Eigen 3.4, better use the generic
 /// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence:
 /// \code
 /// mat.template block<NRows,NCols>(i,j)  <-->  mat.block(i,j,fix<NRows>,fix<NCols>)
@@ -1004,13 +1010,14 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<NRows, NCols>::Type block(Index startRow, Index startCol) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<NRows, NCols>::Type block(Index startRow,
+                                                                                                 Index startCol) {
   return typename FixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol);
 }
 
 /// This is the const version of block<>(Index, Index). */
 template <int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, NCols>::Type block(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, NCols>::Type block(
     Index startRow, Index startCol) const {
   return typename ConstFixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol);
 }
@@ -1032,7 +1039,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, N
 /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
 /// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
 ///
-/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// \note The usage of this overload is discouraged from %Eigen 3.4, better use the generic
 /// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence:
 /// \code
 /// mat.template block<NRows,NCols>(i,j,rows,cols)     <-->  mat.block(i,j,fix<NRows>(rows),fix<NCols>(cols))
@@ -1047,15 +1054,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template <int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<NRows, NCols>::Type block(Index startRow, Index startCol,
-                                                                                       Index blockRows,
-                                                                                       Index blockCols) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedBlockXpr<NRows, NCols>::Type block(Index startRow,
+                                                                                                 Index startCol,
+                                                                                                 Index blockRows,
+                                                                                                 Index blockCols) {
   return typename FixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
 /// This is the const version of block<>(Index, Index, Index, Index).
 template <int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, NCols>::Type block(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, NCols>::Type block(
     Index startRow, Index startCol, Index blockRows, Index blockCols) const {
   return typename ConstFixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
@@ -1068,10 +1076,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, N
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
 /**
  * \sa row(), class Block */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ColXpr col(Index i) { return ColXpr(derived(), i); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE ColXpr col(Index i) { return ColXpr(derived(), i); }
 
 /// This is the const version of col().
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); }
 
 /// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0.
 ///
@@ -1081,10 +1089,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstColXpr col(Index i) const { return Co
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
 /**
  * \sa col(), class Block */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RowXpr row(Index i) { return RowXpr(derived(), i); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE RowXpr row(Index i) { return RowXpr(derived(), i); }
 
 /// This is the const version of row(). */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); }
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); }
 
 /// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes.
 ///
@@ -1108,7 +1116,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstRowXpr row(Index i) const { return Co
 /// \sa block(Index,Index,NRowsType,NColsType), fix<N>, fix<N>(int), class Block
 ///
 template <typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
 #else
@@ -1122,7 +1130,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of segment(Index,NType).
 template <typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
 #else
@@ -1155,7 +1163,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 /// \sa class Block, block(Index,Index)
 ///
 template <typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
 #else
@@ -1169,7 +1177,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of head(NType).
 template <typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
 #else
@@ -1202,7 +1210,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 /// \sa class Block, block(Index,Index)
 ///
 template <typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
 #else
@@ -1216,7 +1224,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 
 /// This is the const version of tail(Index).
 template <typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
 #else
@@ -1245,15 +1253,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 /// \sa segment(Index,NType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type segment(Index start,
+                                                                                                 Index n = N) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
 /// This is the const version of segment<int>(Index).
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type segment(Index start,
-                                                                                            Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type segment(
+    Index start, Index n = N) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
@@ -1274,14 +1283,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::T
 /// \sa head(NType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type head(Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type head(Index n = N) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
 /// This is the const version of head<int>().
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
@@ -1302,14 +1311,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::T
 /// \sa tail(NType), class Block
 ///
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type tail(Index n = N) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type tail(Index n = N) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), size() - n, n);
 }
 
 /// This is the const version of tail<int>.
 template <int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n, n);
 }
@@ -1317,47 +1326,47 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::T
 /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
 /// is col-major (resp. row-major).
 ///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InnerVectorReturnType innerVector(Index outer) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE InnerVectorReturnType innerVector(Index outer) {
   return InnerVectorReturnType(derived(), outer);
 }
 
 /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
 /// is col-major (resp. row-major). Read-only.
 ///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ConstInnerVectorReturnType innerVector(Index outer) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const ConstInnerVectorReturnType innerVector(Index outer) const {
   return ConstInnerVectorReturnType(derived(), outer);
 }
 
 /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
 /// is col-major (resp. row-major).
 ///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) {
-  return Block<Derived, Dynamic, Dynamic, true>(derived(), IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                                IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) {
+  return InnerVectorsReturnType(derived(), IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
 }
 
 /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
 /// is col-major (resp. row-major). Read-only.
 ///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ConstInnerVectorsReturnType innerVectors(Index outerStart,
-                                                                                     Index outerSize) const {
-  return Block<const Derived, Dynamic, Dynamic, true>(derived(), IsRowMajor ? outerStart : 0,
-                                                      IsRowMajor ? 0 : outerStart, IsRowMajor ? outerSize : rows(),
-                                                      IsRowMajor ? cols() : outerSize);
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const ConstInnerVectorsReturnType innerVectors(Index outerStart,
+                                                                                               Index outerSize) const {
+  return ConstInnerVectorsReturnType(derived(), IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                     IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
 }
 
 /** \returns the i-th subvector (column or vector) according to the \c Direction
  * \sa subVectors()
  */
 template <DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::conditional_t<Direction == Vertical, ColXpr, RowXpr> subVector(Index i) {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE std::conditional_t<Direction == Vertical, ColXpr, RowXpr> subVector(
+    Index i) {
   return std::conditional_t<Direction == Vertical, ColXpr, RowXpr>(derived(), i);
 }
 
 /** This is the const version of subVector(Index) */
 template <DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::conditional_t<Direction == Vertical, ConstColXpr, ConstRowXpr> subVector(
-    Index i) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE std::conditional_t<Direction == Vertical, ConstColXpr, ConstRowXpr>
+subVector(Index i) const {
   return std::conditional_t<Direction == Vertical, ConstColXpr, ConstRowXpr>(derived(), i);
 }
 
diff --git a/Eigen/src/plugins/CommonCwiseBinaryOps.inc b/Eigen/src/plugins/CommonCwiseBinaryOps.inc
index f1ba3010903..d5127e35c58 100644
--- a/Eigen/src/plugins/CommonCwiseBinaryOps.inc
+++ b/Eigen/src/plugins/CommonCwiseBinaryOps.inc
@@ -38,8 +38,9 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator+, sum)
  * \sa class CwiseBinaryOp, operator+(), operator-(), cwiseProduct()
  */
 template <typename CustomBinaryOp, typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> binaryExpr(
-    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other, const CustomBinaryOp& func = CustomBinaryOp()) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
+binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other,
+           const CustomBinaryOp& func = CustomBinaryOp()) const {
   return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
 }
 
@@ -63,7 +64,8 @@ EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/, quotient)
  * \sa operator||(), select()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived,
+                                                       const OtherDerived>
 operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived, const OtherDerived>(derived(),
                                                                                                    other.derived());
@@ -77,7 +79,8 @@ operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
  * \sa operator&&(), select()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived,
+                                                       const OtherDerived>
 operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived, const OtherDerived>(derived(),
                                                                                                   other.derived());
@@ -88,7 +91,8 @@ operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
  * \sa operator|(), operator^()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived,
+                                                       const OtherDerived>
 operator&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived, const OtherDerived>(derived(),
                                                                                                    other.derived());
@@ -99,7 +103,8 @@ operator&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
  * \sa operator&(), operator^()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived,
+                                                       const OtherDerived>
 operator|(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived, const OtherDerived>(derived(),
                                                                                                   other.derived());
@@ -109,7 +114,8 @@ operator|(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
  * \sa operator&(), operator|()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived,
+                                                       const OtherDerived>
 operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived, const OtherDerived>(derived(),
                                                                                                    other.derived());
diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.inc b/Eigen/src/plugins/CommonCwiseUnaryOps.inc
index 64f3648840f..bf8a9406fba 100644
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.inc
@@ -37,7 +37,7 @@ typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> Negati
 ///
 EIGEN_DOC_UNARY_ADDONS(operator-, opposite)
 ///
-EIGEN_DEVICE_FUNC inline const NegativeReturnType operator-() const { return NegativeReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const NegativeReturnType operator-() const { return NegativeReturnType(derived()); }
 
 template <class NewType>
 struct CastXpr {
@@ -55,7 +55,7 @@ EIGEN_DOC_UNARY_ADDONS(cast, conversion function)
 /// \sa class CwiseUnaryOp
 ///
 template <typename NewType>
-EIGEN_DEVICE_FUNC typename CastXpr<NewType>::Type cast() const {
+EIGEN_DEVICE_FUNC constexpr typename CastXpr<NewType>::Type cast() const {
   return typename CastXpr<NewType>::Type(derived());
 }
 
@@ -64,7 +64,7 @@ EIGEN_DEVICE_FUNC typename CastXpr<NewType>::Type cast() const {
 EIGEN_DOC_UNARY_ADDONS(conjugate, complex conjugate)
 ///
 /// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
-EIGEN_DEVICE_FUNC inline ConjugateReturnType conjugate() const { return ConjugateReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline ConjugateReturnType conjugate() const { return ConjugateReturnType(derived()); }
 
 /// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise.
 ///
@@ -72,7 +72,7 @@ EIGEN_DOC_UNARY_ADDONS(conjugate, complex conjugate)
 ///
 /// \sa conjugate()
 template <bool Cond>
-EIGEN_DEVICE_FUNC inline std::conditional_t<Cond, ConjugateReturnType, const Derived&> conjugateIf() const {
+EIGEN_DEVICE_FUNC constexpr inline std::conditional_t<Cond, ConjugateReturnType, const Derived&> conjugateIf() const {
   typedef std::conditional_t<Cond, ConjugateReturnType, const Derived&> ReturnType;
   return ReturnType(derived());
 }
@@ -82,14 +82,14 @@ EIGEN_DEVICE_FUNC inline std::conditional_t<Cond, ConjugateReturnType, const Der
 EIGEN_DOC_UNARY_ADDONS(real, real part function)
 ///
 /// \sa imag()
-EIGEN_DEVICE_FUNC inline RealReturnType real() const { return RealReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline RealReturnType real() const { return RealReturnType(derived()); }
 
 /// \returns an read-only expression of the imaginary part of \c *this.
 ///
 EIGEN_DOC_UNARY_ADDONS(imag, imaginary part function)
 ///
 /// \sa real()
-EIGEN_DEVICE_FUNC inline const ImagReturnType imag() const { return ImagReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const ImagReturnType imag() const { return ImagReturnType(derived()); }
 
 /// \brief Apply a unary operator coefficient-wise
 /// \param[in]  func  Functor implementing the unary operator
@@ -113,7 +113,7 @@ EIGEN_DOC_UNARY_ADDONS(unaryExpr, unary function)
 /// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
 ///
 template <typename CustomUnaryOp>
-EIGEN_DEVICE_FUNC inline const CwiseUnaryOp<CustomUnaryOp, const Derived> unaryExpr(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseUnaryOp<CustomUnaryOp, const Derived> unaryExpr(
     const CustomUnaryOp& func = CustomUnaryOp()) const {
   return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
 }
@@ -132,7 +132,7 @@ EIGEN_DOC_UNARY_ADDONS(unaryViewExpr, unary function)
 /// \sa unaryExpr, binaryExpr class CwiseUnaryOp
 ///
 template <typename CustomViewOp>
-EIGEN_DEVICE_FUNC inline const CwiseUnaryView<CustomViewOp, const Derived> unaryViewExpr(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseUnaryView<CustomViewOp, const Derived> unaryViewExpr(
     const CustomViewOp& func = CustomViewOp()) const {
   return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
 }
@@ -147,7 +147,7 @@ EIGEN_DOC_UNARY_ADDONS(unaryViewExpr, unary function)
 /// \sa unaryExpr, binaryExpr class CwiseUnaryOp
 ///
 template <typename CustomViewOp>
-EIGEN_DEVICE_FUNC inline CwiseUnaryView<CustomViewOp, Derived> unaryViewExpr(
+EIGEN_DEVICE_FUNC constexpr inline CwiseUnaryView<CustomViewOp, Derived> unaryViewExpr(
     const CustomViewOp& func = CustomViewOp()) {
   return CwiseUnaryView<CustomViewOp, Derived>(derived(), func);
 }
@@ -157,11 +157,11 @@ EIGEN_DEVICE_FUNC inline CwiseUnaryView<CustomViewOp, Derived> unaryViewExpr(
 EIGEN_DOC_UNARY_ADDONS(real, real part function)
 ///
 /// \sa imag()
-EIGEN_DEVICE_FUNC inline NonConstRealReturnType real() { return NonConstRealReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline NonConstRealReturnType real() { return NonConstRealReturnType(derived()); }
 
 /// \returns a non const expression of the imaginary part of \c *this.
 ///
 EIGEN_DOC_UNARY_ADDONS(imag, imaginary part function)
 ///
 /// \sa real()
-EIGEN_DEVICE_FUNC inline NonConstImagReturnType imag() { return NonConstImagReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline NonConstImagReturnType imag() { return NonConstImagReturnType(derived()); }
diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.inc b/Eigen/src/plugins/MatrixCwiseBinaryOps.inc
index fae92d8d9ae..8e0da06e3d9 100644
--- a/Eigen/src/plugins/MatrixCwiseBinaryOps.inc
+++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.inc
@@ -18,7 +18,7 @@
  * \sa class CwiseBinaryOp, cwiseAbs2
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)
     cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)(derived(), other.derived());
 }
@@ -55,7 +55,7 @@ using CwiseBinaryGreaterOrEqualReturnType =
  * \sa cwiseNotEqual(), isApprox(), isMuchSmallerThan()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryEqualReturnType<OtherDerived> cwiseEqual(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryEqualReturnType<OtherDerived> cwiseEqual(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryEqualReturnType<OtherDerived>(derived(), other.derived());
 }
@@ -73,35 +73,35 @@ EIGEN_DEVICE_FUNC inline const CwiseBinaryEqualReturnType<OtherDerived> cwiseEqu
  * \sa cwiseEqual(), isApprox(), isMuchSmallerThan()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryNotEqualReturnType<OtherDerived> cwiseNotEqual(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryNotEqualReturnType<OtherDerived> cwiseNotEqual(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryNotEqualReturnType<OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise < operator of *this and \a other */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryLessReturnType<OtherDerived> cwiseLess(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryLessReturnType<OtherDerived> cwiseLess(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryLessReturnType<OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise > operator of *this and \a other */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryGreaterReturnType<OtherDerived> cwiseGreater(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryGreaterReturnType<OtherDerived> cwiseGreater(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryGreaterReturnType<OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise <= operator of *this and \a other */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryLessOrEqualReturnType<OtherDerived> cwiseLessOrEqual(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryLessOrEqualReturnType<OtherDerived> cwiseLessOrEqual(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryLessOrEqualReturnType<OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise >= operator of *this and \a other */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC inline const CwiseBinaryGreaterOrEqualReturnType<OtherDerived> cwiseGreaterOrEqual(
+EIGEN_DEVICE_FUNC constexpr inline const CwiseBinaryGreaterOrEqualReturnType<OtherDerived> cwiseGreaterOrEqual(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryGreaterOrEqualReturnType<OtherDerived>(derived(), other.derived());
 }
@@ -114,7 +114,7 @@ EIGEN_DEVICE_FUNC inline const CwiseBinaryGreaterOrEqualReturnType<OtherDerived>
  * \sa class CwiseBinaryOp, max()
  */
 template <int NaNPropagation = PropagateFast, typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>(
@@ -126,7 +126,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
  * \sa class CwiseBinaryOp, min()
  */
 template <int NaNPropagation = PropagateFast>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const ConstantReturnType>
     cwiseMin(const Scalar& other) const {
   return cwiseMin<NaNPropagation>(Derived::Constant(rows(), cols(), other));
@@ -140,7 +140,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
  * \sa class CwiseBinaryOp, min()
  */
 template <int NaNPropagation = PropagateFast, typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>(
@@ -152,7 +152,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
  * \sa class CwiseBinaryOp, min()
  */
 template <int NaNPropagation = PropagateFast>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const ConstantReturnType>
     cwiseMax(const Scalar& other) const {
   return cwiseMax<NaNPropagation>(Derived::Constant(rows(), cols(), other));
@@ -166,7 +166,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
  * \sa class CwiseBinaryOp, cwiseProduct(), cwiseInverse()
  */
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
     cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(),
@@ -195,7 +195,7 @@ using CwiseScalarGreaterOrEqualReturnType =
  *
  * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
  */
-EIGEN_DEVICE_FUNC inline const CwiseScalarEqualReturnType cwiseEqual(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr inline const CwiseScalarEqualReturnType cwiseEqual(const Scalar& s) const {
   return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
 }
 
@@ -208,27 +208,28 @@ EIGEN_DEVICE_FUNC inline const CwiseScalarEqualReturnType cwiseEqual(const Scala
  *
  * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
  */
-EIGEN_DEVICE_FUNC inline const CwiseScalarNotEqualReturnType cwiseNotEqual(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr inline const CwiseScalarNotEqualReturnType cwiseNotEqual(const Scalar& s) const {
   return CwiseScalarNotEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
 }
 
 /** \returns an expression of the coefficient-wise < operator of \c *this and a scalar \a s */
-EIGEN_DEVICE_FUNC inline const CwiseScalarLessReturnType cwiseLess(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr inline const CwiseScalarLessReturnType cwiseLess(const Scalar& s) const {
   return CwiseScalarLessReturnType(derived(), Derived::Constant(rows(), cols(), s));
 }
 
 /** \returns an expression of the coefficient-wise > operator of \c *this and a scalar \a s */
-EIGEN_DEVICE_FUNC inline const CwiseScalarGreaterReturnType cwiseGreater(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr inline const CwiseScalarGreaterReturnType cwiseGreater(const Scalar& s) const {
   return CwiseScalarGreaterReturnType(derived(), Derived::Constant(rows(), cols(), s));
 }
 
 /** \returns an expression of the coefficient-wise <= operator of \c *this and a scalar \a s */
-EIGEN_DEVICE_FUNC inline const CwiseScalarLessOrEqualReturnType cwiseLessOrEqual(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr inline const CwiseScalarLessOrEqualReturnType cwiseLessOrEqual(const Scalar& s) const {
   return CwiseScalarLessOrEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
 }
 
 /** \returns an expression of the coefficient-wise >= operator of \c *this and a scalar \a s */
-EIGEN_DEVICE_FUNC inline const CwiseScalarGreaterOrEqualReturnType cwiseGreaterOrEqual(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr inline const CwiseScalarGreaterOrEqualReturnType cwiseGreaterOrEqual(
+    const Scalar& s) const {
   return CwiseScalarGreaterOrEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
 }
 
@@ -252,37 +253,37 @@ using CwiseBinaryTypedGreaterOrEqualReturnType =
     CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE, true>, const Derived, const OtherDerived>;
 
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedEqualReturnType<OtherDerived> cwiseTypedEqual(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryTypedEqualReturnType<OtherDerived> cwiseTypedEqual(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryTypedEqualReturnType<OtherDerived>(derived(), other.derived());
 }
 
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedNotEqualReturnType<OtherDerived> cwiseTypedNotEqual(
-    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryTypedNotEqualReturnType<OtherDerived>
+cwiseTypedNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryTypedNotEqualReturnType<OtherDerived>(derived(), other.derived());
 }
 
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedLessReturnType<OtherDerived> cwiseTypedLess(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryTypedLessReturnType<OtherDerived> cwiseTypedLess(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryTypedLessReturnType<OtherDerived>(derived(), other.derived());
 }
 
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedGreaterReturnType<OtherDerived> cwiseTypedGreater(
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryTypedGreaterReturnType<OtherDerived> cwiseTypedGreater(
     const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryTypedGreaterReturnType<OtherDerived>(derived(), other.derived());
 }
 
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedLessOrEqualReturnType<OtherDerived> cwiseTypedLessOrEqual(
-    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryTypedLessOrEqualReturnType<OtherDerived>
+cwiseTypedLessOrEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryTypedLessOrEqualReturnType<OtherDerived>(derived(), other.derived());
 }
 
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedGreaterOrEqualReturnType<OtherDerived>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseBinaryTypedGreaterOrEqualReturnType<OtherDerived>
 cwiseTypedGreaterOrEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
   return CwiseBinaryTypedGreaterOrEqualReturnType<OtherDerived>(derived(), other.derived());
 }
@@ -303,29 +304,32 @@ using CwiseScalarTypedGreaterOrEqualReturnType =
     CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE, true>, const Derived,
                   const ConstantReturnType>;
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedEqualReturnType cwiseTypedEqual(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseScalarTypedEqualReturnType
+cwiseTypedEqual(const Scalar& s) const {
   return CwiseScalarTypedEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedNotEqualReturnType
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseScalarTypedNotEqualReturnType
 cwiseTypedNotEqual(const Scalar& s) const {
   return CwiseScalarTypedNotEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedLessReturnType cwiseTypedLess(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseScalarTypedLessReturnType
+cwiseTypedLess(const Scalar& s) const {
   return CwiseScalarTypedLessReturnType(derived(), ConstantReturnType(rows(), cols(), s));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedGreaterReturnType cwiseTypedGreater(const Scalar& s) const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseScalarTypedGreaterReturnType
+cwiseTypedGreater(const Scalar& s) const {
   return CwiseScalarTypedGreaterReturnType(derived(), ConstantReturnType(rows(), cols(), s));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedLessOrEqualReturnType
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseScalarTypedLessOrEqualReturnType
 cwiseTypedLessOrEqual(const Scalar& s) const {
   return CwiseScalarTypedLessOrEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedGreaterOrEqualReturnType
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseScalarTypedGreaterOrEqualReturnType
 cwiseTypedGreaterOrEqual(const Scalar& s) const {
   return CwiseScalarTypedGreaterOrEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
 }
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
index ffaf5aab2a8..a57548c0130 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
@@ -30,7 +30,7 @@ EIGEN_DOC_UNARY_ADDONS(cwiseAbs, absolute value)
 ///
 /// \sa cwiseAbs2()
 ///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseAbsReturnType cwiseAbs() const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseAbsReturnType cwiseAbs() const {
   return CwiseAbsReturnType(derived());
 }
 
@@ -43,7 +43,7 @@ EIGEN_DOC_UNARY_ADDONS(cwiseAbs2, squared absolute value)
 ///
 /// \sa cwiseAbs()
 ///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseAbs2ReturnType cwiseAbs2() const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseAbs2ReturnType cwiseAbs2() const {
   return CwiseAbs2ReturnType(derived());
 }
 
@@ -56,7 +56,9 @@ EIGEN_DOC_UNARY_ADDONS(cwiseSqrt, square - root)
 ///
 /// \sa cwisePow(), cwiseSquare(), cwiseCbrt()
 ///
-EIGEN_DEVICE_FUNC inline const CwiseSqrtReturnType cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CwiseSqrtReturnType cwiseSqrt() const {
+  return CwiseSqrtReturnType(derived());
+}
 
 /// \returns an expression of the coefficient-wise cube root of *this.
 ///
@@ -64,7 +66,9 @@ EIGEN_DOC_UNARY_ADDONS(cwiseCbrt, cube - root)
 ///
 /// \sa cwiseSqrt(), cwiseSquare(), cwisePow()
 ///
-EIGEN_DEVICE_FUNC inline const CwiseCbrtReturnType cwiseCbrt() const { return CwiseCbrtReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CwiseCbrtReturnType cwiseCbrt() const {
+  return CwiseCbrtReturnType(derived());
+}
 
 /// \returns an expression of the coefficient-wise square of *this.
 ///
@@ -72,7 +76,9 @@ EIGEN_DOC_UNARY_ADDONS(cwiseSquare, square)
 ///
 /// \sa cwisePow(), cwiseSqrt(), cwiseCbrt()
 ///
-EIGEN_DEVICE_FUNC inline const CwiseSquareReturnType cwiseSquare() const { return CwiseSquareReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CwiseSquareReturnType cwiseSquare() const {
+  return CwiseSquareReturnType(derived());
+}
 
 /// \returns an expression of the coefficient-wise signum of *this.
 ///
@@ -81,7 +87,9 @@ EIGEN_DEVICE_FUNC inline const CwiseSquareReturnType cwiseSquare() const { retur
 ///
 EIGEN_DOC_UNARY_ADDONS(cwiseSign, sign function)
 ///
-EIGEN_DEVICE_FUNC inline const CwiseSignReturnType cwiseSign() const { return CwiseSignReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CwiseSignReturnType cwiseSign() const {
+  return CwiseSignReturnType(derived());
+}
 
 /// \returns an expression of the coefficient-wise inverse of *this.
 ///
@@ -92,7 +100,9 @@ EIGEN_DOC_UNARY_ADDONS(cwiseInverse, inverse)
 ///
 /// \sa cwiseProduct()
 ///
-EIGEN_DEVICE_FUNC inline const CwiseInverseReturnType cwiseInverse() const { return CwiseInverseReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CwiseInverseReturnType cwiseInverse() const {
+  return CwiseInverseReturnType(derived());
+}
 
 /// \returns an expression of the coefficient-wise phase angle of \c *this
 ///
@@ -101,9 +111,9 @@ EIGEN_DEVICE_FUNC inline const CwiseInverseReturnType cwiseInverse() const { ret
 ///
 EIGEN_DOC_UNARY_ADDONS(cwiseArg, arg)
 
-EIGEN_DEVICE_FUNC inline const CwiseArgReturnType cwiseArg() const { return CwiseArgReturnType(derived()); }
+EIGEN_DEVICE_FUNC constexpr inline const CwiseArgReturnType cwiseArg() const { return CwiseArgReturnType(derived()); }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseCArgReturnType cwiseCArg() const {
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE const CwiseCArgReturnType cwiseCArg() const {
   return CwiseCArgReturnType(derived());
 }
 
@@ -113,6 +123,7 @@ using CwisePowReturnType =
                      CwiseUnaryOp<internal::scalar_unary_pow_op<Scalar, ScalarExponent>, const Derived>>;
 
 template <typename ScalarExponent>
-EIGEN_DEVICE_FUNC inline const CwisePowReturnType<ScalarExponent> cwisePow(const ScalarExponent& exponent) const {
+EIGEN_DEVICE_FUNC constexpr inline const CwisePowReturnType<ScalarExponent> cwisePow(
+    const ScalarExponent& exponent) const {
   return CwisePowReturnType<ScalarExponent>(derived(), internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
 }
diff --git a/Eigen/src/plugins/ReshapedMethods.inc b/Eigen/src/plugins/ReshapedMethods.inc
index c1f90e72977..1e3a1963626 100644
--- a/Eigen/src/plugins/ReshapedMethods.inc
+++ b/Eigen/src/plugins/ReshapedMethods.inc
@@ -27,11 +27,11 @@
 /// \sa class Reshaped, fix, fix<N>(int)
 ///
 template <int Order = ColMajor, typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC inline Reshaped<Derived, ...> reshaped(NRowsType nRows, NColsType nCols);
+EIGEN_DEVICE_FUNC constexpr inline Reshaped<Derived, ...> reshaped(NRowsType nRows, NColsType nCols);
 
 /// This is the const version of reshaped(NRowsType,NColsType).
 template <int Order = ColMajor, typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC inline const Reshaped<const Derived, ...> reshaped(NRowsType nRows, NColsType nCols) const;
+EIGEN_DEVICE_FUNC constexpr inline const Reshaped<const Derived, ...> reshaped(NRowsType nRows, NColsType nCols) const;
 
 /// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector
 ///
@@ -56,11 +56,11 @@ EIGEN_DEVICE_FUNC inline const Reshaped<const Derived, ...> reshaped(NRowsType n
 /// \sa reshaped(NRowsType,NColsType), class Reshaped
 ///
 template <int Order = ColMajor>
-EIGEN_DEVICE_FUNC inline Reshaped<Derived, ...> reshaped();
+EIGEN_DEVICE_FUNC constexpr inline Reshaped<Derived, ...> reshaped();
 
 /// This is the const version of reshaped().
 template <int Order = ColMajor>
-EIGEN_DEVICE_FUNC inline const Reshaped<const Derived, ...> reshaped() const;
+EIGEN_DEVICE_FUNC constexpr inline const Reshaped<const Derived, ...> reshaped() const;
 
 #else
 
@@ -79,7 +79,7 @@ EIGEN_DEVICE_FUNC inline const Reshaped<const Derived, ...> reshaped() const;
 #endif
 
 template <typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC inline Reshaped<
+EIGEN_DEVICE_FUNC constexpr inline Reshaped<
     EIGEN_RESHAPED_METHOD_CONST Derived,
     internal::get_compiletime_reshape_size<NRowsType, NColsType, SizeAtCompileTime>::value,
     internal::get_compiletime_reshape_size<NColsType, NRowsType, SizeAtCompileTime>::value>
@@ -92,7 +92,7 @@ reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST {
 }
 
 template <int Order, typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC inline Reshaped<
+EIGEN_DEVICE_FUNC constexpr inline Reshaped<
     EIGEN_RESHAPED_METHOD_CONST Derived,
     internal::get_compiletime_reshape_size<NRowsType, NColsType, SizeAtCompileTime>::value,
     internal::get_compiletime_reshape_size<NColsType, NRowsType, SizeAtCompileTime>::value,
@@ -108,14 +108,14 @@ reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST {
 
 // Views as linear vectors
 
-EIGEN_DEVICE_FUNC inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1> reshaped()
+EIGEN_DEVICE_FUNC constexpr inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1> reshaped()
     EIGEN_RESHAPED_METHOD_CONST {
   return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1>(derived(), size(), 1);
 }
 
 template <int Order>
-EIGEN_DEVICE_FUNC inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
-                                  internal::get_compiletime_reshape_order(Flags, Order)>
+EIGEN_DEVICE_FUNC constexpr inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
+                                            internal::get_compiletime_reshape_order(Flags, Order)>
 reshaped() EIGEN_RESHAPED_METHOD_CONST {
   EIGEN_STATIC_ASSERT(Order == RowMajor || Order == ColMajor || Order == AutoOrder, INVALID_TEMPLATE_PARAMETER);
   return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
diff --git a/README.md b/README.md
index 31b25c07ad2..ed957c22b04 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 **Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
 
-For more information go to http://eigen.tuxfamily.org/ or https://libeigen.gitlab.io/docs/.
+For more information go to http://eigen.tuxfamily.org/ or https://libeigen.gitlab.io.
 
 For ***pull request***, ***bug reports***, and ***feature requests***, go to https://gitlab.com/libeigen/eigen.
diff --git a/cmake/Eigen3Config.cmake.in b/cmake/Eigen3Config.cmake.in
index 20acbdfcf12..96582f5d992 100644
--- a/cmake/Eigen3Config.cmake.in
+++ b/cmake/Eigen3Config.cmake.in
@@ -3,6 +3,6 @@
 
 @PACKAGE_INIT@
 
-if (NOT TARGET eigen)
-  include ("${CMAKE_CURRENT_LIST_DIR}/@EIGEN3_TARGETS_FILE@")
-endif ()
+if (NOT TARGET Eigen3::Eigen)
+  include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+endif (NOT TARGET Eigen3::Eigen)

From c4c420a4cb31592f1cf6e354ad98717527dc8a21 Mon Sep 17 00:00:00 2001
From: "Hans J. Johnson" <hans-johnson@uiowa.edu>
Date: Thu, 30 Apr 2026 17:33:15 -0500
Subject: [PATCH 3/6] COMP: Suppress GCC -Wmaybe-uninitialized false positive
 on Eigen 5

The warning is purely from the Eigen master post-5.0.1 update
(EIGEN_VERSION_STRING="5.0.1-dev+master") changing instantiation/
inlining. Eigen's own CMakeLists.txt already adds
-Wno-maybe-uninitialized for its own builds (line 444, with a comment
that GCC 12+ emits false positives), and DisableStupidWarnings.h
doesn't cover this one. The fix is to suppress at the ITK consumer
site, mirroring Eigen's stance.
---
 Modules/Core/Common/include/itkMacro.h                  | 8 ++++++++
 Modules/Core/Common/include/itkSymmetricEigenAnalysis.h | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/Modules/Core/Common/include/itkMacro.h b/Modules/Core/Common/include/itkMacro.h
index 8a410427533..ba9dae84c99 100644
--- a/Modules/Core/Common/include/itkMacro.h
+++ b/Modules/Core/Common/include/itkMacro.h
@@ -117,6 +117,14 @@ namespace itk
 #  define ITK_GCC_SUPPRESS_Warray_bounds
 #endif
 
+// -Wmaybe-uninitialized exists only in real GCC. Clang, including Apple Clang, also defines __GNUC__ but
+// emits -Wunknown-warning-option for it (which ITK's dashboard treats as fatal), hence the !__clang__ check.
+#if defined(__GNUC__) && !defined(__clang__)
+#  define ITK_GCC_SUPPRESS_Wmaybe_uninitialized ITK_PRAGMA(GCC diagnostic ignored "-Wmaybe-uninitialized")
+#else
+#  define ITK_GCC_SUPPRESS_Wmaybe_uninitialized
+#endif
+
 // For Clang only (and not GCC):
 #if defined(__clang__) && defined(__has_warning)
 #  define ITK_CLANG_PRAGMA_PUSH ITK_PRAGMA(clang diagnostic push)
diff --git a/Modules/Core/Common/include/itkSymmetricEigenAnalysis.h b/Modules/Core/Common/include/itkSymmetricEigenAnalysis.h
index 4b64dfce0d5..4e82879651c 100644
--- a/Modules/Core/Common/include/itkSymmetricEigenAnalysis.h
+++ b/Modules/Core/Common/include/itkSymmetricEigenAnalysis.h
@@ -1029,8 +1029,16 @@ class ITK_TEMPLATE_EXPORT SymmetricEigenAnalysisFixedDimension
       }
     }
     using EigenSolverType = Eigen::SelfAdjointEigenSolver<EigenLibMatrixType>;
+    // GCC's IPA cannot prove that Eigen 5's analytical 3x3 SelfAdjointEigenSolver
+    // fully writes m_eivalues; Eigen's own CMakeLists adds -Wno-maybe-uninitialized
+    // for the same reason (see Modules/ThirdParty/Eigen3/.../CMakeLists.txt).
+    // clang-format off
+    ITK_GCC_PRAGMA_PUSH
+    ITK_GCC_SUPPRESS_Wmaybe_uninitialized
     const EigenSolverType solver(inputMatrix, Eigen::EigenvaluesOnly);
     auto                  eigenValues = solver.eigenvalues();
+    ITK_GCC_PRAGMA_POP
+    // clang-format on
     if (m_OrderEigenValues == EigenValueOrderEnum::OrderByMagnitude)
     {
       detail::sortEigenValuesByMagnitude(eigenValues, VDimension);

From f6eb95b9f87073c07b9c86fbb06282cef32ec975 Mon Sep 17 00:00:00 2001
From: "Hans J. Johnson" <hans-johnson@uiowa.edu>
Date: Thu, 30 Apr 2026 17:28:49 -0500
Subject: [PATCH 4/6] COMP: Address greptile review on vendored Eigen 5 import

- Restore @EIGEN3_TARGETS_FILE@ substitution in Eigen3Config.cmake.in.
- Remove duplicate option(ITK_USE_EIGEN_MPL2_ONLY) declaration.
- Remove commented-out binary-dir generator expression.

USE_SYSTEM_EIGEN for 3.3+ unaffected (all under itkeigen, only
configured when USE_SYSTEM_EIGEN=OFF).
---
 Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt         | 4 +---
 .../Eigen3/src/itkeigen/cmake/Eigen3Config.cmake.in           | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt b/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt
index 80494f4445d..03aff039adf 100644
--- a/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt
+++ b/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt
@@ -949,8 +949,7 @@ add_library (ITKInternalEigen3::Eigen ALIAS eigen_internal)
 # This would wrongly enforce EIGEN_MPL2_ONLY to other libraries using Eigen.
 # We wrap this definition in ITK_USE_EIGEN_MPL2_ONLY, and only enabling it internally in the dashboards and CI,
 # to avoid introducing GPL code from Eigen3 internally in ITK.
-option(ITK_USE_EIGEN_MPL2_ONLY "Set compile definition EIGEN_MPL2_ONLY for ITKInternalEigen3." OFF)
-mark_as_advanced(ITK_USE_EIGEN_MPL2_ONLY)
+# (ITK_USE_EIGEN_MPL2_ONLY is declared once near line 844 of this file.)
 
 if(ITK_USE_EIGEN_MPL2_ONLY)
   target_compile_definitions (eigen_internal INTERFACE "EIGEN_MPL2_ONLY")
@@ -961,7 +960,6 @@ endif()
 # INSTALL: headers require pre-prend itkeigen/Eigen/X.
 target_include_directories (eigen_internal SYSTEM INTERFACE
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
-  # $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/..>
   $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
   )
 
diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/cmake/Eigen3Config.cmake.in b/Modules/ThirdParty/Eigen3/src/itkeigen/cmake/Eigen3Config.cmake.in
index 96582f5d992..235f3ac9867 100644
--- a/Modules/ThirdParty/Eigen3/src/itkeigen/cmake/Eigen3Config.cmake.in
+++ b/Modules/ThirdParty/Eigen3/src/itkeigen/cmake/Eigen3Config.cmake.in
@@ -4,5 +4,5 @@
 @PACKAGE_INIT@
 
 if (NOT TARGET Eigen3::Eigen)
-  include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+  include ("${CMAKE_CURRENT_LIST_DIR}/@EIGEN3_TARGETS_FILE@")
 endif (NOT TARGET Eigen3::Eigen)

From 66916d64faaf1e2cb989e12c68c3939275becedf Mon Sep 17 00:00:00 2001
From: "Hans J. Johnson" <hans-johnson@uiowa.edu>
Date: Thu, 30 Apr 2026 17:28:52 -0500
Subject: [PATCH 5/6] COMP: Bump itkeigen cmake_minimum to 3.16.3 and clarify
 dead-block wrap

Bump the cmake_minimum_required() version on line 2 from 3.10.2 to 3.16.3
to match ITK proper, and replace the if(FALSE) ... endif() wrapper around
the upstream Eigen logic with a #[[ ... #]] CMake bracket comment so IDEs
render the disabled region as dimmed dead code.
---
 Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt b/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt
index 03aff039adf..b3658c338be 100644
--- a/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt
+++ b/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt
@@ -1,10 +1,10 @@
 # Same version of ITK cmake_minimum_required
-cmake_minimum_required(VERSION 3.10.2)
+cmake_minimum_required(VERSION 3.16.3)
 
 project(Eigen3)
 
 # ITK doesn't compile anything here, just generates targets for the INTERFACE library.
-if(FALSE)
+#[[ # ITK - START DISABLE UPSTREAM EIGEN LOGIC
 
 cmake_minimum_required(VERSION 3.10.0)
 
@@ -828,7 +828,7 @@ message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
 message(STATUS "")
 
-endif() # Regular CMakeLists of Eigen ends here
+#]] # ITK - STOP DISABLE UPSTREAM EIGEN LOGIC
 
 ###############################################################################
 ################################ ITK ##########################################

From 76d09b81a16fc0855c3b105a61ae08e2fe9a002f Mon Sep 17 00:00:00 2001
From: "Hans J. Johnson" <hans-johnson@uiowa.edu>
Date: Fri, 1 May 2026 09:22:22 -0500
Subject: [PATCH 6/6] COMP: Suppress MSVC C4750 dashboard warning from vendored
 Eigen3

Tridiagonalization.h emits C4750 ('function with _alloca() inlined into
a loop') under MSVC whenever an ITK consumer instantiates the symmetric
eigen-decomposition path.  The warning is benign (Eigen's intentional
small-allocation alloca path) but cannot be addressed inside ITK and
fails CDash because the dashboard treats any warning as fatal.

The previous attempt to set CMAKE_CXX_FLAGS in the vendored Eigen
CMakeLists was a no-op (Eigen is header-only, so no .cxx file picks up
the flag) and the alternative -- attaching /wd4750 to the eigen_internal
INTERFACE library -- would propagate to ITKCommon (which DEPENDS on
ITKEigen3) and from there to most of the toolkit, suppressing C4750
for non-Eigen ITK code too.  Both rejected per @blowekamp.

Instead, extend the existing 'Modules/ThirdParty/Eigen3/.*warning:.*'
CTestCustom regex with a sibling that matches MSVC's
'warning C####:' form.  Compiler still emits the warning so reviewers
see it in build logs, but the dashboard ignores it for warning-count
purposes -- exactly the same scope the GCC/Clang side has had since
this exception was added.
---
 CMake/CTestCustom.cmake.in | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMake/CTestCustom.cmake.in b/CMake/CTestCustom.cmake.in
index c775cccdbbf..16a0c78aff1 100644
--- a/CMake/CTestCustom.cmake.in
+++ b/CMake/CTestCustom.cmake.in
@@ -132,6 +132,10 @@ set(CTEST_CUSTOM_WARNING_EXCEPTION
   # ignore some third party warnings
   ".*Modules/ThirdParty/.*warning:.*Wzero-as-null-pointer-constant"
   ".*Modules/ThirdParty/Eigen3/.*warning:.*"
+  # MSVC emits "warning C####:" rather than "warning:"; cover that form too
+  # ("." matches either path separator) so the Tridiagonalization.h C4750
+  # ('_alloca() inlined into a loop') and similar Eigen-internal MSVC warnings do not fail the dashboard.
+  ".*Modules.ThirdParty.Eigen3.*warning C[0-9]+:.*"
 
   # CircleCI distcc warnings
   ".*WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED.*"