diff --git a/README.md b/README.md
index a071be32f5f..600f5712ad0 100644
--- a/README.md
+++ b/README.md
@@ -1,28 +1,28 @@
-
-
-The Stan Math Library is a C++, reverse-mode automatic
-differentiation library designed to be usable, extensive and
-extensible, efficient, scalable, stable, portable, and redistributable
-in order to facilitate the construction and utilization of algorithms
-that utilize derivatives.
-
-
-Documentation, Installation, and Examples
---------------
-
-All of Stan math's documentation is hosted on our website below. Please do not
-reference articles in the wiki as they are outdated and not maintained.
-
-[mc-stan.org/math](https://mc-stan.org/math/)
-
-
-Licensing
----------
-The Stan Math Library is licensed under the [new BSD
-license](https://github.com/stan-dev/math/blob/develop/LICENSE%2Emd).
-
-The Stan Math Library depends on the Intel TBB library which is
-licensed under the Apache 2.0 license. This dependency implies an
-additional restriction as compared to the new BSD license alone. The
-Apache 2.0 license is incompatible with GPL-2 licensed code if
-distributed as a unitary binary. You may refer to the Licensing page on the [Stan wiki](https://github.com/stan-dev/stan/wiki/Stan-Licensing).
+
+
+The Stan Math Library is a C++, reverse-mode automatic
+differentiation library designed to be usable, extensive and
+extensible, efficient, scalable, stable, portable, and redistributable
+in order to facilitate the construction and utilization of algorithms
+that utilize derivatives.
+
+
+Documentation, Installation, and Examples
+--------------
+
+All of Stan math's documentation is hosted on our website below. Please do not
+reference articles in the wiki as they are outdated and not maintained.
+
+[mc-stan.org/math](https://mc-stan.org/math/)
+
+
+Licensing
+---------
+The Stan Math Library is licensed under the [new BSD
+license](https://github.com/stan-dev/math/blob/develop/LICENSE%2Emd).
+
+The Stan Math Library depends on the Intel TBB library which is
+licensed under the Apache 2.0 license. This dependency implies an
+additional restriction as compared to the new BSD license alone. The
+Apache 2.0 license is incompatible with GPL-2 licensed code if
+distributed as a unitary binary. You may refer to the Licensing page on the [Stan wiki](https://github.com/stan-dev/stan/wiki/Stan-Licensing).
diff --git a/lib/eigen_3.3.9/Eigen/CMakeLists.txt b/lib/eigen_3.3.9/Eigen/CMakeLists.txt
deleted file mode 100644
index 9eb502b792d..00000000000
--- a/lib/eigen_3.3.9/Eigen/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-include(RegexUtils)
-test_escape_string_as_regex()
-
-file(GLOB Eigen_directory_files "*")
-
-escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
-
-foreach(f ${Eigen_directory_files})
-  if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/src")
-    list(APPEND Eigen_directory_files_to_install ${f})
-  endif()
-endforeach(f ${Eigen_directory_files})
-
-install(FILES
-  ${Eigen_directory_files_to_install}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
-  )
-
-install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/lib/eigen_3.3.9/Eigen/src/Core/GenericPacketMath.h b/lib/eigen_3.3.9/Eigen/src/Core/GenericPacketMath.h
deleted file mode 100644
index e594437791b..00000000000
--- a/lib/eigen_3.3.9/Eigen/src/Core/GenericPacketMath.h
+++ /dev/null
@@ -1,590 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud
-// Copyright (C) 2006-2008 Benoit Jacob
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_GENERIC_PACKET_MATH_H
-#define EIGEN_GENERIC_PACKET_MATH_H
-
-namespace Eigen {
-
-namespace internal {
-
-/** \internal
- * \file GenericPacketMath.h
- *
- * Default implementation for types not supported by the vectorization.
- * In practice these functions are provided to make easier the writing
- * of generic vectorized code.
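For context on the README's claim above, a minimal sketch of reverse-mode autodiff with Stan Math; this assumes a checkout with `stan/math.hpp` and the library's dependencies on the include path, per the install docs linked in the README:

```cpp
#include <stan/math.hpp>
#include <iostream>

int main() {
  using stan::math::var;
  var x = 2.0, y = 3.0;
  var f = x * x * y + stan::math::exp(y);  // f(x, y) = x^2 * y + e^y
  f.grad();                                // one reverse sweep fills adjoints
  std::cout << "f     = " << f.val() << "\n"
            << "df/dx = " << x.adj() << "\n"   // 2xy = 12
            << "df/dy = " << y.adj() << "\n";  // x^2 + e^y
  stan::math::recover_memory();            // release the autodiff stack
  return 0;
}
```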
- */ - -#ifndef EIGEN_DEBUG_ALIGNED_LOAD -#define EIGEN_DEBUG_ALIGNED_LOAD -#endif - -#ifndef EIGEN_DEBUG_UNALIGNED_LOAD -#define EIGEN_DEBUG_UNALIGNED_LOAD -#endif - -#ifndef EIGEN_DEBUG_ALIGNED_STORE -#define EIGEN_DEBUG_ALIGNED_STORE -#endif - -#ifndef EIGEN_DEBUG_UNALIGNED_STORE -#define EIGEN_DEBUG_UNALIGNED_STORE -#endif - -struct default_packet_traits -{ - enum { - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 1, - HasBlend = 0, - - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasLog1p = 0, - HasLog10 = 0, - HasPow = 0, - - HasSin = 0, - HasCos = 0, - HasTan = 0, - HasASin = 0, - HasACos = 0, - HasATan = 0, - HasSinh = 0, - HasCosh = 0, - HasTanh = 0, - HasLGamma = 0, - HasDiGamma = 0, - HasZeta = 0, - HasPolygamma = 0, - HasErf = 0, - HasErfc = 0, - HasIGamma = 0, - HasIGammac = 0, - HasBetaInc = 0, - - HasRound = 0, - HasFloor = 0, - HasCeil = 0, - - HasSign = 0 - }; -}; - -template struct packet_traits : default_packet_traits -{ - typedef T type; - typedef T half; - enum { - Vectorizable = 0, - size = 1, - AlignedOnScalar = 0, - HasHalfPacket = 0 - }; - enum { - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0 - }; -}; - -template struct packet_traits : packet_traits { }; - -template struct type_casting_traits { - enum { - VectorizedCast = 0, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - - -/** \internal \returns static_cast(a) (coeff-wise) */ -template -EIGEN_DEVICE_FUNC inline TgtPacket -pcast(const SrcPacket& a) { - return static_cast(a); -} -template -EIGEN_DEVICE_FUNC inline TgtPacket -pcast(const SrcPacket& a, const SrcPacket& /*b*/) { - return static_cast(a); -} - -template -EIGEN_DEVICE_FUNC inline TgtPacket -pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) { - return static_cast(a); -} - -/** \internal \returns a + b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -padd(const Packet& a, - const Packet& b) { return a+b; } - -/** \internal \returns a - b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -psub(const Packet& a, - const Packet& b) { return a-b; } - -/** \internal \returns -a (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pnegate(const Packet& a) { return -a; } - -/** \internal \returns conj(a) (coeff-wise) */ - -template EIGEN_DEVICE_FUNC inline Packet -pconj(const Packet& a) { return numext::conj(a); } - -/** \internal \returns a * b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pmul(const Packet& a, - const Packet& b) { return a*b; } - -/** \internal \returns a / b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pdiv(const Packet& a, - const Packet& b) { return a/b; } - -/** \internal \returns the min of \a a and \a b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, - const Packet& b) { return numext::mini(a, b); } - -/** \internal \returns the max of \a a and \a b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, - const Packet& b) { return numext::maxi(a, b); } - -/** \internal \returns the absolute value of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pabs(const Packet& a) { using std::abs; return abs(a); } - -/** \internal \returns the phase angle of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -parg(const Packet& 
a) { using numext::arg; return arg(a); } - -/** \internal \returns the bitwise and of \a a and \a b */ -template EIGEN_DEVICE_FUNC inline Packet -pand(const Packet& a, const Packet& b) { return a & b; } - -/** \internal \returns the bitwise or of \a a and \a b */ -template EIGEN_DEVICE_FUNC inline Packet -por(const Packet& a, const Packet& b) { return a | b; } - -/** \internal \returns the bitwise xor of \a a and \a b */ -template EIGEN_DEVICE_FUNC inline Packet -pxor(const Packet& a, const Packet& b) { return a ^ b; } - -/** \internal \returns the bitwise andnot of \a a and \a b */ -template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (!b); } - -/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ -template EIGEN_DEVICE_FUNC inline Packet -pload(const typename unpacket_traits::type* from) { return *from; } - -/** \internal \returns a packet version of \a *from, (un-aligned load) */ -template EIGEN_DEVICE_FUNC inline Packet -ploadu(const typename unpacket_traits::type* from) { return *from; } - -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ -template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } - -/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ -template EIGEN_DEVICE_FUNC inline Packet -pload1(const typename unpacket_traits::type *a) { return pset1(*a); } - -/** \internal \returns a packet with elements of \a *from duplicated. - * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and - * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} - * Currently, this function is only used for scalar * complex products. - */ -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet -ploaddup(const typename unpacket_traits::type* from) { return *from; } - -/** \internal \returns a packet with elements of \a *from quadrupled. - * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and - * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} - * Currently, this function is only used in matrix products. - * For packet-size smaller or equal to 4, this function is equivalent to pload1 - */ -template EIGEN_DEVICE_FUNC inline Packet -ploadquad(const typename unpacket_traits::type* from) -{ return pload1(from); } - -/** \internal equivalent to - * \code - * a0 = pload1(a+0); - * a1 = pload1(a+1); - * a2 = pload1(a+2); - * a3 = pload1(a+3); - * \endcode - * \sa pset1, pload1, ploaddup, pbroadcast2 - */ -template EIGEN_DEVICE_FUNC -inline void pbroadcast4(const typename unpacket_traits::type *a, - Packet& a0, Packet& a1, Packet& a2, Packet& a3) -{ - a0 = pload1(a+0); - a1 = pload1(a+1); - a2 = pload1(a+2); - a3 = pload1(a+3); -} - -/** \internal equivalent to - * \code - * a0 = pload1(a+0); - * a1 = pload1(a+1); - * \endcode - * \sa pset1, pload1, ploaddup, pbroadcast4 - */ -template EIGEN_DEVICE_FUNC -inline void pbroadcast2(const typename unpacket_traits::type *a, - Packet& a0, Packet& a1) -{ - a0 = pload1(a+0); - a1 = pload1(a+1); -} - -/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). 
*/ -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet -plset(const typename unpacket_traits::type& a) { return a; } - -/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ -template EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) -{ (*to) = from; } - -/** \internal copy the packet \a from to \a *to, (un-aligned store) */ -template EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) -{ (*to) = from; } - - template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) - { return ploadu(from); } - - template EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/) - { pstore(to, from); } - -/** \internal tries to do cache prefetching of \a addr */ -template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) -{ -#ifdef __CUDA_ARCH__ -#if defined(__LP64__) - // 64-bit pointer operand constraint for inlined asm - asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); -#else - // 32-bit pointer operand constraint for inlined asm - asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr)); -#endif -#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC) - __builtin_prefetch(addr); -#endif -} - -/** \internal \returns the first element of a packet */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type pfirst(const Packet& a) -{ return a; } - -/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ -template EIGEN_DEVICE_FUNC inline Packet -preduxp(const Packet* vecs) { return vecs[0]; } - -/** \internal \returns the sum of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) -{ return a; } - -/** \internal \returns the sum of the elements of \a a by block of 4 elements. - * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} - * For packet-size smaller or equal to 4, this boils down to a noop. 
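Aside: the primitives deleted above all follow one pattern, a trivial scalar fallback, so a generic kernel is written once against the primitives and vectorizes only for types that specialize them. A self-contained sketch of that pattern (the names mirror Eigen's, but this is illustrative, not the library's code):

```cpp
#include <cstddef>

// Scalar fallbacks: a "packet" of one element is just the scalar itself.
template <typename Packet>
inline Packet padd(const Packet& a, const Packet& b) { return a + b; }

template <typename Packet>
inline Packet pload(const Packet* from) { return *from; }

template <typename Packet>
inline void pstore(Packet* to, const Packet& v) { *to = v; }

// A kernel written once against the primitives. It runs as scalar code here,
// and would run vectorized for any type that specializes the three functions.
template <typename Packet>
void add_arrays(const Packet* a, const Packet* b, Packet* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    pstore(out + i, padd(pload(a + i), pload(b + i)));
}
```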
- */ -template EIGEN_DEVICE_FUNC inline -typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_downto4(const Packet& a) -{ return a; } - -/** \internal \returns the product of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) -{ return a; } - -/** \internal \returns the min of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) -{ return a; } - -/** \internal \returns the max of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) -{ return a; } - -/** \internal \returns the reversed elements of \a a*/ -template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) -{ return a; } - -/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ -template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) -{ - return Packet(a.imag(),a.real()); -} - -/************************** -* Special math functions -***************************/ - -/** \internal \returns the sine of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psin(const Packet& a) { using std::sin; return sin(a); } - -/** \internal \returns the cosine of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcos(const Packet& a) { using std::cos; return cos(a); } - -/** \internal \returns the tan of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptan(const Packet& a) { using std::tan; return tan(a); } - -/** \internal \returns the arc sine of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pasin(const Packet& a) { using std::asin; return asin(a); } - -/** \internal \returns the arc cosine of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pacos(const Packet& a) { using std::acos; return acos(a); } - -/** \internal \returns the arc tangent of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet patan(const Packet& a) { using std::atan; return atan(a); } - -/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psinh(const Packet& a) { using std::sinh; return sinh(a); } - -/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); } - -/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); } - -/** \internal \returns the exp of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pexp(const Packet& a) { using std::exp; return exp(a); } - -/** \internal \returns the log of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog(const Packet& a) { using std::log; return log(a); } - -/** \internal \returns the log1p of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog1p(const Packet& a) { return numext::log1p(a); } - -/** \internal \returns the log10 of \a a (coeff-wise) */ -template 
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog10(const Packet& a) { using std::log10; return log10(a); } - -/** \internal \returns the square-root of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); } - -/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet prsqrt(const Packet& a) { - return pdiv(pset1(1), psqrt(a)); -} - -/** \internal \returns the rounded value of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pround(const Packet& a) { using numext::round; return round(a); } - -/** \internal \returns the floor of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } - -/** \internal \returns the ceil of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } - -/*************************************************************************** -* The following functions might not have to be overwritten for vectorized types -***************************************************************************/ - -/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ -// NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) -template -inline void pstore1(typename unpacket_traits::type* to, const typename unpacket_traits::type& a) -{ - pstore(to, pset1(a)); -} - -/** \internal \returns a * b + c (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pmadd(const Packet& a, - const Packet& b, - const Packet& c) -{ return padd(pmul(a, b),c); } - -/** \internal \returns a packet version of \a *from. - * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits::type* from) -{ - if(Alignment >= unpacket_traits::alignment) - return pload(from); - else - return ploadu(from); -} - -/** \internal copy the packet \a from to \a *to. - * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) -{ - if(Alignment >= unpacket_traits::alignment) - pstore(to, from); - else - pstoreu(to, from); -} - -/** \internal \returns a packet version of \a *from. - * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the - * hardware if available to speedup the loading of data that won't be modified - * by the current computation. - */ -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits::type* from) -{ - return ploadt(from); -} - -/** \internal default implementation of palign() allowing partial specialization */ -template -struct palign_impl -{ - // by default data are aligned, so there is nothing to be done :) - static inline void run(PacketType&, const PacketType&) {} -}; - -/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements - * of \a first and \a Offset first elements of \a second. - * - * This function is currently only used to optimize matrix-vector products on unligned matrices. 
- * It takes 2 packets that represent a contiguous memory array, and returns a packet starting - * at the position \a Offset. For instance, for packets of 4 elements, we have: - * Input: - * - first = {f0,f1,f2,f3} - * - second = {s0,s1,s2,s3} - * Output: - * - if Offset==0 then {f0,f1,f2,f3} - * - if Offset==1 then {f1,f2,f3,s0} - * - if Offset==2 then {f2,f3,s0,s1} - * - if Offset==3 then {f3,s0,s1,s3} - */ -template -inline void palign(PacketType& first, const PacketType& second) -{ - palign_impl::run(first,second); -} - -/*************************************************************************** -* Fast complex products (GCC generates a function call which is very slow) -***************************************************************************/ - -// Eigen+CUDA does not support complexes. -#ifndef __CUDACC__ - -template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } - -template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } - -#endif - - -/*************************************************************************** - * PacketBlock, that is a collection of N packets where the number of words - * in the packet is a multiple of N. -***************************************************************************/ -template ::size> struct PacketBlock { - Packet packet[N]; -}; - -template EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& /*kernel*/) { - // Nothing to do in the scalar case, i.e. a 1x1 matrix. -} - -/*************************************************************************** - * Selector, i.e. vector of N boolean values used to select (i.e. blend) - * words from 2 packets. -***************************************************************************/ -template struct Selector { - bool select[N]; -}; - -template EIGEN_DEVICE_FUNC inline Packet -pblend(const Selector::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { - return ifPacket.select[0] ? thenPacket : elsePacket; -} - -/** \internal \returns \a a with the first coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertfirst(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - mask.select[0] = true; - // This for loop should be optimized away by the compiler. - for(Index i=1; i::size; ++i) - mask.select[i] = false; - return pblend(mask, pset1(b), a); -} - -/** \internal \returns \a a with the last coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertlast(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - // This for loop should be optimized away by the compiler. 
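A scalar model of the `palign<Offset>` semantics documented above may help; `palign_model` is a hypothetical stand-in in which array elements play the role of packet lanes:

```cpp
// Shift the concatenation {first, second} left by Offset lanes and write the
// window starting at position Offset back into 'first'. For N == 4 and
// Offset == 1 this turns {f0,f1,f2,f3} into {f1,f2,f3,s0}.
template <int Offset, int N>
void palign_model(float (&first)[N], const float (&second)[N]) {
  static_assert(Offset >= 0 && Offset <= N, "Offset must be in [0, N]");
  for (int i = 0; i < N; ++i)
    first[i] = (i + Offset < N) ? first[i + Offset] : second[i + Offset - N];
}
```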
- for(Index i=0; i::size-1; ++i) - mask.select[i] = false; - mask.select[unpacket_traits::size-1] = true; - return pblend(mask, pset1(b), a); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_GENERIC_PACKET_MATH_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/MathFunctionsImpl.h b/lib/eigen_3.3.9/Eigen/src/Core/MathFunctionsImpl.h deleted file mode 100644 index 9c1ceb0eb0f..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/MathFunctionsImpl.h +++ /dev/null @@ -1,101 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) -// Copyright (C) 2016 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_MATHFUNCTIONSIMPL_H -#define EIGEN_MATHFUNCTIONSIMPL_H - -namespace Eigen { - -namespace internal { - -/** \internal \returns the hyperbolic tan of \a a (coeff-wise) - Doesn't do anything fancy, just a 13/6-degree rational interpolant which - is accurate up to a couple of ulp in the range [-9, 9], outside of which - the tanh(x) = +/-1. - - This implementation works on both scalars and packets. -*/ -template -T generic_fast_tanh_float(const T& a_x) -{ - // Clamp the inputs to the range [-9, 9] since anything outside - // this range is +/-1.0f in single-precision. - const T plus_9 = pset1(9.f); - const T minus_9 = pset1(-9.f); - // NOTE GCC prior to 6.3 might improperly optimize this max/min - // step such that if a_x is nan, x will be either 9 or -9, - // and tanh will return 1 or -1 instead of nan. - // This is supposed to be fixed in gcc6.3, - // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - const T x = pmax(minus_9,pmin(plus_9,a_x)); - // The monomial coefficients of the numerator polynomial (odd). - const T alpha_1 = pset1(4.89352455891786e-03f); - const T alpha_3 = pset1(6.37261928875436e-04f); - const T alpha_5 = pset1(1.48572235717979e-05f); - const T alpha_7 = pset1(5.12229709037114e-08f); - const T alpha_9 = pset1(-8.60467152213735e-11f); - const T alpha_11 = pset1(2.00018790482477e-13f); - const T alpha_13 = pset1(-2.76076847742355e-16f); - - // The monomial coefficients of the denominator polynomial (even). - const T beta_0 = pset1(4.89352518554385e-03f); - const T beta_2 = pset1(2.26843463243900e-03f); - const T beta_4 = pset1(1.18534705686654e-04f); - const T beta_6 = pset1(1.19825839466702e-06f); - - // Since the polynomials are odd/even, we need x^2. - const T x2 = pmul(x, x); - - // Evaluate the numerator polynomial p. - T p = pmadd(x2, alpha_13, alpha_11); - p = pmadd(x2, p, alpha_9); - p = pmadd(x2, p, alpha_7); - p = pmadd(x2, p, alpha_5); - p = pmadd(x2, p, alpha_3); - p = pmadd(x2, p, alpha_1); - p = pmul(x, p); - - // Evaluate the denominator polynomial p. - T q = pmadd(x2, beta_6, beta_4); - q = pmadd(x2, q, beta_2); - q = pmadd(x2, q, beta_0); - - // Divide the numerator by the denominator. 
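The rational tanh approximant being deleted here is easy to sanity-check in scalar form. The sketch below transcribes the same clamp and the same numerator/denominator coefficients from the code above into plain C++ for comparison against `std::tanh`:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Scalar transcription of generic_fast_tanh_float's 13/6-degree rational
// approximant, with both polynomials evaluated by Horner's rule in x^2.
float fast_tanh_model(float x) {
  x = std::min(9.0f, std::max(-9.0f, x));  // tanh saturates outside [-9, 9]
  const float x2 = x * x;
  float p = -2.76076847742355e-16f;        // numerator (odd polynomial)
  p = p * x2 + 2.00018790482477e-13f;
  p = p * x2 + -8.60467152213735e-11f;
  p = p * x2 + 5.12229709037114e-08f;
  p = p * x2 + 1.48572235717979e-05f;
  p = p * x2 + 6.37261928875436e-04f;
  p = p * x2 + 4.89352455891786e-03f;
  p *= x;
  float q = 1.19825839466702e-06f;         // denominator (even polynomial)
  q = q * x2 + 1.18534705686654e-04f;
  q = q * x2 + 2.26843463243900e-03f;
  q = q * x2 + 4.89352518554385e-03f;
  return p / q;
}

int main() {
  for (float x : {-3.0f, -0.5f, 0.0f, 1.0f, 4.0f})
    std::printf("%+.1f: %+.7f vs %+.7f\n", x, fast_tanh_model(x), std::tanh(x));
}
```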
- return pdiv(p, q); -} - -template -EIGEN_STRONG_INLINE -RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) -{ - EIGEN_USING_STD_MATH(sqrt); - RealScalar p, qp; - p = numext::maxi(x,y); - if(p==RealScalar(0)) return RealScalar(0); - qp = numext::mini(y,x) / p; - return p * sqrt(RealScalar(1) + qp*qp); -} - -template -struct hypot_impl -{ - typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) - { - EIGEN_USING_STD_MATH(abs); - return positive_real_hypot(abs(x), abs(y)); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_MATHFUNCTIONSIMPL_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/Redux.h b/lib/eigen_3.3.9/Eigen/src/Core/Redux.h deleted file mode 100644 index 760e9f86154..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/Redux.h +++ /dev/null @@ -1,505 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008 Gael Guennebaud -// Copyright (C) 2006-2008 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_REDUX_H -#define EIGEN_REDUX_H - -namespace Eigen { - -namespace internal { - -// TODO -// * implement other kind of vectorization -// * factorize code - -/*************************************************************************** -* Part 1 : the logic deciding a strategy for vectorization and unrolling -***************************************************************************/ - -template -struct redux_traits -{ -public: - typedef typename find_best_packet::type PacketType; - enum { - PacketSize = unpacket_traits::size, - InnerMaxSize = int(Derived::IsRowMajor) - ? Derived::MaxColsAtCompileTime - : Derived::MaxRowsAtCompileTime - }; - - enum { - MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) - && (functor_traits::PacketAccess), - MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), - MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize - }; - -public: - enum { - Traversal = int(MayLinearVectorize) ? int(LinearVectorizedTraversal) - : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) - : int(DefaultTraversal) - }; - -public: - enum { - Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost - : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, - UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) - }; - -public: - enum { - Unrolling = Cost <= UnrollingLimit ? 
CompleteUnrolling : NoUnrolling - }; - -#ifdef EIGEN_DEBUG_ASSIGN - static void debug() - { - std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; - std::cerr.setf(std::ios::hex, std::ios::basefield); - EIGEN_DEBUG_VAR(Derived::Flags) - std::cerr.unsetf(std::ios::hex); - EIGEN_DEBUG_VAR(InnerMaxSize) - EIGEN_DEBUG_VAR(PacketSize) - EIGEN_DEBUG_VAR(MightVectorize) - EIGEN_DEBUG_VAR(MayLinearVectorize) - EIGEN_DEBUG_VAR(MaySliceVectorize) - EIGEN_DEBUG_VAR(Traversal) - EIGEN_DEBUG_VAR(UnrollingLimit) - EIGEN_DEBUG_VAR(Unrolling) - std::cerr << std::endl; - } -#endif -}; - -/*************************************************************************** -* Part 2 : unrollers -***************************************************************************/ - -/*** no vectorization ***/ - -template -struct redux_novec_unroller -{ - enum { - HalfLength = Length/2 - }; - - typedef typename Derived::Scalar Scalar; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) - { - return func(redux_novec_unroller::run(mat,func), - redux_novec_unroller::run(mat,func)); - } -}; - -template -struct redux_novec_unroller -{ - enum { - outer = Start / Derived::InnerSizeAtCompileTime, - inner = Start % Derived::InnerSizeAtCompileTime - }; - - typedef typename Derived::Scalar Scalar; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) - { - return mat.coeffByOuterInner(outer, inner); - } -}; - -// This is actually dead code and will never be called. It is required -// to prevent false warnings regarding failed inlining though -// for 0 length run() will never be called at all. -template -struct redux_novec_unroller -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } -}; - -/*** vectorization ***/ - -template -struct redux_vec_unroller -{ - enum { - PacketSize = redux_traits::PacketSize, - HalfLength = Length/2 - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) - { - return func.packetOp( - redux_vec_unroller::run(mat,func), - redux_vec_unroller::run(mat,func) ); - } -}; - -template -struct redux_vec_unroller -{ - enum { - index = Start * redux_traits::PacketSize, - outer = index / int(Derived::InnerSizeAtCompileTime), - inner = index % int(Derived::InnerSizeAtCompileTime), - alignment = Derived::Alignment - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) - { - return mat.template packetByOuterInner(outer, inner); - } -}; - -/*************************************************************************** -* Part 3 : implementation of all cases -***************************************************************************/ - -template::Traversal, - int Unrolling = redux_traits::Unrolling -> -struct redux_impl; - -template -struct redux_impl -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) - { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - Scalar res; - res = mat.coeffByOuterInner(0, 0); - for(Index i = 1; i < mat.innerSize(); ++i) - res = func(res, mat.coeffByOuterInner(0, i)); - for(Index i = 1; i < 
mat.outerSize(); ++i) - for(Index j = 0; j < mat.innerSize(); ++j) - res = func(res, mat.coeffByOuterInner(i, j)); - return res; - } -}; - -template -struct redux_impl - : public redux_novec_unroller -{}; - -template -struct redux_impl -{ - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static Scalar run(const Derived &mat, const Func& func) - { - const Index size = mat.size(); - - const Index packetSize = redux_traits::PacketSize; - const int packetAlignment = unpacket_traits::alignment; - enum { - alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), - alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) - }; - const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); - const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); - const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); - const Index alignedEnd2 = alignedStart + alignedSize2; - const Index alignedEnd = alignedStart + alignedSize; - Scalar res; - if(alignedSize) - { - PacketScalar packet_res0 = mat.template packet(alignedStart); - if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop - { - PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); - for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) - { - packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); - packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); - } - - packet_res0 = func.packetOp(packet_res0,packet_res1); - if(alignedEnd>alignedEnd2) - packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); - } - res = func.predux(packet_res0); - - for(Index index = 0; index < alignedStart; ++index) - res = func(res,mat.coeff(index)); - - for(Index index = alignedEnd; index < size; ++index) - res = func(res,mat.coeff(index)); - } - else // too small to vectorize anything. - // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. 
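The two-accumulator trick in the `LinearVectorizedTraversal` branch above (`packet_res0`/`packet_res1`) exists to break the dependency chain between successive additions so the CPU can overlap them. The same idea in scalar form, as an illustrative sketch:

```cpp
#include <cstddef>

// Two independent accumulators: consecutive additions no longer form one
// long dependency chain. A vectorized version would use packets, not doubles.
double sum_two_accumulators(const double* x, std::size_t n) {
  double acc0 = 0.0, acc1 = 0.0;
  std::size_t i = 0;
  for (; i + 2 <= n; i += 2) {
    acc0 += x[i];
    acc1 += x[i + 1];
  }
  if (i < n) acc0 += x[i];  // leftover element
  return acc0 + acc1;
}
```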
- { - res = mat.coeff(0); - for(Index index = 1; index < size; ++index) - res = func(res,mat.coeff(index)); - } - - return res; - } -}; - -// NOTE: for SliceVectorizedTraversal we simply bypass unrolling -template -struct redux_impl -{ - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketType; - - EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) - { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - const Index innerSize = mat.innerSize(); - const Index outerSize = mat.outerSize(); - enum { - packetSize = redux_traits::PacketSize - }; - const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; - Scalar res; - if(packetedInnerSize) - { - PacketType packet_res = mat.template packet(0,0); - for(Index j=0; j(j,i)); - - res = func.predux(packet_res); - for(Index j=0; j::run(mat, func); - } - - return res; - } -}; - -template -struct redux_impl -{ - typedef typename Derived::Scalar Scalar; - - typedef typename redux_traits::PacketType PacketScalar; - enum { - PacketSize = redux_traits::PacketSize, - Size = Derived::SizeAtCompileTime, - VectorizedSize = (Size / PacketSize) * PacketSize - }; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) - { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - if (VectorizedSize > 0) { - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); - if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(mat,func)); - return res; - } - else { - return redux_novec_unroller::run(mat,func); - } - } -}; - -// evaluator adaptor -template -class redux_evaluator -{ -public: - typedef _XprType XprType; - EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; - - enum { - MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, - MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, - // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator - Flags = evaluator::Flags & ~DirectAccessBit, - IsRowMajor = XprType::IsRowMajor, - SizeAtCompileTime = XprType::SizeAtCompileTime, - InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, - CoeffReadCost = evaluator::CoeffReadCost, - Alignment = evaluator::Alignment - }; - - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index row, Index col) const - { return m_evaluator.coeff(row, col); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index index) const - { return m_evaluator.coeff(index); } - - template - PacketType packet(Index row, Index col) const - { return m_evaluator.template packet(row, col); } - - template - PacketType packet(Index index) const - { return m_evaluator.template packet(index); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeffByOuterInner(Index outer, Index inner) const - { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? 
inner : outer); } - - template - PacketType packetByOuterInner(Index outer, Index inner) const - { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } - - const XprType & nestedExpression() const { return m_xpr; } - -protected: - internal::evaluator m_evaluator; - const XprType &m_xpr; -}; - -} // end namespace internal - -/*************************************************************************** -* Part 4 : public API -***************************************************************************/ - - -/** \returns the result of a full redux operation on the whole matrix or vector using \a func - * - * The template parameter \a BinaryOp is the type of the functor \a func which must be - * an associative operator. Both current C++98 and C++11 functor styles are handled. - * - * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() - */ -template -template -EIGEN_STRONG_INLINE typename internal::traits::Scalar -DenseBase::redux(const Func& func) const -{ - eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - - typedef typename internal::redux_evaluator ThisEvaluator; - ThisEvaluator thisEval(derived()); - - return internal::redux_impl::run(thisEval, func); -} - -/** \returns the minimum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. - */ -template -EIGEN_STRONG_INLINE typename internal::traits::Scalar -DenseBase::minCoeff() const -{ - return derived().redux(Eigen::internal::scalar_min_op()); -} - -/** \returns the maximum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. - */ -template -EIGEN_STRONG_INLINE typename internal::traits::Scalar -DenseBase::maxCoeff() const -{ - return derived().redux(Eigen::internal::scalar_max_op()); -} - -/** \returns the sum of all coefficients of \c *this - * - * If \c *this is empty, then the value 0 is returned. - * - * \sa trace(), prod(), mean() - */ -template -EIGEN_STRONG_INLINE typename internal::traits::Scalar -DenseBase::sum() const -{ - if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) - return Scalar(0); - return derived().redux(Eigen::internal::scalar_sum_op()); -} - -/** \returns the mean of all coefficients of *this -* -* \sa trace(), prod(), sum() -*/ -template -EIGEN_STRONG_INLINE typename internal::traits::Scalar -DenseBase::mean() const -{ -#ifdef __INTEL_COMPILER - #pragma warning push - #pragma warning ( disable : 2259 ) -#endif - return Scalar(derived().redux(Eigen::internal::scalar_sum_op())) / Scalar(this->size()); -#ifdef __INTEL_COMPILER - #pragma warning pop -#endif -} - -/** \returns the product of all coefficients of *this - * - * Example: \include MatrixBase_prod.cpp - * Output: \verbinclude MatrixBase_prod.out - * - * \sa sum(), mean(), trace() - */ -template -EIGEN_STRONG_INLINE typename internal::traits::Scalar -DenseBase::prod() const -{ - if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) - return Scalar(1); - return derived().redux(Eigen::internal::scalar_product_op()); -} - -/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal. - * - * \c *this can be any matrix, not necessarily square. 
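For reference, the entry points defined in this deleted `Redux.h` are ordinary public Eigen API; a small usage sketch that works against any recent Eigen:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d m;
  m << 1, 2, 3,
       4, 5, 6,
       7, 8, 9;
  std::cout << m.sum()      << "\n"   // 45
            << m.prod()     << "\n"   // 362880
            << m.mean()     << "\n"   // 5
            << m.minCoeff() << "\n"   // 1
            << m.maxCoeff() << "\n"   // 9
            << m.trace()    << "\n";  // 15
  // redux() with a custom associative functor (C++11 lambda):
  std::cout << m.redux([](double a, double b) { return a + b; }) << "\n";
  return 0;
}
```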
- * - * \sa diagonal(), sum() - */ -template -EIGEN_STRONG_INLINE typename internal::traits::Scalar -MatrixBase::trace() const -{ - return derived().diagonal().sum(); -} - -} // end namespace Eigen - -#endif // EIGEN_REDUX_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/MathFunctions.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/MathFunctions.h deleted file mode 100644 index 6af67ce2d65..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/MathFunctions.h +++ /dev/null @@ -1,439 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_MATH_FUNCTIONS_AVX_H -#define EIGEN_MATH_FUNCTIONS_AVX_H - -/* The sin, cos, exp, and log functions of this file are loosely derived from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - -namespace Eigen { - -namespace internal { - -inline Packet8i pshiftleft(Packet8i v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(v, n); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} - -inline Packet8f pshiftright(Packet8f v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); - return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif -} - -// Sine function -// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and -// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants -// are (anti-)symmetric and thus have only odd/even coefficients -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -psin(const Packet8f& _x) { - Packet8f x = _x; - - // Some useful values. - _EIGEN_DECLARE_CONST_Packet8i(one, 1); - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); - _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); - - // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. - Packet8f z = pmul(x, p8f_one_over_pi); - Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); - x = pmadd(shift, p8f_neg_pi_first, x); - x = pmadd(shift, p8f_neg_pi_second, x); - x = pmadd(shift, p8f_neg_pi_third, x); - z = pmul(x, p8f_four_over_pi); - - // Make a mask for the entries that need flipping, i.e. wherever the shift - // is odd. 
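A scalar model of the argument reduction just described (a hypothetical helper, with `std::sin` standing in for the two polynomial interpolants the kernel selects between):

```cpp
#include <cmath>

// Choose an integer shift so that r = x - shift*pi lies in [-pi/4, 3*pi/4),
// then use sin(x) = (-1)^shift * sin(r): the sign flips on odd shifts.
double sin_reduced_model(double x) {
  const double pi = 3.141592653589793;
  const double shift = std::floor(x / pi + 0.25);
  const double r = x - shift * pi;
  const double s = std::sin(r);
  return (static_cast<long long>(shift) & 1LL) ? -s : s;
}
```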
- Packet8i shift_ints = _mm256_cvtps_epi32(shift); - Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); - Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); - - // Create a mask for which interpolant to use, i.e. if z > 1, then the mask - // is set to ones for that entry. - Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); - - // Evaluate the polynomial for the interval [1,3] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); - Packet8f z_minus_two = psub(z, p8f_two); - Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); - Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); - right = pmadd(right, z_minus_two2, p8f_coeff_right_2); - right = pmadd(right, z_minus_two2, p8f_coeff_right_0); - - // Evaluate the polynomial for the interval [-1,1] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); - Packet8f z2 = pmul(z, z); - Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); - left = pmadd(left, z2, p8f_coeff_left_3); - left = pmadd(left, z2, p8f_coeff_left_1); - left = pmul(left, z); - - // Assemble the results, i.e. select the left and right polynomials. - left = _mm256_andnot_ps(ival_mask, left); - right = _mm256_and_ps(ival_mask, right); - Packet8f res = _mm256_or_ps(left, right); - - // Flip the sign on the odd intervals and return the result. - res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); - return res; -} - -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -// TODO(gonnet): Further reduce the interval allowing for lower-degree -// polynomial interpolants -> ... -> profit! -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -plog(const Packet8f& _x) { - Packet8f x = _x; - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); - - // Polynomial coefficients. 
- _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); - - Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN - Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p8f_min_norm_pos); - - Packet8f emm0 = pshiftright(x,23); - Packet8f e = _mm256_sub_ps(emm0, p8f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm256_and_ps(x, p8f_inv_mant_mask); - x = _mm256_or_ps(x, p8f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); - Packet8f tmp = _mm256_and_ps(x, mask); - x = psub(x, p8f_1); - e = psub(e, _mm256_and_ps(p8f_1, mask)); - x = padd(x, tmp); - - Packet8f x2 = pmul(x, x); - Packet8f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet8f y, y1, y2; - y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); - y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); - y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); - y = pmadd(y, x, p8f_cephes_log_p2); - y1 = pmadd(y1, x, p8f_cephes_log_p5); - y2 = pmadd(y2, x, p8f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p8f_cephes_log_q1); - tmp = pmul(x2, p8f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p8f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm256_or_ps( - _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), - _mm256_and_ps(iszero_mask, p8f_minus_inf)); -} - -// Exponential function. Works by writing "x = m*log(2) + r" where -// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then -// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). 
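The `pexp` strategy described above is the classic range reduction; a scalar model under the same decomposition, with a plain Taylor polynomial standing in for the tuned cephes coefficients in the real kernel:

```cpp
#include <cmath>

// Write x = m*ln(2) + r with m = floor(x/ln(2) + 1/2), approximate exp(r)
// on the small remainder interval, then scale by 2^m.
double exp_model(double x) {
  const double ln2 = 0.6931471805599453;
  const double m = std::floor(x / ln2 + 0.5);
  const double r = x - m * ln2;  // |r| <= ln(2)/2, about 0.347
  const double er = 1.0 + r * (1.0 + r * (0.5 + r * (1.0 / 6.0 +
                    r * (1.0 / 24.0 + r * (1.0 / 120.0)))));
  return std::ldexp(er, static_cast<int>(m));  // 2^m * exp(r)
}
```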
-template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -pexp(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); - - _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); - - // Clamp x. - Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); - -// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is -// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating -// truncation errors. Note that we don't use the "pmadd" function here to -// ensure that a precision-preserving FMA instruction is used. -#ifdef EIGEN_VECTORIZE_FMA - _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); - Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); -#else - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); - Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); - r = psub(r, pmul(m, p8f_cephes_exp_C2)); -#endif - - Packet8f r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet8f y = p8f_cephes_exp_p0; - y = pmadd(y, r, p8f_cephes_exp_p1); - y = pmadd(y, r, p8f_cephes_exp_p2); - y = pmadd(y, r, p8f_cephes_exp_p3); - y = pmadd(y, r, p8f_cephes_exp_p4); - y = pmadd(y, r, p8f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p8f_1); - - // Build emm0 = 2^m. - Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); - emm0 = pshiftleft(emm0, 23); - - // Return 2^m * exp(r). - return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); -} - -// Hyperbolic Tangent function. 
-template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -ptanh(const Packet8f& x) { - return internal::generic_fast_tanh_float(x); -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d -pexp(const Packet4d& _x) { - Packet4d x = _x; - - _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); - _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); - _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - - Packet4d tmp, fx; - - // clamp x - x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); - // Express exp(x) as exp(g + n*log(2)). - fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); - - // Get the integer modulus of log(2), i.e. the "n" described above. - fx = _mm256_floor_pd(fx); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - tmp = pmul(fx, p4d_cephes_exp_C1); - Packet4d z = pmul(fx, p4d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet4d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. - Packet4d px = p4d_cephes_exp_p0; - px = pmadd(px, x2, p4d_cephes_exp_p1); - px = pmadd(px, x2, p4d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet4d qx = p4d_cephes_exp_q0; - qx = pmadd(qx, x2, p4d_cephes_exp_q1); - qx = pmadd(qx, x2, p4d_cephes_exp_q2); - qx = pmadd(qx, x2, p4d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm256_div_pd(px, psub(qx, px)); - x = pmadd(p4d_2, x, p4d_1); - - // Build e=2^n by constructing the exponents in a 128-bit vector and - // shifting them to where they belong in double-precision values. - __m128i emm0 = _mm256_cvtpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i lo = _mm_slli_epi64(emm0, 52); - __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); - __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); - e = _mm256_insertf128_si256(e, hi, 1); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); -} - -// Functions for sqrt. -// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step -// of Newton's method, at a cost of 1-2 bits of precision as opposed to the -// exact solution. It does not handle +inf, or denormalized numbers correctly. 
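Before the fast-math implementation below: the path it describes refines a hardware reciprocal-square-root estimate with a single Newton step. A scalar model of that step (hypothetical helper):

```cpp
// One Newton step on f(y) = 1/y^2 - x, which roughly doubles the number of
// correct bits in the estimate y0:  y <- y * (1.5 - 0.5 * x * y * y).
// sqrt(x) is then recovered as x * rsqrt(x).
float rsqrt_newton_model(float x, float y0 /* rough hardware estimate */) {
  return y0 * (1.5f - 0.5f * x * y0 * y0);
}
```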
-// The main advantage of this approach is not just speed, but also the fact that -// it can be inlined and pipelined with other computations, further reducing its -// effective latency. This is similar to Quake3's fast inverse square root. -// For detail see here: http://www.beyond3d.com/content/articles/8/ -#if EIGEN_FAST_MATH -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -psqrt(const Packet8f& _x) { - Packet8f half = pmul(_x, pset1(.5f)); - Packet8f denormal_mask = _mm256_and_ps( - _mm256_cmp_ps(_x, pset1((std::numeric_limits::min)()), - _CMP_LT_OQ), - _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ)); - - // Compute approximate reciprocal sqrt. - Packet8f x = _mm256_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); - // Flush results for denormals to zero. - return _mm256_andnot_ps(denormal_mask, pmul(_x,x)); -} -#else -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f psqrt(const Packet8f& x) { - return _mm256_sqrt_ps(x); -} -#endif -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d psqrt(const Packet4d& x) { - return _mm256_sqrt_pd(x); -} -#if EIGEN_FAST_MATH - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); - _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); - - Packet8f neg_half = pmul(_x, p8f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); - Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. - Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); - Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); - Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), - _mm256_and_ps(zero_mask, p8f_inf)); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm256_or_ps(x, infs_and_nans); -} - -#else -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& x) { - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x)); -} -#endif - -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d prsqrt(const Packet4d& x) { - _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); - return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x)); -} - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_MATH_FUNCTIONS_AVX_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/PacketMath.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/PacketMath.h deleted file mode 100644 index 923a124b206..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/PacketMath.h +++ /dev/null @@ -1,637 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_AVX_H -#define EIGEN_PACKET_MATH_AVX_H - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 -#endif - -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) -#endif - -#ifdef __FMA__ -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#endif -#endif - -typedef __m256 Packet8f; -typedef __m256i Packet8i; -typedef __m256d Packet4d; - -template<> struct is_arithmetic<__m256> { enum { value = true }; }; -template<> struct is_arithmetic<__m256i> { enum { value = true }; }; -template<> struct is_arithmetic<__m256d> { enum { value = true }; }; - -#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ - const Packet8f p8f_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ - const Packet4d p4d_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \ - const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1(X)) - -#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \ - const Packet8i p8i_##NAME = pset1(X) - -// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going -// to leverage AVX512 instructions. -#ifndef EIGEN_VECTORIZE_AVX512 -template<> struct packet_traits : default_packet_traits -{ - typedef Packet8f type; - typedef Packet4f half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=8, - HasHalfPacket = 1, - - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasBlend = 1, - HasRound = 1, - HasFloor = 1, - HasCeil = 1 - }; -}; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4d type; - typedef Packet2d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 1, - - HasDiv = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasBlend = 1, - HasRound = 1, - HasFloor = 1, - HasCeil = 1 - }; -}; -#endif - -template<> struct scalar_div_cost { enum { value = 14 }; }; -template<> struct scalar_div_cost { enum { value = 16 }; }; - -/* Proper support for integers is only provided by AVX2. In the meantime, we'll - use SSE instructions and packets to deal with integers. 
-template<> struct packet_traits : default_packet_traits -{ - typedef Packet8i type; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=8 - }; -}; -*/ - -template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; - -template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } -template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } -template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } - -template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } -template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } - -template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } -template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } - -template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) -{ - return _mm256_sub_ps(_mm256_set1_ps(0.0),a); -} -template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) -{ - return _mm256_sub_pd(_mm256_set1_pd(0.0),a); -} - -template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet8f pmul(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmul(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } - - -template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, const Packet8i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by AVX"); - return pset1(0); -} - -#ifdef __FMA__ -template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { -#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) - // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, - // and even register spilling with clang>=6.0 (bug 1637). - // Gcc stupidly generates a vfmadd132ps instruction. - // So let's enforce it to generate a vfmadd231ps instruction since the most common use - // case is to accumulate the result of the product. 
- Packet8f res = c; - __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - return _mm256_fmadd_ps(a,b,c); -#endif -} -template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) - // see above - Packet4d res = c; - __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - return _mm256_fmadd_pd(a,b,c); -#endif -} -#endif - -template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } -template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } - -template<> EIGEN_STRONG_INLINE Packet8f pceil(const Packet8f& a) { return _mm256_ceil_ps(a); } -template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { return _mm256_ceil_pd(a); } - -template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } -template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } - -template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet8f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } -template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet8i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast(from)); } - -template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); } -template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } -template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } - -// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} -template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) -{ - 
-  // TODO try to find a way to avoid the need of a temporary register
-//  Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
-//  tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
-//  return _mm256_unpacklo_ps(tmp,tmp);
-
-  // _mm256_insertf128_ps is very slow on Haswell, thus:
-  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
-  // mimic an "inplace" permutation of the lower 128bits using a blend
-  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
-  // then we can perform a consistent permutation on the global register to get everything in shape:
-  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
-}
-// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
-{
-  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
-  return _mm256_permute_pd(tmp, 3<<2);
-}
-
-// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
-{
-  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
-  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
-
-// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
-// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
-template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
-{
-  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
-                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
-{
-  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
-{
-  __m128 low = _mm256_extractf128_ps(from, 0);
-  to[stride*0] = _mm_cvtss_f32(low);
-  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
-  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
-  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
-
-  __m128 high = _mm256_extractf128_ps(from, 1);
-  to[stride*4] = _mm_cvtss_f32(high);
-  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
-  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
-  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
-}
-template<> EIGEN_DEVICE_FUNC inline void
pscatter(double* to, const Packet4d& from, Index stride) -{ - __m128d low = _mm256_extractf128_pd(from, 0); - to[stride*0] = _mm_cvtsd_f64(low); - to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); - __m128d high = _mm256_extractf128_pd(from, 1); - to[stride*2] = _mm_cvtsd_f64(high); - to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)); -} - -template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) -{ - Packet8f pa = pset1(a); - pstore(to, pa); -} -template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) -{ - Packet4d pa = pset1(a); - pstore(to, pa); -} -template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) -{ - Packet8i pa = pset1(a); - pstore(to, pa); -} - -#ifndef EIGEN_VECTORIZE_AVX512 -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -#endif - -template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { - return _mm_cvtss_f32(_mm256_castps256_ps128(a)); -} -template<> EIGEN_STRONG_INLINE double pfirst(const Packet4d& a) { - return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); -} -template<> EIGEN_STRONG_INLINE int pfirst(const Packet8i& a) { - return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); -} - - -template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) -{ - __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); - return _mm256_permute2f128_ps(tmp, tmp, 1); -} -template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) -{ - __m256d tmp = _mm256_shuffle_pd(a,a,5); - return _mm256_permute2f128_pd(tmp, tmp, 1); - #if 0 - // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd - // exhibit the same latency/throughput, but it is here for future reference/benchmarking... - __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); - return _mm256_permute_pd(swap_halves,5); - #endif -} - -// pabs should be ok -template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) -{ - const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); - return _mm256_and_ps(a,mask); -} -template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) -{ - const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); - return _mm256_and_pd(a,mask); -} - -// preduxp should be ok -// FIXME: why is this ok? why isn't the simply implementation working as expected? 
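To make the FIXME above easier to audit: preduxp maps N input packets to one output packet whose i-th lane holds the full horizontal sum of vecs[i]. A scalar model of that contract (an illustrative stand-in, not Eigen code):

#include <cstdio>

// Scalar model of preduxp's contract: out[i] is the horizontal sum of the
// i-th input vector. The AVX versions below compute the same result with
// hadd/permute/blend sequences instead of loops.
void preduxp_model(const float vecs[8][8], float out[8]) {
  for (int i = 0; i < 8; ++i) {
    out[i] = 0.0f;
    for (int j = 0; j < 8; ++j) out[i] += vecs[i][j];
  }
}

int main() {
  float vecs[8][8];
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j) vecs[i][j] = static_cast<float>(i + 1);
  float out[8];
  preduxp_model(vecs, out);
  for (float v : out) std::printf("%g ", v);  // 8 16 24 ... 64
  std::printf("\n");
  return 0;
}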
-template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) -{ - __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); - __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); - __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); - __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); - - __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); - __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); - __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); - __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - __m256 sum1 = _mm256_add_ps(perm1, hsum5); - __m256 sum2 = _mm256_add_ps(perm2, hsum6); - __m256 sum3 = _mm256_add_ps(perm3, hsum7); - __m256 sum4 = _mm256_add_ps(perm4, hsum8); - - __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); - return final; -} -template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) -{ - Packet4d tmp0, tmp1; - - tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - return _mm256_blend_pd(tmp0, tmp1, 0xC); -} - -template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) -{ - return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)))); -} -template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) -{ - return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); -} - -template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) -{ - return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); -} - -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) -{ - Packet8f tmp; - tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); - tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); - return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); -} -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) -{ - Packet4d tmp; - tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); - return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); -} - -template<> EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) -{ - Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); - tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); - return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); -} -template<> EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) -{ - Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); - return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); -} - -template<> EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) -{ - Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); - tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); - return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); -} - -template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) -{ - Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); - return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); -} - - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& 
second) - { - if (Offset==1) - { - first = _mm256_blend_ps(first, second, 1); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0x88); - } - else if (Offset==2) - { - first = _mm256_blend_ps(first, second, 3); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xcc); - } - else if (Offset==3) - { - first = _mm256_blend_ps(first, second, 7); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xee); - } - else if (Offset==4) - { - first = _mm256_blend_ps(first, second, 15); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); - } - else if (Offset==5) - { - first = _mm256_blend_ps(first, second, 31); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0x88); - } - else if (Offset==6) - { - first = _mm256_blend_ps(first, second, 63); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xcc); - } - else if (Offset==7) - { - first = _mm256_blend_ps(first, second, 127); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xee); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) - { - if (Offset==1) - { - first = _mm256_blend_pd(first, second, 1); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 0xA); - } - else if (Offset==2) - { - first = _mm256_blend_pd(first, second, 3); - first = _mm256_permute2f128_pd(first, first, 1); - } - else if (Offset==3) - { - first = _mm256_blend_pd(first, second, 7); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 5); - } - } -}; - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); - __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); - __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); - __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); - __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); - __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); - __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); - __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); - __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); - __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); - __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); - __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); - __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); - __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); - __m256 S6 = 
_mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); - __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); - kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); - kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); - kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); - kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); - kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); - kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); - kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); - kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); - __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); - __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); - __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); - - __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); - __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); - __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); - __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); - - kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); - kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); - kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); - kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); - __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); - __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); - __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); - - kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); - kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); - kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); - kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); -} - -template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { - const __m256 zero = _mm256_setzero_ps(); - const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); - return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); -} -template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { - const __m256d zero = _mm256_setzero_pd(); - const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); - return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); -} - -template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) -{ - return _mm256_blend_ps(a,pset1(b),1); -} - -template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) -{ - return _mm256_blend_pd(a,pset1(b),1); -} - -template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) -{ - return _mm256_blend_ps(a,pset1(b),(1<<7)); -} - -template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) -{ - return _mm256_blend_pd(a,pset1(b),(1<<3)); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // 
EIGEN_PACKET_MATH_AVX_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/TypeCasting.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/TypeCasting.h deleted file mode 100644 index 83bfdc604be..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX/TypeCasting.h +++ /dev/null @@ -1,51 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_TYPE_CASTING_AVX_H -#define EIGEN_TYPE_CASTING_AVX_H - -namespace Eigen { - -namespace internal { - -// For now we use SSE to handle integers, so we can't use AVX instructions to cast -// from int to float -template <> -struct type_casting_traits { - enum { - VectorizedCast = 0, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 0, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - - - -template<> EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { - return _mm256_cvtps_epi32(a); -} - -template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i& a) { - return _mm256_cvtepi32_ps(a); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_AVX_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX512/MathFunctions.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX512/MathFunctions.h deleted file mode 100644 index b259c1e1f92..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ /dev/null @@ -1,389 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ -#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ - -namespace Eigen { - -namespace internal { - -// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) - -#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ - const Packet16f p16f_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ - const Packet16f p16f_##NAME = (__m512)pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ - const Packet8d p8d_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ - const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) - - -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -#if defined(EIGEN_VECTORIZE_AVX512DQ) -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f -plog(const Packet16f& _x) { - Packet16f x = _x; - _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. 
- _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); - - // Polynomial coefficients. - _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); - - // invalid_mask is set to true when x is NaN - __mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); - __mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p16f_min_norm_pos); - - // Extract the shifted exponents. - Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23)); - Packet16f e = _mm512_sub_ps(emm0, p16f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm512_and_ps(x, p16f_inv_mant_mask); - x = _mm512_or_ps(x, p16f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); - Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x); - x = psub(x, p16f_1); - e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1)); - x = padd(x, tmp); - - Packet16f x2 = pmul(x, x); - Packet16f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet16f y, y1, y2; - y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1); - y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4); - y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7); - y = pmadd(y, x, p16f_cephes_log_p2); - y1 = pmadd(y1, x, p16f_cephes_log_p5); - y2 = pmadd(y2, x, p16f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p16f_cephes_log_q1); - tmp = pmul(x2, p16f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p16f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ); - // Filter out invalid inputs, i.e.: - // - negative arg will be NAN, - // - 0 will be -INF. - // - +INF will be +INF - return _mm512_mask_blend_ps(iszero_mask, - _mm512_mask_blend_ps(invalid_mask, - _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf), - p16f_nan), - p16f_minus_inf); -} - -#endif - -// Exponential function. 
Works by writing "x = m*log(2) + r" where -// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then -// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f -pexp(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f); - - _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f); - - _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f); - - // Clamp x. - Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half)); - - // Get r = x - m*ln(2). Note that we can do this without losing more than one - // ulp precision due to the FMA instruction. - _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f); - Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x); - Packet16f r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet16f y = p16f_cephes_exp_p0; - y = pmadd(y, r, p16f_cephes_exp_p1); - y = pmadd(y, r, p16f_cephes_exp_p2); - y = pmadd(y, r, p16f_cephes_exp_p3); - y = pmadd(y, r, p16f_cephes_exp_p4); - y = pmadd(y, r, p16f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p16f_1); - - // Build emm0 = 2^m. - Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127)); - emm0 = _mm512_slli_epi32(emm0, 23); - - // Return 2^m * exp(r). - return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); -} - -/*template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d -pexp(const Packet8d& _x) { - Packet8d x = _x; - - _EIGEN_DECLARE_CONST_Packet8d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet8d(2, 2.0); - - _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6); - - // clamp x - x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo); - - // Express exp(x) as exp(g + n*log(2)). - const Packet8d n = - _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. 
n*C1 + n*C2, C1+C2=log2 to get the last
-// digits right.
-  const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
-  const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
-  x = psub(x, nC1);
-  x = psub(x, nC2);
-
-  const Packet8d x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial of the rational interpolant.
-  Packet8d px = p8d_cephes_exp_p0;
-  px = pmadd(px, x2, p8d_cephes_exp_p1);
-  px = pmadd(px, x2, p8d_cephes_exp_p2);
-  px = pmul(px, x);
-
-  // Evaluate the denominator polynomial of the rational interpolant.
-  Packet8d qx = p8d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p8d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q3);
-
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
-  x = _mm512_div_pd(px, psub(qx, px));
-  x = pmadd(p8d_2, x, p8d_1);
-
-  // Build e=2^n.
-  const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
-      _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
-
-  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
-  // non-finite values in the input.
-  return pmax(pmul(x, e), _x);
-  }*/
-
-// Functions for sqrt.
-// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
-// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
-// exact solution. The main advantage of this approach is not just speed, but
-// also the fact that it can be inlined and pipelined with other computations,
-// further reducing its effective latency.
-#if EIGEN_FAST_MATH
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
-psqrt<Packet16f>(const Packet16f& _x) {
-  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
-  __mmask16 denormal_mask = _mm512_kand(
-      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
-                         _CMP_LT_OQ),
-      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
-
-  Packet16f x = _mm512_rsqrt14_ps(_x);
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
-
-  // Flush results for denormals to zero.
-  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
-psqrt<Packet8d>(const Packet8d& _x) {
-  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
-  __mmask16 denormal_mask = _mm512_kand(
-      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
-                         _CMP_LT_OQ),
-      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
-
-  Packet8d x = _mm512_rsqrt14_pd(_x);
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
-
-  // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
-
-  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
-}
-#else
-template <>
-EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
-  return _mm512_sqrt_ps(x);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
-  return _mm512_sqrt_pd(x);
-}
-#endif
-
-// Functions for rsqrt.
-// Almost identical to the sqrt routine, just leave out the last multiplication
-// and fill in NaN/Inf where needed. Note that this function only exists as an
-// iterative version for doubles since there is no instruction for directly
-// computing the reciprocal square root in AVX-512.
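Concretely, the refinement used by the fast psqrt/prsqrt kernels is Newton's method on f(y) = 1/y^2 - x, whose update y <- y*(1.5 - 0.5*x*y*y) needs no division or square root and roughly doubles the number of correct bits per step. A scalar sketch (illustrative helper, not Eigen API):

#include <cmath>
#include <cstdio>

// One Newton step on f(y) = 1/(y*y) - x: y <- y * (1.5 - 0.5*x*y*y).
// Starting from a coarse hardware estimate of 1/sqrt(x), one step suffices
// for the float kernels; the double kernels above apply it twice.
double refine_rsqrt(double x, double y) {
  return y * (1.5 - 0.5 * x * y * y);
}

int main() {
  double x = 2.0;
  double y = 0.7;                 // crude seed for 1/sqrt(2) ~= 0.70710678
  y = refine_rsqrt(x, y);         // first step
  y = refine_rsqrt(x, y);         // second step
  std::printf("%.12f vs %.12f\n", y, 1.0 / std::sqrt(x));
  return 0;
}

Multiplying the refined estimate by x once more yields sqrt(x), which is why psqrt and prsqrt differ only in that final product.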
-#ifdef EIGEN_FAST_MATH -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f -prsqrt(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); - _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); - - Packet16f neg_half = pmul(_x, p16f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); - - // Fill in NaNs and Infs for the negative/zero entries. - __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); - Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d -prsqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL); - _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); - _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); - - Packet8d neg_half = pmul(_x, p8d_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); - - // Fill in NaNs and Infs for the negative/zero entries. - __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); - Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); - - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); - - // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); -} -#elif defined(EIGEN_VECTORIZE_AVX512ER) -template <> -EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { - return _mm512_rsqrt28_ps(x); -} -#endif -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX512/PacketMath.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX512/PacketMath.h deleted file mode 100644 index 000b7762ff2..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/AVX512/PacketMath.h +++ /dev/null @@ -1,1305 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_AVX512_H -#define EIGEN_PACKET_MATH_AVX512_H - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 -#endif - -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 -#endif - -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#endif -#endif - -typedef __m512 Packet16f; -typedef __m512i Packet16i; -typedef __m512d Packet8d; - -template <> -struct is_arithmetic<__m512> { - enum { value = true }; -}; -template <> -struct is_arithmetic<__m512i> { - enum { value = true }; -}; -template <> -struct is_arithmetic<__m512d> { - enum { value = true }; -}; - -template<> struct packet_traits : default_packet_traits -{ - typedef Packet16f type; - typedef Packet8f half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 1, - HasBlend = 0, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) -#ifdef EIGEN_VECTORIZE_AVX512DQ - HasLog = 1, -#endif - HasExp = 1, - HasSqrt = EIGEN_FAST_MATH, - HasRsqrt = EIGEN_FAST_MATH, -#endif - HasDiv = 1 - }; - }; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet8d type; - typedef Packet4d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) - HasSqrt = EIGEN_FAST_MATH, - HasRsqrt = EIGEN_FAST_MATH, -#endif - HasDiv = 1 - }; -}; - -/* TODO Implement AVX512 for integers -template<> struct packet_traits : default_packet_traits -{ - typedef Packet16i type; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=8 - }; -}; -*/ - -template <> -struct unpacket_traits { - typedef float type; - typedef Packet8f half; - typedef Packet16i integer_packet; - enum { size = 16, alignment=Aligned64 }; -}; -template <> -struct unpacket_traits { - typedef double type; - typedef Packet4d half; - enum { size = 8, alignment=Aligned64 }; -}; -template <> -struct unpacket_traits { - typedef int type; - typedef Packet8i half; - enum { size = 16, alignment=Aligned64 }; -}; - -template <> -EIGEN_STRONG_INLINE Packet16f pset1(const float& from) { - return _mm512_set1_ps(from); -} -template <> -EIGEN_STRONG_INLINE Packet8d pset1(const double& from) { - return _mm512_set1_pd(from); -} -template <> -EIGEN_STRONG_INLINE Packet16i pset1(const int& from) { - return _mm512_set1_epi32(from); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { - return _mm512_broadcastss_ps(_mm_load_ps1(from)); -} -template <> -EIGEN_STRONG_INLINE Packet8d pload1(const double* from) { - return _mm512_set1_pd(*from); -} - -template <> -EIGEN_STRONG_INLINE Packet16f plset(const float& a) { - return _mm512_add_ps( - _mm512_set1_ps(a), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, - 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); -} -template <> -EIGEN_STRONG_INLINE Packet8d plset(const double& a) { - return _mm512_add_pd(_mm512_set1_pd(a), - _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)); -} - -template <> -EIGEN_STRONG_INLINE Packet16f padd(const Packet16f& a, - const Packet16f& b) { - return _mm512_add_ps(a, b); -} -template <> -EIGEN_STRONG_INLINE Packet8d padd(const Packet8d& a, - const Packet8d& b) { - return _mm512_add_pd(a, b); -} -template <> -EIGEN_STRONG_INLINE 
Packet16i padd(const Packet16i& a, - const Packet16i& b) { - return _mm512_add_epi32(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16f psub(const Packet16f& a, - const Packet16f& b) { - return _mm512_sub_ps(a, b); -} -template <> -EIGEN_STRONG_INLINE Packet8d psub(const Packet8d& a, - const Packet8d& b) { - return _mm512_sub_pd(a, b); -} -template <> -EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, - const Packet16i& b) { - return _mm512_sub_epi32(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { - return _mm512_sub_ps(_mm512_set1_ps(0.0), a); -} -template <> -EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { - return _mm512_sub_pd(_mm512_set1_pd(0.0), a); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) { - return a; -} -template <> -EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) { - return a; -} -template <> -EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE Packet16f pmul(const Packet16f& a, - const Packet16f& b) { - return _mm512_mul_ps(a, b); -} -template <> -EIGEN_STRONG_INLINE Packet8d pmul(const Packet8d& a, - const Packet8d& b) { - return _mm512_mul_pd(a, b); -} -template <> -EIGEN_STRONG_INLINE Packet16i pmul(const Packet16i& a, - const Packet16i& b) { - return _mm512_mul_epi32(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, - const Packet16f& b) { - return _mm512_div_ps(a, b); -} -template <> -EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, - const Packet8d& b) { - return _mm512_div_pd(a, b); -} - -#ifdef EIGEN_VECTORIZE_FMA -template <> -EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, - const Packet16f& c) { - return _mm512_fmadd_ps(a, b, c); -} -template <> -EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, - const Packet8d& c) { - return _mm512_fmadd_pd(a, b, c); -} -#endif - -template <> -EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, - const Packet16f& b) { - // Arguments are reversed to match NaN propagation behavior of std::min. - return _mm512_min_ps(b, a); -} -template <> -EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, - const Packet8d& b) { - // Arguments are reversed to match NaN propagation behavior of std::min. - return _mm512_min_pd(b, a); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, - const Packet16f& b) { - // Arguments are reversed to match NaN propagation behavior of std::max. - return _mm512_max_ps(b, a); -} -template <> -EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, - const Packet8d& b) { - // Arguments are reversed to match NaN propagation behavior of std::max. 
- return _mm512_max_pd(b, a); -} - -#ifdef EIGEN_VECTORIZE_AVX512DQ -template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); } -template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); } -EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } -#else -// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 -template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { - return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_)); -} - -// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512 -template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { - return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_)); -} - -EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { - return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), - _mm256_castps_si256(b),1)); -} -#endif - -// Helper function for bit packing snippet of low precision comparison. -// It packs the flags from 32x16 to 16x16. -EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) { - // Split data into small pieces and handle with AVX instructions - // to guarantee internal order of vector. - // Operation: - // dst[15:0] := Saturate16(rf[31:0]) - // dst[31:16] := Saturate16(rf[63:32]) - // ... - // dst[255:240] := Saturate16(rf[255:224]) - __m256i lo = _mm256_castps_si256(extract256<0>(rf)); - __m256i hi = _mm256_castps_si256(extract256<1>(rf)); - __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), - _mm256_extractf128_si256(lo, 1)); - __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), - _mm256_extractf128_si256(hi, 1)); - return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); -} - -template <> -EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, - const Packet16i& b) { - return _mm512_and_si512(a,b); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, - const Packet16f& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_and_ps(a, b); -#else - return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b))); -#endif -} -template <> -EIGEN_STRONG_INLINE Packet8d pand(const Packet8d& a, - const Packet8d& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_and_pd(a, b); -#else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet16i por(const Packet16i& a, const Packet16i& b) { - return _mm512_or_si512(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_or_ps(a, b); -#else - return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b))); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet8d por(const Packet8d& a, - const Packet8d& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_or_pd(a, b); -#else - return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet16i pxor(const 
Packet16i& a, const Packet16i& b) { - return _mm512_xor_si512(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_xor_ps(a, b); -#else - return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b))); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, const Packet8d& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_xor_pd(a, b); -#else - return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet16i pandnot(const Packet16i& a, const Packet16i& b) { - return _mm512_andnot_si512(b, a); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, const Packet16f& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_ps(b, a); -#else - return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b))); -#endif -} -template <> -EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a,const Packet8d& b) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_pd(b, a); -#else - return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); -#endif -} - -template EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) { - return _mm512_srai_epi32(a, N); -} - -template EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) { - return _mm512_srli_epi32(a, N); -} - -template EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) { - return _mm512_slli_epi32(a, N); -} - -template <> -EIGEN_STRONG_INLINE Packet16f pload(const float* from) { - EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from); -} -template <> -EIGEN_STRONG_INLINE Packet8d pload(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from); -} -template <> -EIGEN_STRONG_INLINE Packet16i pload(const int* from) { - EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( - reinterpret_cast(from)); -} - -template <> -EIGEN_STRONG_INLINE Packet16f ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from); -} -template <> -EIGEN_STRONG_INLINE Packet8d ploadu(const double* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from); -} -template <> -EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( - reinterpret_cast(from)); -} - -// Loads 8 floats from memory a returns the packet -// {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} -template <> -EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { - // an unaligned load is required here as there is no requirement - // on the alignment of input pointer 'from' - __m256i low_half = _mm256_loadu_si256(reinterpret_cast(from)); - __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half)); - __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); - return pairs; -} - -#ifdef EIGEN_VECTORIZE_AVX512DQ -// FIXME: this does not look optimal, better load a Packet4d and shuffle... 
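The contract of the ploaddup family is easiest to see in scalar form: every source element is repeated twice in the result. The loop below (an illustrative stand-in, not Eigen code) produces the same {a0, a0, a1, a1, ...} layout that the float kernel above and the double kernels below assemble in-register:

#include <cstdio>

// Scalar model of ploaddup's result: each of the n source elements
// appears twice, yielding {a0, a0, a1, a1, ..., a(n-1), a(n-1)}.
void load_dup(const float* from, float* out, int n) {
  for (int i = 0; i < n; ++i) {
    out[2 * i]     = from[i];
    out[2 * i + 1] = from[i];
  }
}

int main() {
  float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float out[16];
  load_dup(a, out, 8);
  for (float v : out) std::printf("%g ", v);  // 0 0 1 1 2 2 ... 7 7
  std::printf("\n");
  return 0;
}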
-
-// Loads 4 doubles from memory and returns the packet
-// {a0, a0, a1, a1, a2, a2, a3, a3}
-template <>
-EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
-  __m512d x = _mm512_setzero_pd();
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
-  return x;
-}
-#else
-template <>
-EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
-  __m512d x = _mm512_setzero_pd();
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
-  return x;
-}
-#endif
-
-// Loads 4 floats from memory and returns the packet
-// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
-template <>
-EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
-  Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
-  const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
-  return _mm512_permutexvar_ps(scatter_mask, tmp);
-}
-
-// Loads 2 doubles from memory and returns the packet
-// {a0, a0, a0, a0, a1, a1, a1, a1}
-template <>
-EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
-  __m256d lane0 = _mm256_set1_pd(*from);
-  __m256d lane1 = _mm256_set1_pd(*(from+1));
-  __m512d tmp = _mm512_undefined_pd();
-  tmp = _mm512_insertf64x4(tmp, lane0, 0);
-  return _mm512_insertf64x4(tmp, lane1, 1);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
-}
-template <>
-EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
-                                                from);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
-}
-template <>
-EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
-      reinterpret_cast<__m512i*>(to), from);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
-                                                             Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
-
-  return _mm512_i32gather_ps(indices, from, 4);
-}
-template <>
-EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
-                                                            Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
-  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
-
-  return _mm512_i32gather_pd(indices, from, 8);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
-                                                         const Packet16f& from,
-                                                         Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12,
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); - _mm512_i32scatter_ps(to, indices, from, 4); -} -template <> -EIGEN_DEVICE_FUNC inline void pscatter(double* to, - const Packet8d& from, - Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); - Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); - _mm512_i32scatter_pd(to, indices, from, 8); -} - -template <> -EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) { - Packet16f pa = pset1(a); - pstore(to, pa); -} -template <> -EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) { - Packet8d pa = pset1(a); - pstore(to, pa); -} -template <> -EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) { - Packet16i pa = pset1(a); - pstore(to, pa); -} - -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } - -template <> -EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) { - return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0)); -} -template <> -EIGEN_STRONG_INLINE double pfirst(const Packet8d& a) { - return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0)); -} -template <> -EIGEN_STRONG_INLINE int pfirst(const Packet16i& a) { - return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0); -} - -template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) -{ - return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a); -} - -template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) -{ - return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a); -} - -template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) -{ - // _mm512_abs_ps intrinsic not found, so hack around it - return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff))); -} -template <> -EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { - // _mm512_abs_ps intrinsic not found, so hack around it - return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), - _mm512_set1_epi64(0x7fffffffffffffff))); -} - -#ifdef EIGEN_VECTORIZE_AVX512DQ -// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 -#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ - __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \ - __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1) -#else -#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ - __m256 OUTPUT##_0 = _mm256_insertf128_ps( \ - _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \ - _mm512_extractf32x4_ps(INPUT, 1), 1); \ - __m256 OUTPUT##_1 = _mm256_insertf128_ps( \ - _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \ - _mm512_extractf32x4_ps(INPUT, 3), 1); -#endif - -#ifdef EIGEN_VECTORIZE_AVX512DQ -#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ - OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); -#else -#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ - OUTPUT = _mm512_undefined_ps(); \ - OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ - OUTPUT = 
_mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ - OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ - OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); -#endif - -template <> -EIGEN_STRONG_INLINE float predux(const Packet16f& a) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - __m256 lane0 = _mm512_extractf32x8_ps(a, 0); - __m256 lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f x = _mm256_add_ps(lane0, lane1); - return predux(x); -#else - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3)); - sum = _mm_hadd_ps(sum, sum); - sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); - return _mm_cvtss_f32(sum); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d sum = _mm256_add_pd(lane0, lane1); - __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); - return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0))); -} - -template <> -EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - return padd(lane0, lane1); -#else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum0 = padd(lane0, lane2); - Packet4f sum1 = padd(lane1, lane3); - return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); -#endif -} -template <> -EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = padd(lane0, lane1); - return res; -} - -template <> -EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { -//#ifdef EIGEN_VECTORIZE_AVX512DQ -#if 0 - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#else - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 res = 
_mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); - res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} -template <> -EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d res = _mm256_min_pd(lane0, lane1); - res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); - res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} - -template <> -EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - __m256d res = _mm256_max_pd(lane0, lane1); - res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); -} - -template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* vecs) -{ - EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); - EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); - EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2); - EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3); - EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4); - EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5); - EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6); - EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7); - EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8); - EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9); - EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10); - EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11); - EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12); - EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13); - EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14); - EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15); - - __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0); - __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0); - __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0); - __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0); - - __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); - __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); - __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); - __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - __m256 sum1 = _mm256_add_ps(perm1, hsum5); - __m256 sum2 = _mm256_add_ps(perm2, hsum6); - __m256 sum3 = _mm256_add_ps(perm3, hsum7); - __m256 sum4 = _mm256_add_ps(perm4, hsum8); - - __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); - - hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1); - hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1); - hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1); - hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, 
hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0)); - - hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0); - hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0); - hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0); - hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0); - - hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1); - hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1); - hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1); - hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0)); - - __m512 final_output; - - EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1); - return final_output; -} - -template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) -{ - Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0); - Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1); - - Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0); - Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1); - - Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0); - Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1); - - Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0); - Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1); - - Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0); - Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1); - - Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0); - Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1); - - Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0); - Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1); - - Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0); - Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1); - - Packet4d tmp0, tmp1; - - tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = 
_mm256_hadd_pd(vecs2_0, vecs3_0); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC); - - tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC)); - - tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC); - - tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); - - __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); - - return _mm512_insertf64x4(final_output, final_1, 1); -} - - - -#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ - EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]); - __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]); - __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]); - __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]); - __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]); - __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]); - __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]); - __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]); - __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]); - __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]); - __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]); - __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]); - __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]); - __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]); - __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]); - __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]); - __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S15 = _mm512_shuffle_ps(T13, T15, 
_MM_SHUFFLE(3, 2, 3, 2)); - - EIGEN_EXTRACT_8f_FROM_16f(S0, S0); - EIGEN_EXTRACT_8f_FROM_16f(S1, S1); - EIGEN_EXTRACT_8f_FROM_16f(S2, S2); - EIGEN_EXTRACT_8f_FROM_16f(S3, S3); - EIGEN_EXTRACT_8f_FROM_16f(S4, S4); - EIGEN_EXTRACT_8f_FROM_16f(S5, S5); - EIGEN_EXTRACT_8f_FROM_16f(S6, S6); - EIGEN_EXTRACT_8f_FROM_16f(S7, S7); - EIGEN_EXTRACT_8f_FROM_16f(S8, S8); - EIGEN_EXTRACT_8f_FROM_16f(S9, S9); - EIGEN_EXTRACT_8f_FROM_16f(S10, S10); - EIGEN_EXTRACT_8f_FROM_16f(S11, S11); - EIGEN_EXTRACT_8f_FROM_16f(S12, S12); - EIGEN_EXTRACT_8f_FROM_16f(S13, S13); - EIGEN_EXTRACT_8f_FROM_16f(S14, S14); - EIGEN_EXTRACT_8f_FROM_16f(S15, S15); - - PacketBlock tmp; - - tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20); - tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20); - tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20); - tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20); - tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31); - tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31); - tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31); - tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31); - - tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20); - tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20); - tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20); - tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20); - tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31); - tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31); - tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31); - tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31); - - // Second set of _m256 outputs - tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20); - tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20); - tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20); - tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20); - tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31); - tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31); - tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31); - tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31); - - tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20); - tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20); - tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20); - tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20); - tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31); - tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31); - tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31); - tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31); - - // Pack them into the output - PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16); - - PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16); - - PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16); - - PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16); - PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16); -} -#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \ - 
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \ - INPUT[2 * INDEX + STRIDE]); - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]); - __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]); - __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]); - __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]); - - __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); - - EIGEN_EXTRACT_8f_FROM_16f(S0, S0); - EIGEN_EXTRACT_8f_FROM_16f(S1, S1); - EIGEN_EXTRACT_8f_FROM_16f(S2, S2); - EIGEN_EXTRACT_8f_FROM_16f(S3, S3); - - PacketBlock tmp; - - tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20); - tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20); - tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31); - tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31); - - tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20); - tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20); - tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31); - tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31); - - PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1); - PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1); - PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1); - PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1); -} - -#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \ - OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \ - OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1); - -#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \ - OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \ - OUTPUT[INDEX] = \ - _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1); - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); - __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff); - __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); - __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff); - - PacketBlock tmp; - - tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), - _mm512_extractf64x4_pd(T2, 0), 0x20); - tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), - _mm512_extractf64x4_pd(T3, 0), 0x20); - tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), - _mm512_extractf64x4_pd(T2, 0), 0x31); - tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), - _mm512_extractf64x4_pd(T3, 0), 0x31); - - tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), - _mm512_extractf64x4_pd(T2, 1), 0x20); - tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), - _mm512_extractf64x4_pd(T3, 1), 0x20); - tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), - _mm512_extractf64x4_pd(T2, 1), 0x31); - tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), - _mm512_extractf64x4_pd(T3, 1), 0x31); - - PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1); - PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1); - PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1); - PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1); -} - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) 
{ - __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]); - __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]); - __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]); - __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]); - __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]); - __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]); - __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]); - __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]); - - PacketBlock tmp; - - tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), - _mm512_extractf64x4_pd(T2, 0), 0x20); - tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), - _mm512_extractf64x4_pd(T3, 0), 0x20); - tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), - _mm512_extractf64x4_pd(T2, 0), 0x31); - tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), - _mm512_extractf64x4_pd(T3, 0), 0x31); - - tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), - _mm512_extractf64x4_pd(T2, 1), 0x20); - tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), - _mm512_extractf64x4_pd(T3, 1), 0x20); - tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), - _mm512_extractf64x4_pd(T2, 1), 0x31); - tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), - _mm512_extractf64x4_pd(T3, 1), 0x31); - - tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), - _mm512_extractf64x4_pd(T6, 0), 0x20); - tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), - _mm512_extractf64x4_pd(T7, 0), 0x20); - tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), - _mm512_extractf64x4_pd(T6, 0), 0x31); - tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), - _mm512_extractf64x4_pd(T7, 0), 0x31); - - tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), - _mm512_extractf64x4_pd(T6, 1), 0x20); - tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), - _mm512_extractf64x4_pd(T7, 1), 0x20); - tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), - _mm512_extractf64x4_pd(T6, 1), 0x31); - tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), - _mm512_extractf64x4_pd(T7, 1), 0x31); - - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8); - - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8); -} -template <> -EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, - const Packet16f& /*thenPacket*/, - const Packet16f& /*elsePacket*/) { - assert(false && "To be implemented"); - return Packet16f(); -} -template <> -EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, - const Packet8d& thenPacket, - const Packet8d& elsePacket) { - __mmask8 m = (ifPacket.select[0] ) - | (ifPacket.select[1]<<1) - | (ifPacket.select[2]<<2) - | (ifPacket.select[3]<<3) - | (ifPacket.select[4]<<4) - | (ifPacket.select[5]<<5) - | (ifPacket.select[6]<<6) - | (ifPacket.select[7]<<7); - return _mm512_mask_blend_pd(m, elsePacket, thenPacket); -} - -template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& 
a) { - return _mm512_cvttps_epi32(a); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { - return _mm512_cvtepi32_ps(a); -} - -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet16f& first, - const Packet16f& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, - Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, - Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); - - __m512i second_idx = - _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, - Offset - 5, Offset - 6, Offset - 7, Offset - 8, - Offset - 9, Offset - 10, Offset - 11, Offset - 12, - Offset - 13, Offset - 14, Offset - 15, Offset - 16); - - unsigned short mask = 0xFFFF; - mask <<= (16 - Offset); - - first = _mm512_permutexvar_ps(first_idx, first); - Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); - first = _mm512_mask_blend_ps(mask, first, tmp); - } - } -}; -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, - Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); - - __m512i second_idx = _mm512_set_epi32( - 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, - Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); - - unsigned char mask = 0xFF; - mask <<= (8 - Offset); - - first = _mm512_permutexvar_pd(first_idx, first); - Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); - first = _mm512_mask_blend_pd(mask, first, tmp); - } - } -}; - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_PACKET_MATH_AVX512_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/AltiVec/PacketMath.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/AltiVec/PacketMath.h deleted file mode 100755 index 08a27d15302..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ /dev/null @@ -1,1061 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2016 Konstantinos Margaritis -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
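// Illustrative sketch, not Eigen code: the palign_impl specializations that
// close the AVX512 file above (and reappear for AltiVec below) share one
// contract: `first` is overwritten with lanes [Offset, Offset+N) of the
// concatenation (first, second). A scalar model of that contract
// (palign_ref is a hypothetical name):
template <int Offset, typename Scalar, int N>
inline void palign_ref(Scalar (&first)[N], const Scalar (&second)[N]) {
  Scalar cat[2 * N];
  for (int i = 0; i < N; ++i) {
    cat[i] = first[i];       // lanes 0 .. N-1
    cat[N + i] = second[i];  // lanes N .. 2N-1
  }
  for (int i = 0; i < N; ++i)
    first[i] = cat[Offset + i];  // sliding window starting at lane Offset
}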
- -#ifndef EIGEN_PACKET_MATH_ALTIVEC_H -#define EIGEN_PACKET_MATH_ALTIVEC_H - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 -#endif - -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#endif - -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - -// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 -#endif - -typedef __vector float Packet4f; -typedef __vector int Packet4i; -typedef __vector unsigned int Packet4ui; -typedef __vector __bool int Packet4bi; -typedef __vector short int Packet8i; -typedef __vector unsigned char Packet16uc; - -// We don't want to write the same code all the time, but we need to reuse the constants -// and it doesn't really work to declare them global, so we define macros instead - -#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) - -#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ - Packet4i p4i_##NAME = vec_splat_s32(X) - -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - Packet4i p4i_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ - Packet2d p2d_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ - Packet2l p2l_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) - -#define DST_CHAN 1 -#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) - - -// These constants are endian-agnostic -static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} -static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} -#ifndef __VSX__ -static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} -#endif - -static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; -static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; - -static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; -static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; - -// Mask alignment -#ifdef __PPC64__ -#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#else -#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 -#endif - -#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) - -// Handle endianness properly while loading constants -// Define global static constants: -#ifdef _BIG_ENDIAN -static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); -#ifdef __VSX__ -static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -#endif -static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; -static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 
8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; -static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; -#else -static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; -static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; -static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; -static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; -#endif // _BIG_ENDIAN - -static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; -static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; -static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; -static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; - -static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; - -#ifdef _BIG_ENDIAN -static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -#else -static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -#endif // _BIG_ENDIAN - -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC - #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); -#else - #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); -#endif - -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4f type; - typedef Packet4f half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 1, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, -#ifdef __VSX__ - HasSqrt = 1, -#if !EIGEN_COMP_CLANG - HasRsqrt = 1, -#else - HasRsqrt = 0, -#endif -#else - HasSqrt = 0, - HasRsqrt = 0, -#endif - HasRound = 1, - HasFloor = 1, - HasCeil = 1, - HasNegate = 1, - HasBlend = 1 - }; -}; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4i type; - typedef Packet4i half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 0, - HasBlend = 1 - }; -}; - - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; - -inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) -{ - union { - Packet16uc v; - unsigned char n[16]; - } vt; - vt.v = v; - for (int i=0; i< 16; i++) - s << 
(int)vt.n[i] << ", "; - return s; -} - -inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) -{ - union { - Packet4f v; - float n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -} - -inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) -{ - union { - Packet4i v; - int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -} - -inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) -{ - union { - Packet4ui v; - unsigned int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -} - -// Need to define them first or we get specialization after instantiation errors -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif -} - -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif -} - -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) -{ - EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - Packet4f v = {from, from, from, from}; - return v; -} - -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { - Packet4i v = {from, from, from, from}; - return v; -} -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); -} -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const int *a, - Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) -{ - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); -} - -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 af[4]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - af[2] = from[2*stride]; - af[3] = from[3*stride]; - return pload(af); -} -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) -{ - int EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 af[4]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; - to[2*stride] = af[2]; - to[3*stride] = af[3]; -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) -{ - int EIGEN_ALIGN16 ai[4]; - pstore((int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; -} - -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } - -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& 
a, const Packet4f& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return a + b; } - -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return a - b; } - -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } - -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return a * b; } - -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ -#ifndef __VSX__ // VSX actually provides a div instruction - Packet4f t, y_0, y_1; - - // Altivec does not offer a divide instruction, we have to do a reciprocal approximation - y_0 = vec_re(b); - - // Do one Newton-Raphson iteration to get the needed accuracy - t = vec_nmsub(y_0, b, p4f_ONE); - y_1 = vec_madd(y_0, t, y_0); - - return vec_madd(a, y_1, p4f_MZERO); -#else - return vec_div(a, b); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by AltiVec"); - return pset1(0); -} - -// for some weird raisons, it has to be overloaded for packet of integers -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } - -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) -{ - #ifdef __VSX__ - Packet4f ret; - __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); - return ret; - #else - return vec_min(a, b); - #endif -} -template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) -{ - #ifdef __VSX__ - Packet4f ret; - __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); - return ret; - #else - return vec_max(a, b); - #endif -} -template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE 
Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } - -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } - -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data - -} -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data -} -#else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); -} -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); -} -#endif - -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); -} -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) -{ - Packet4i p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); -} - -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part -} -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! 
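// Note: like the float version above, this is a read-modify-write of the
// two aligned quadwords that straddle `to`. The bytes outside
// [to, to+16) (the "edges") are read, the packet is permuted into the
// misaligned position, and both quadwords are stored back, so a
// concurrent writer touching the neighboring bytes between the loads and
// the stores can be silently overwritten; hence the thread-safety
// warning.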
- Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part -} -#else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) -{ - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); -} -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) -{ - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); -} -#endif - -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } - -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } - -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); -} -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } - -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } - -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - Packet4f b, sum; - b = vec_sld(a, a, 8); - sum = a + b; - b = vec_sld(sum, sum, 4); - sum += b; - return pfirst(sum); -} - -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - Packet4f v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); - - // Now do the summation: - // Lines 0+1 - sum[0] = sum[0] + sum[1]; - // Lines 2+3 - sum[1] = sum[2] + sum[3]; - // Add the results - sum[0] = sum[0] + sum[1]; - - return sum[0]; -} - -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) -{ - Packet4i sum; - sum = vec_sums(a, p4i_ZERO); -#ifdef _BIG_ENDIAN - sum = vec_sld(sum, p4i_ZERO, 12); -#else - sum = vec_sld(p4i_ZERO, sum, 4); -#endif - return pfirst(sum); -} - -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - Packet4i v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: 
http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); - - // Now do the summation: - // Lines 0+1 - sum[0] = sum[0] + sum[1]; - // Lines 2+3 - sum[1] = sum[2] + sum[3]; - // Add the results - sum[0] = sum[0] + sum[1]; - - return sum[0]; -} - -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - Packet4f prod; - prod = pmul(a, vec_sld(a, a, 8)); - return pfirst(pmul(prod, vec_sld(prod, prod, 4))); -} - -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) -{ - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - return aux[0] * aux[1] * aux[2] * aux[3]; -} - -// min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet4f b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) -{ - Packet4i b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -// max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet4f b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) -{ - Packet4i b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4i t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], 
kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); - return vec_sel(elsePacket, thenPacket, mask); -} - -template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); - return vec_sel(elsePacket, thenPacket, mask); -} - - -//---------- double ---------- -#ifdef __VSX__ -typedef __vector double Packet2d; -typedef __vector unsigned long long Packet2ul; -typedef __vector long long Packet2l; -#if EIGEN_COMP_CLANG -typedef Packet2ul Packet2bl; -#else -typedef __vector __bool long Packet2bl; -#endif - -static Packet2l p2l_ONE = { 1, 1 }; -static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); -static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); -static Packet2d p2d_MZERO = { -0.0, -0.0 }; - -#ifdef _BIG_ENDIAN -static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); -#else -static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ONE), reinterpret_cast(p2d_ZERO), 8)); -#endif - -template Packet2d vec_splat_dbl(Packet2d& a); - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) -{ - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_HI)); -} - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) -{ - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_LO)); -} - -template<> struct packet_traits : default_packet_traits -{ - typedef Packet2d type; - typedef Packet2d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 1, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasRound = 1, - HasFloor = 1, - HasCeil = 1, - HasNegate = 1, - HasBlend = 1 - }; -}; - -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; - -inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) -{ - union { - Packet2l v; - int64_t n[2]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1]; - return s; -} - -inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) -{ - union { - Packet2d v; - double n[2]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1]; - return s; -} - -// Need to define them first or we get specialization after instantiation errors -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif -} - -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) -{ - EIGEN_DEBUG_ALIGNED_STORE 
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
-  Packet2d v = {from, from};
-  return v;
-}
-
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
-                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
-  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl<0>(a1);
-  a1 = vec_splat_dbl<1>(a1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl<0>(a3);
-  a3 = vec_splat_dbl<1>(a3);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
-  double EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet2d>(af);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
-  double EIGEN_ALIGN16 af[2];
-  pstore<double>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
-
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
-
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
-
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
-
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
-
-// for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  Packet2d ret;
-  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
-  return ret;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  Packet2d ret;
-  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
-  return ret;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)  { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{
-  Packet2d p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
-  else                                  p = ploadu<Packet2d>(from);
-  return vec_splat_dbl<0>(p);
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
-{
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) 
_EIGEN_ALIGNED_PTR(to)); -} - -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } - -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } - -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); -} -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } - -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) -{ - Packet2d b, sum; - b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); - sum = a + b; - return pfirst(sum); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - Packet2d v[2], sum; - v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); - v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); - -#ifdef _BIG_ENDIAN - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); -#else - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[1]), reinterpret_cast(v[0]), 8)); -#endif - - return sum; -} -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) -{ - return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); -} - -// min -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) -{ - return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); -} - -// max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) -{ - return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); -} - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset == 1) -#ifdef _BIG_ENDIAN - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); -#else - first = reinterpret_cast(vec_sld(reinterpret_cast(second), reinterpret_cast(first), 8)); -#endif - } -}; - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet2d t0, t1; - t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); - t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); - kernel.packet[0] = t0; - kernel.packet[1] = t1; -} - -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2bl mask = reinterpret_cast( vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)) ); - return vec_sel(elsePacket, thenPacket, mask); -} -#endif // __VSX__ -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_PACKET_MATH_ALTIVEC_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/Complex.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/Complex.h deleted file mode 100644 index 9c25365090b..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/Complex.h +++ /dev/null @@ -1,103 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_COMPLEX_CUDA_H -#define EIGEN_COMPLEX_CUDA_H - -// clang-format off - -namespace Eigen { - -namespace internal { - -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) - -// Many std::complex methods such as operator+, operator-, operator* and -// operator/ are not constexpr. Due to this, clang does not treat them as device -// functions and thus Eigen functors making use of these operators fail to -// compile. Here, we manually specialize these functors for complex types when -// building for CUDA to avoid non-constexpr methods. - -// Sum -template struct scalar_sum_op, const std::complex > : binary_op_base, const std::complex > { - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - return std::complex(numext::real(a) + numext::real(b), - numext::imag(a) + numext::imag(b)); - } -}; - -template struct scalar_sum_op, std::complex > : scalar_sum_op, const std::complex > {}; - - -// Difference -template struct scalar_difference_op, const std::complex > : binary_op_base, const std::complex > { - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - return std::complex(numext::real(a) - numext::real(b), - numext::imag(a) - numext::imag(b)); - } -}; - -template struct scalar_difference_op, std::complex > : scalar_difference_op, const std::complex > {}; - - -// Product -template struct scalar_product_op, const std::complex > : binary_op_base, const std::complex > { - enum { - Vectorizable = packet_traits>::HasMul - }; - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - const T a_real = numext::real(a); - const T a_imag = numext::imag(a); - const T b_real = numext::real(b); - const T b_imag = numext::imag(b); - return std::complex(a_real * b_real - a_imag * b_imag, - a_real * b_imag + a_imag * b_real); - } -}; - -template struct scalar_product_op, std::complex > : scalar_product_op, const std::complex > {}; - - -// Quotient -template struct scalar_quotient_op, const std::complex > : binary_op_base, const std::complex > { - enum { - Vectorizable = packet_traits>::HasDiv - }; - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - const T a_real = numext::real(a); - const T a_imag = numext::imag(a); - const T b_real = numext::real(b); - const T b_imag = numext::imag(b); - const T norm = T(1) / (b_real * b_real + b_imag * b_imag); - return std::complex((a_real * b_real + a_imag * b_imag) * norm, - (a_imag * b_real - a_real * b_imag) * norm); - } -}; - -template struct scalar_quotient_op, std::complex > : scalar_quotient_op, const std::complex > {}; - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_COMPLEX_CUDA_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/Half.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/Half.h deleted file mode 100644 index 59717b4fe6c..00000000000 --- 
a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/Half.h +++ /dev/null @@ -1,675 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// -// The conversion routines are Copyright (c) Fabian Giesen, 2016. -// The original license follows: -// -// Copyright (c) Fabian Giesen, 2016 -// All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted. -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with -// operator overloads such that it behaves basically as an arithmetic -// type. It will be quite slow on CPUs (so it is recommended to stay -// in float32_bits for CPUs, except for simple parameter conversions, I/O -// to disk and the likes), but fast on GPUs. - - -#ifndef EIGEN_HALF_CUDA_H -#define EIGEN_HALF_CUDA_H - -#if __cplusplus > 199711L -#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() -#else -#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() -#endif - -#include - -namespace Eigen { - -struct half; - -namespace half_impl { - -#if !defined(EIGEN_HAS_CUDA_FP16) -// Make our own __half_raw definition that is similar to CUDA's. -struct __half_raw { - EIGEN_DEVICE_FUNC __half_raw() : x(0) {} - explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} - unsigned short x; -}; -#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 -// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw -typedef __half __half_raw; -#endif - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); - -struct half_base : public __half_raw { - EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} - EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} -#endif -}; - -} // namespace half_impl - -// Class definition. 
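Before the class body that follows, a quick hedged sketch of how the raw-bits plumbing declared above is used (illustrative only, not part of the deleted file; 0x3c00 is the IEEE-754 binary16 bit pattern for 1.0):

```cpp
#include <Eigen/Core>

int main() {
  // Build an Eigen::half directly from its raw 16-bit pattern...
  Eigen::half one = Eigen::half_impl::raw_uint16_to_half(0x3c00);
  // ...then widen it back to float through the declared conversion helper.
  float f = Eigen::half_impl::half_to_float(one);  // 1.0f
  return f == 1.0f ? 0 : 1;
}
```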
-struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) - typedef half_impl::__half_raw __half_raw; - #endif - - EIGEN_DEVICE_FUNC half() {} - - EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} - EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} -#endif - - explicit EIGEN_DEVICE_FUNC half(bool b) - : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} - template - explicit EIGEN_DEVICE_FUNC half(const T& val) - : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(val))) {} - explicit EIGEN_DEVICE_FUNC half(float f) - : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} - - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { - // +0.0 and -0.0 become false, everything else becomes true. - return (x & 0x7fff) != 0; - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { - return static_cast(half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { - return half_impl::half_to_float(*this); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { - return static_cast(half_impl::half_to_float(*this)); - } - - EIGEN_DEVICE_FUNC half& operator=(const half& other) { - x = other.x; - return *this; - } -}; - -} // end namespace Eigen - -namespace std { -template<> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const 
int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } - static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } - static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } - static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } - static Eigen::half round_error() { return Eigen::half(0.5); } - static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } - static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } -}; - -// If std::numeric_limits is specialized, should also specialize -// std::numeric_limits, std::numeric_limits, and -// std::numeric_limits -// https://stackoverflow.com/a/16519653/ -template<> -struct numeric_limits : numeric_limits {}; -template<> -struct numeric_limits : numeric_limits {}; -template<> -struct numeric_limits : numeric_limits {}; -} // end namespace std - -namespace Eigen { - -namespace half_impl { - -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - -// Intrinsics for native fp16 support. Note that on current hardware, -// these are no faster than float32_bits arithmetic (you need to use the half2 -// versions to get the ALU speed increased), but you do save the -// conversion steps back and forth. 
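In other words, scalar __half arithmetic on sm_53+ parts only matches float throughput; the speedup comes from the paired half2 instructions, which process two lanes at once. A minimal CUDA sketch of the packed form (assumes cuda_fp16.h and an sm_53+ target; not part of the deleted file):

```cpp
#include <cuda_fp16.h>

// y[i] = a * x[i] + y[i]; one __hfma2 covers two fp16 lanes per instruction,
// where the scalar __hfma form would need two instructions for the same work.
__global__ void haxpy2(int n2, half2 a, const half2* x, half2* y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n2) {
    y[i] = __hfma2(a, x[i], y[i]);
  }
}
```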
- -EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { - return __hadd(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -} -EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { - return __hneg(a); -} -EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { - a = a + b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { - a = a * b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { - a = a - b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { - a = a / b; - return a; -} -EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { - return __heq(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { - return __hne(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { - return __hlt(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { - return __hle(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { - return __hgt(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { - return __hge(a, b); -} - -#else // Emulate support for half floats - -// Definitions for CPUs and older CUDA, mostly working through conversion -// to/from float32_bits. 
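The practical consequence of this emulated path is that Eigen::half arithmetic compiles and runs on any host, with each operator widening to float and rounding back to half. A small host-side usage sketch (illustrative only):

```cpp
#include <iostream>
#include <Eigen/Core>

int main() {
  Eigen::half a(1.5f), b(0.25f);
  // On CPUs this evaluates roughly as half(float(a) * float(b)) + a,
  // per the emulated operators below.
  Eigen::half c = a * b + a;
  std::cout << float(c) << std::endl;  // 1.875, exactly representable in fp16
  return 0;
}
```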
- -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { - return half(float(a) + float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { - return half(float(a) * float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { - return half(float(a) - float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { - return half(float(a) / float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { - half result; - result.x = a.x ^ 0x8000; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { - a = half(float(a) + float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { - a = half(float(a) * float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { - a = half(float(a) - float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { - a = half(float(a) / float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { - return numext::equal_strict(float(a),float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { - return numext::not_equal_strict(float(a), float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { - return float(a) < float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { - return float(a) <= float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { - return float(a) > float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { - return float(a) >= float(b); -} - -#endif // Emulate support for half floats - -// Division by an index. Do it in full float precision to avoid accuracy -// issues in converting the denominator to half. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { - return half(static_cast(a) / static_cast(b)); -} - -// Conversion routines, including fallbacks for the host or older CUDA. -// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of -// these in hardware. If we need more performance on older/other CPUs, they are -// also possible to vectorize directly. - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { - __half_raw h; - h.x = x; - return h; -} - -union float32_bits { - unsigned int u; - float f; -}; - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - __half tmp_ff = __float2half(ff); - return *(__half_raw*)&tmp_ff; - -#elif defined(EIGEN_HAS_FP16_C) - __half_raw h; - h.x = _cvtss_sh(ff, 0); - return h; - -#else - float32_bits f; f.f = ff; - - const float32_bits f32infty = { 255 << 23 }; - const float32_bits f16max = { (127 + 16) << 23 }; - const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; - unsigned int sign_mask = 0x80000000u; - __half_raw o; - o.x = static_cast(0x0u); - - unsigned int sign = f.u & sign_mask; - f.u ^= sign; - - // NOTE all the integer compares in this function can be safely - // compiled into signed compares since all operands are below - // 0x80000000. 
Important if you want fast straight SSE2 code - // (since there's no unsigned PCMPGTD). - - if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) - o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf - } else { // (De)normalized number or zero - if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero - // use a magic value to align our 10 mantissa bits at the bottom of - // the float. as long as FP addition is round-to-nearest-even this - // just works. - f.f += denorm_magic.f; - - // and one integer subtract of the bias later, we have our final float! - o.x = static_cast(f.u - denorm_magic.u); - } else { - unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - - // update exponent, rounding bias part 1 - f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; - // rounding bias part 2 - f.u += mant_odd; - // take the bits! - o.x = static_cast(f.u >> 13); - } - } - - o.x |= static_cast(sign >> 16); - return o; -#endif -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - return __half2float(h); - -#elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); - -#else - const float32_bits magic = { 113 << 23 }; - const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - float32_bits o; - - o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits - unsigned int exp = shifted_exp & o.u; // just the exponent - o.u += (127 - 15) << 23; // exponent adjust - - // handle exponent special cases - if (exp == shifted_exp) { // Inf/NaN? - o.u += (128 - 16) << 23; // extra exp adjust - } else if (exp == 0) { // Zero/Denormal? - o.u += 1 << 23; // extra exp adjust - o.f -= magic.f; // renormalize - } - - o.u |= (h.x & 0x8000) << 16; // sign bit - return o.f; -#endif -} - -// --- standard functions --- - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { - return (a.x & 0x7fff) == 0x7c00; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return __hisnan(a); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { - return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { - half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 - return half(hexp(a)); -#else - return half(::expf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return half(::hlog(a)); -#else - return half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { - return half(numext::log1p(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { - return half(::log10f(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 - return half(hsqrt(a)); -#else - return half(::sqrtf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { - return half(::powf(float(a), 
float(b)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
-  return half(::sinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
-  return half(::cosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
-  return half(::tanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
-  return half(::tanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
-  return half(hfloor(a));
-#else
-  return half(::floorf(float(a)));
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
-  return half(hceil(a));
-#else
-  return half(::ceilf(float(a)));
-#endif
-}
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-  return __hlt(b, a) ? b : a;
-#else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f2 < f1 ? b : a;
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-  return __hlt(a, b) ? b : a;
-#else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f1 < f2 ? b : a;
-#endif
-}
-
-EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
-  os << static_cast<float>(v);
-  return os;
-}
-
-} // end namespace half_impl
-
-// import Eigen::half_impl::half into Eigen namespace
-// using half_impl::half;
-
-namespace internal {
-
-template<>
-struct random_default_impl<half, false, false>
-{
-  static inline half run(const half& x, const half& y)
-  {
-    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
-  }
-  static inline half run()
-  {
-    return run(half(-1.f), half(1.f));
-  }
-};
-
-template<> struct is_arithmetic<half> { enum { value = true }; };
-
-} // end namespace internal
-
-template<> struct NumTraits<Eigen::half>
-    : GenericNumTraits<Eigen::half>
-{
-  enum {
-    IsSigned = true,
-    IsInteger = false,
-    IsComplex = false,
-    RequireInitialization = false
-  };
-
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
-    return half_impl::raw_uint16_to_half(0x0800);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
-    return half_impl::raw_uint16_to_half(0x7bff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
-    return half_impl::raw_uint16_to_half(0xfbff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
-    return half_impl::raw_uint16_to_half(0x7c00);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
-    return half_impl::raw_uint16_to_half(0x7c01);
-  }
-};
-
-} // end namespace Eigen
-
-// C-like standard mathematical functions and transcendentals.
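The helpers defined next all follow the same widen-call-narrow pattern around the float math library. A hedged usage sketch (illustrative, not from the deleted file):

```cpp
#include <Eigen/Core>

int main() {
  Eigen::half x(4.0f);
  Eigen::half r = sqrth(x);                    // half(::sqrtf(4.0f)) == 2.0
  Eigen::half p = powh(x, Eigen::half(0.5f));  // the same value via ::powf
  return (float(r) == 2.0f && float(p) == 2.0f) ? 0 : 1;
}
```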
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { - Eigen::half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { - return Eigen::half(::expf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return Eigen::half(::hlog(a)); -#else - return Eigen::half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { - return Eigen::half(::sqrtf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { - return Eigen::half(::powf(float(a), float(b))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { - return Eigen::half(::floorf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { - return Eigen::half(::ceilf(float(a))); -} - -namespace std { - -#if __cplusplus > 199711L -template <> -struct hash { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { - return static_cast(a.x); - } -}; -#endif - -} // end namespace std - - -// Add the missing shfl_xor intrinsic -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 -__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - #if EIGEN_CUDACC_VER < 90000 - return static_cast(__shfl_xor(static_cast(var), laneMask, width)); - #else - return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); - #endif -} -#endif - -// ldg() has an overload for __half_raw, but we also need one for Eigen::half. -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { - return Eigen::half_impl::raw_uint16_to_half( - __ldg(reinterpret_cast(ptr))); -} -#endif - - -#if defined(EIGEN_CUDA_ARCH) -namespace Eigen { -namespace numext { - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isnan)(const Eigen::half& h) { - return (half_impl::isnan)(h); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isinf)(const Eigen::half& h) { - return (half_impl::isinf)(h); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isfinite)(const Eigen::half& h) { - return (half_impl::isfinite)(h); -} - -} // namespace Eigen -} // namespace numext -#endif - -#endif // EIGEN_HALF_CUDA_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/PacketMath.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/PacketMath.h deleted file mode 100644 index 4dda63188d2..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/PacketMath.h +++ /dev/null @@ -1,333 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_CUDA_H -#define EIGEN_PACKET_MATH_CUDA_H - -namespace Eigen { - -namespace internal { - -// Make sure this is only available when targeting a GPU: we don't want to -// introduce conflicts between these packet_traits definitions and the ones -// we'll use on the host side (SSE, AVX, ...) 
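To make that guard concrete, here is a hypothetical device-side use of the float4 primitives this block defines (assumes compilation with nvcc and EIGEN_USE_GPU, and 16-byte-aligned pointers for pload/pstore; illustrative only):

```cpp
__global__ void scale4(const float* in, float* out, float s) {
  using namespace Eigen::internal;
  // Each thread handles one float4 packet, i.e. 4 consecutive floats.
  int i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  float4 p = pload<float4>(in + i);            // aligned 16-byte load
  pstore(out + i, pmul(p, pset1<float4>(s)));  // packet multiply, aligned store
}
```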
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef float4 type; - typedef float4 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 0, - - HasDiv = 1, - HasSin = 0, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasIGamma = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - }; -}; - -template<> struct packet_traits : default_packet_traits -{ - typedef double2 type; - typedef double2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasIGamma = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - }; -}; - - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { - return make_float4(from, from, from, from); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { - return make_double2(from, from); -} - - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { - return make_float4(a, a+1, a+2, a+3); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { - return make_double2(a, a+1); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { - return make_double2(a.x+b.x, a.y+b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { - return make_double2(a.x-b.x, a.y-b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { - return make_float4(-a.x, -a.y, -a.z, -a.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { - return make_double2(-a.x, -a.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { - return make_double2(a.x*b.x, a.y*b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { - return make_double2(a.x/b.x, a.y/b.y); -} - -template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { - return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { - return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { - return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { - return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { - return make_float4(from[0], from[1], from[2], from[3]); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { - return make_double2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { - return make_float4(from[0], from[0], from[1], from[1]); -} -template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { - return make_double2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { - to[0] = from.x; - to[1] = from.y; - to[2] = from.z; - to[3] = from.w; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { - to[0] = from.x; - to[1] = from.y; -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const float4*)from); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const double2*)from); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return make_double2(__ldg(from+0), __ldg(from+1)); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { - return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { - return make_double2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; - 
to[stride*2] = from.z; - to[stride*3] = from.w; -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; -} - -template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { - return a.x; -} -template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { - return a.x; -} - -template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { - return a.x + a.y + a.z + a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { - return a.x + a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { - return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { - return fmax(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { - return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { - return fmin(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { - return a.x * a.y * a.z * a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { - return a.x * a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { - return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { - return make_double2(fabs(a.x), fabs(a.y)); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; - - tmp = kernel.packet[0].z; - kernel.packet[0].z = kernel.packet[2].x; - kernel.packet[2].x = tmp; - - tmp = kernel.packet[0].w; - kernel.packet[0].w = kernel.packet[3].x; - kernel.packet[3].x = tmp; - - tmp = kernel.packet[1].z; - kernel.packet[1].z = kernel.packet[2].y; - kernel.packet[2].y = tmp; - - tmp = kernel.packet[1].w; - kernel.packet[1].w = kernel.packet[3].y; - kernel.packet[3].y = tmp; - - tmp = kernel.packet[2].w; - kernel.packet[2].w = kernel.packet[3].z; - kernel.packet[3].z = tmp; -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; -} - -#endif - -} // end namespace internal - -} // end namespace Eigen - - -#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/PacketMathHalf.h deleted file mode 100644 index f749c573ff6..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ /dev/null @@ -1,1124 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H -#define EIGEN_PACKET_MATH_HALF_CUDA_H - - -namespace Eigen { -namespace internal { - -// Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef half2 type; - typedef half2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - HasAdd = 1, - HasMul = 1, - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasLog = 1, - HasLog1p = 1 - }; -}; - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; - -template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { - return __half2half2(from); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { - return *reinterpret_cast(from); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return __halves2half2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return __halves2half2(from[0], from[0]); -} - -template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { - *reinterpret_cast(to) = from; -} - -template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = __low2half(from); - to[1] = __high2half(from); -} - -template<> - __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 - return __ldg((const half2*)from); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif -} - -template<> -__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 - return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return __low2half(a); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half2 result; - unsigned temp = *(reinterpret_cast(&(a))); - *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; - return result; -} - - -__device__ EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __half a1 = __low2half(kernel.packet[0]); - __half a2 = __high2half(kernel.packet[0]); - __half b1 = __low2half(kernel.packet[1]); - __half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if __CUDA_ARCH__ >= 530 - return __halves2half2(a, __hadd(a, __float2half(1.0f))); -#else - float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = 
__high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hsub2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 - b1; - float r2 = a2 - b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hneg2(a); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return __floats2half2_rn(-a1, -a2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } - -template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if __CUDA_ARCH__ >= 530 - return __hfma2(a, b, c); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float c1 = __low2float(c); - float c2 = __high2float(c); - float r1 = a1 * b1 + c1; - float r2 = a2 * b2 + c2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hadd(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half_rn(a1 + a2)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if __CUDA_ARCH__ >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if __CUDA_ARCH__ >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 < a2 ? 
__low2half(a) : __high2half(a); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hmul(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half_rn(a1 * a2)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = log1pf(a1); - float r2 = log1pf(a2); - return __floats2half2_rn(r1, r2); -} - -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 - -template<> __device__ EIGEN_STRONG_INLINE -half2 plog(const half2& a) { - return h2log(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 pexp(const half2& a) { - return h2exp(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 psqrt(const half2& a) { - return h2sqrt(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 prsqrt(const half2& a) { - return h2rsqrt(a); -} - -#else - -template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = logf(a1); - float r2 = logf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expf(a1); - float r2 = expf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = sqrtf(a1); - float r2 = sqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = rsqrtf(a1); - float r2 = rsqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -#endif - -#elif defined EIGEN_VECTORIZE_AVX512 - -typedef struct { - __m256i x; -} Packet16h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16h type; - // There is no half-size packet for Packet16h. 
- typedef Packet16h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; - -template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { - Packet16h result; - result.x = _mm256_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_load_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_loadu_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - _mm256_store_si256((__m256i*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - _mm256_storeu_si256((__m256i*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploadquad(const Eigen::half* from) { - Packet16h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm512_cvtph_ps(a.x); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet16h result; - result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - Packet16h result; - result.x = _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pmul(af, bf); - return float2half(rf); -} - 
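padd and pmul above never touch fp16 directly: the Packet16h is widened to a Packet16f, the arithmetic runs in single precision, and the result is rounded back down. A standalone sketch of the same round trip using the raw AVX-512 conversion intrinsics, assuming hardware cvtph/cvtps support (the EIGEN_HAS_FP16_C path):

#include <immintrin.h>

// Widen 16 half lanes to float, add, narrow back: the padd<Packet16h> path.
static inline __m256i add16h(__m256i a, __m256i b) {
  __m512 af = _mm512_cvtph_ps(a);     // 16 x fp16 -> 16 x fp32
  __m512 bf = _mm512_cvtph_ps(b);
  __m512 rf = _mm512_add_ps(af, bf);  // full single-precision add
  return _mm512_cvtps_ph(rf, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}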
-template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux(from_float)); -} - -template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) -{ - Packet16h result; - result.x = _mm256_set_epi16( - from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, - from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, - from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, - from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) -{ - EIGEN_ALIGN64 half aux[16]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; - to[stride*8].x = aux[8].x; - to[stride*9].x = aux[9].x; - to[stride*10].x = aux[10].x; - to[stride*11].x = aux[11].x; - to[stride*12].x = aux[12].x; - to[stride*13].x = aux[13].x; - to[stride*14].x = aux[14].x; - to[stride*15].x = aux[15].x; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m256i a = kernel.packet[0].x; - __m256i b = kernel.packet[1].x; - __m256i c = kernel.packet[2].x; - __m256i d = kernel.packet[3].x; - __m256i e = kernel.packet[4].x; - __m256i f = kernel.packet[5].x; - __m256i g = kernel.packet[6].x; - __m256i h = kernel.packet[7].x; - __m256i i = kernel.packet[8].x; - __m256i j = kernel.packet[9].x; - __m256i k = kernel.packet[10].x; - __m256i l = kernel.packet[11].x; - __m256i m = kernel.packet[12].x; - __m256i n = kernel.packet[13].x; - __m256i o = kernel.packet[14].x; - __m256i p = kernel.packet[15].x; - - __m256i ab_07 = _mm256_unpacklo_epi16(a, b); - __m256i cd_07 = _mm256_unpacklo_epi16(c, d); - __m256i ef_07 = _mm256_unpacklo_epi16(e, f); - __m256i gh_07 = _mm256_unpacklo_epi16(g, h); - __m256i ij_07 = _mm256_unpacklo_epi16(i, j); - __m256i kl_07 = _mm256_unpacklo_epi16(k, l); - __m256i mn_07 = _mm256_unpacklo_epi16(m, n); - __m256i op_07 = _mm256_unpacklo_epi16(o, p); - - __m256i ab_8f = _mm256_unpackhi_epi16(a, b); - __m256i cd_8f = _mm256_unpackhi_epi16(c, d); - __m256i ef_8f = _mm256_unpackhi_epi16(e, f); - __m256i gh_8f = _mm256_unpackhi_epi16(g, h); - __m256i ij_8f = _mm256_unpackhi_epi16(i, j); - __m256i kl_8f = _mm256_unpackhi_epi16(k, l); - __m256i mn_8f = _mm256_unpackhi_epi16(m, n); - __m256i op_8f = _mm256_unpackhi_epi16(o, p); - - __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); - __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); - __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); - __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); - __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); - __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); - __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); - __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); - - __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); - __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); - __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); - __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); - __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); - __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); - __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); - __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); - - __m256i abcdefgh_01 
= _mm256_unpacklo_epi64(abcd_03, efgh_03); - __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); - __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); - __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); - __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); - __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); - __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); - __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); - __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); - __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); - __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); - __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); - __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); - __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); - __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); - __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); - - // NOTE: no unpacklo/hi instr in this case, so using permute instr. - __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); - __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); - - kernel.packet[0].x = a_p_0; - kernel.packet[1].x = a_p_1; - kernel.packet[2].x = a_p_2; - kernel.packet[3].x = a_p_3; - kernel.packet[4].x = a_p_4; - kernel.packet[5].x = a_p_5; - kernel.packet[6].x = a_p_6; - kernel.packet[7].x = a_p_7; - kernel.packet[8].x = a_p_8; - kernel.packet[9].x = a_p_9; - kernel.packet[10].x = a_p_a; - kernel.packet[11].x = a_p_b; - kernel.packet[12].x = a_p_c; - kernel.packet[13].x = a_p_d; - kernel.packet[14].x = a_p_e; - kernel.packet[15].x = a_p_f; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[8][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - pstore(in[4], kernel.packet[4]); - pstore(in[5], kernel.packet[5]); - pstore(in[6], kernel.packet[6]); - pstore(in[7], kernel.packet[7]); - - EIGEN_ALIGN64 half out[8][16]; - - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 8; ++j) { - out[i][j+8] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); - kernel.packet[4] 
= pload(out[4]); - kernel.packet[5] = pload(out[5]); - kernel.packet[6] = pload(out[6]); - kernel.packet[7] = pload(out[7]); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[4][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN64 half out[4][16]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][4*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][4*i+1]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+8] = in[j][4*i+2]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+12] = in[j][4*i+3]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -#elif defined EIGEN_VECTORIZE_AVX - -typedef struct { - __m128i x; -} Packet8h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet8h type; - // There is no half-size packet for Packet8h. - typedef Packet8h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; - -template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { - Packet8h result; - result.x = _mm_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { - Packet8h result; - result.x = _mm_load_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { - Packet8h result; - result.x = _mm_loadu_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { - _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploadquad(const Eigen::half* from) { - Packet8h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm256_cvtph_ps(a.x); -#else - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - - return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet8h result; - result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN32 float aux[8]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); 
- Eigen::half h3(aux[3]); - Eigen::half h4(aux[4]); - Eigen::half h5(aux[5]); - Eigen::half h6(aux[6]); - Eigen::half h7(aux[7]); - - Packet8h result; - result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) -{ - Packet8h result; - result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) -{ - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_mul(af); - return Eigen::half(reduced); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m128i a = kernel.packet[0].x; - __m128i b = kernel.packet[1].x; - __m128i c = kernel.packet[2].x; - __m128i d = kernel.packet[3].x; - __m128i e = kernel.packet[4].x; - __m128i f = kernel.packet[5].x; - __m128i g = kernel.packet[6].x; - __m128i h = kernel.packet[7].x; - - __m128i a03b03 = _mm_unpacklo_epi16(a, b); - __m128i c03d03 = _mm_unpacklo_epi16(c, d); - __m128i e03f03 = _mm_unpacklo_epi16(e, f); - __m128i g03h03 = _mm_unpacklo_epi16(g, h); - __m128i a47b47 = _mm_unpackhi_epi16(a, b); - __m128i c47d47 = _mm_unpackhi_epi16(c, d); - __m128i e47f47 = _mm_unpackhi_epi16(e, f); - __m128i g47h47 = _mm_unpackhi_epi16(g, h); - - __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); - __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); - __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); - __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); - __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); - __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); - __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); - __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); - - __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); - __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); - __m128i a2b2c2d2e2f2g2h2 = 
_mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); - __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); - __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); - __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); - __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); - __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); - - kernel.packet[0].x = a0b0c0d0e0f0g0h0; - kernel.packet[1].x = a1b1c1d1e1f1g1h1; - kernel.packet[2].x = a2b2c2d2e2f2g2h2; - kernel.packet[3].x = a3b3c3d3e3f3g3h3; - kernel.packet[4].x = a4b4c4d4e4f4g4h4; - kernel.packet[5].x = a5b5c5d5e5f5g5h5; - kernel.packet[6].x = a6b6c6d6e6f6g6h6; - kernel.packet[7].x = a7b7c7d7e7f7g7h7; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN32 Eigen::half in[4][8]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN32 Eigen::half out[4][8]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -typedef struct { - __m64 x; -} Packet4h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4h type; - // There is no half-size packet for Packet4h. 
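The second ptranspose above (like the Packet16h variants earlier) sidesteps integer shuffles entirely: pstore the packets into an aligned scratch array, permute the scalars with plain loops, then pload the rows back. A hedged standalone sketch of that spill-permute-reload idea, reduced to a straight tile transpose (the kernels above use a deinterleaving index pattern rather than this plain transpose):

#include <cstring>

// Transpose a small rows x cols tile through a scratch buffer: the same
// store/permute/load strategy, with hypothetical plain-ushort storage.
void transpose_tile(unsigned short* data, int rows, int cols) {
  unsigned short tmp[128];                     // assumes rows * cols <= 128
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      tmp[c * rows + r] = data[r * cols + c];
  std::memcpy(data, tmp, sizeof(unsigned short) * rows * cols);
}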
- typedef Packet4h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; - -template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { - Packet4h result; - result.x = _mm_set1_pi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); -} - -template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha + hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha * hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE Packet4h -ploadquad(const Eigen::half* from) { - return pset1(*from); -} - -template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) -{ - Packet4h result; - result.x = 
_mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) -{ - __int64_t a = _mm_cvtm64_si64(from.x); - to[stride*0].x = static_cast(a); - to[stride*1].x = static_cast(a >> 16); - to[stride*2].x = static_cast(a >> 32); - to[stride*3].x = static_cast(a >> 48); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); - __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); - - kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); - kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); - kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); - kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); -} - -#endif - -} -} - -#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/TypeCasting.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/TypeCasting.h deleted file mode 100644 index aa5fbce8eac..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ /dev/null @@ -1,212 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_TYPE_CASTING_CUDA_H -#define EIGEN_TYPE_CASTING_CUDA_H - -namespace Eigen { - -namespace internal { - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(a); - #else - return Eigen::half(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(static_cast(a)); - #else - return Eigen::half(static_cast(a)); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef float result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __half2float(a); - #else - return static_cast(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - - -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 2, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { - float2 r1 = __half22float2(a); - float2 r2 = 
__half22float2(b); - return make_float4(r1.x, r1.y, r2.x, r2.y); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 2 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { - // Simply discard the second half of the input - return __floats2half2_rn(a.x, a.y); -} - -#elif defined EIGEN_VECTORIZE_AVX512 -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { - return float2half(a); -} - -#elif defined EIGEN_VECTORIZE_AVX - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { - return float2half(a); -} - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - Eigen::half h = raw_uint16_to_half(static_cast(a64)); - float f1 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 16)); - float f2 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 32)); - float f3 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 48)); - float f4 = static_cast(h); - return _mm_set_ps(f4, f3, f2, f1); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { - EIGEN_ALIGN16 float aux[4]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - - Packet4h result; - result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); - return result; -} - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/Default/ConjHelper.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/Default/ConjHelper.h deleted file mode 100644 index 4cfe34e0526..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/Default/ConjHelper.h +++ /dev/null @@ -1,29 +0,0 @@ - -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
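The SrcCoeffRatio and TgtCoeffRatio fields above tell Eigen's vectorized evaluator how many source packets feed one destination packet: half2 to float4 is declared 2:1, which is why that pcast takes two half2 arguments, while float4 to half2 is 1:2 and simply discards the upper half of its input. A hedged usage sketch of the 2:1 case on the CUDA side (illustrative caller, not Eigen internals):

// One float4 result consumes two half2 inputs, per the 2:1 ratio above.
__device__ void cast_four_halves(const half2* src, float4* dst) {
  *dst = Eigen::internal::pcast<half2, float4>(src[0], src[1]);
}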
- -#ifndef EIGEN_ARCH_CONJ_HELPER_H -#define EIGEN_ARCH_CONJ_HELPER_H - -#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ - }; \ - \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ - }; - -#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/Complex.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/Complex.h deleted file mode 100644 index 306a309beb2..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/Complex.h +++ /dev/null @@ -1,490 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2010 Gael Guennebaud -// Copyright (C) 2010 Konstantinos Margaritis -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_COMPLEX_NEON_H -#define EIGEN_COMPLEX_NEON_H - -namespace Eigen { - -namespace internal { - -inline uint32x4_t p4ui_CONJ_XOR() { -// See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG - uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; - return ret; -#else - static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; - return vld1q_u32( conj_XOR_DATA ); -#endif -} - -inline uint32x2_t p2ui_CONJ_XOR() { - static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 }; - return vld1_u32( conj_XOR_DATA ); -} - -//---------- float ---------- -struct Packet2cf -{ - EIGEN_STRONG_INLINE Packet2cf() {} - EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} - Packet4f v; -}; - -template<> struct packet_traits > : default_packet_traits -{ - typedef Packet2cf type; - typedef Packet2cf half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0 - }; -}; - -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; - -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - float32x2_t r64; - r64 = vld1_f32((const float *)&from); - - return Packet2cf(vcombine_f32(r64, r64)); -} - -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - Packet4ui b = vreinterpretq_u32_f32(a.v); - return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); -} - -template<> EIGEN_STRONG_INLINE 
Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f v1, v2; - - // Get the real values of a | a1_re | a1_re | a2_re | a2_re | - v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0)); - // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | - v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1)); - // Multiply the real a with b - v1 = vmulq_f32(v1, b.v); - // Multiply the imag a with b - v2 = vmulq_f32(v2, b.v); - // Conjugate v2 - v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR())); - // Swap real/imag elements in v2. - v2 = vrev64q_f32(v2); - // Add and return the result - return Packet2cf(vaddq_f32(v1, v2)); -} - -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} - -template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } - -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } - -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } - -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) -{ - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); - res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); - res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); - res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); - return Packet2cf(res); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) -{ - to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); - to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); -} - -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - std::complex EIGEN_ALIGN16 x[2]; - vst1q_f32((float *)x, a.v); - return x[0]; -} - -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - float32x2_t a_lo, a_hi; - Packet4f a_r128; - - a_lo = vget_low_f32(a.v); - a_hi = vget_high_f32(a.v); - a_r128 = vcombine_f32(a_hi, a_lo); - - return Packet2cf(a_r128); -} - -template<> EIGEN_STRONG_INLINE Packet2cf 
pcplxflip(const Packet2cf& a) -{ - return Packet2cf(vrev64q_f32(a.v)); -} - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) -{ - float32x2_t a1, a2; - std::complex s; - - a1 = vget_low_f32(a.v); - a2 = vget_high_f32(a.v); - a2 = vadd_f32(a1, a2); - vst1_f32((float *)&s, a2); - - return s; -} - -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - Packet4f sum1, sum2, sum; - - // Add the first two 64-bit float32x2_t of vecs[0] - sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); - sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); - sum = vaddq_f32(sum1, sum2); - - return Packet2cf(sum); -} - -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) -{ - float32x2_t a1, a2, v1, v2, prod; - std::complex s; - - a1 = vget_low_f32(a.v); - a2 = vget_high_f32(a.v); - // Get the real values of a | a1_re | a1_re | a2_re | a2_re | - v1 = vdup_lane_f32(a1, 0); - // Get the real values of a | a1_im | a1_im | a2_im | a2_im | - v2 = vdup_lane_f32(a1, 1); - // Multiply the real a with b - v1 = vmul_f32(v1, a2); - // Multiply the imag a with b - v2 = vmul_f32(v2, a2); - // Conjugate v2 - v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); - // Swap real/imag elements in v2. - v2 = vrev64_f32(v2); - // Add v1, v2 - prod = vadd_f32(v1, v2); - - vst1_f32((float *)&s, prod); - - return s; -} - -template -struct palign_impl -{ - EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = vextq_f32(first.v, second.v, 2); - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) - -template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) -{ - // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); - Packet4f s, rev_s; - - // this computes the norm - s = vmulq_f32(b.v, b.v); - rev_s = vrev64q_f32(s); - - return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); - kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); - kernel.packet[1].v = tmp; -} - -//---------- double ---------- -#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG - -// See bug 1325, clang fails to call vld1q_u64. 
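pmul above (and predux_mul, which repeats it on 64-bit halves) builds the textbook complex product from lane tricks: duplicate the real parts of a, duplicate the imaginary parts, multiply each against b, flip the sign of the second product's imaginary lanes with CONJ_XOR, swap its pairs with vrev64, and add. A scalar restatement of that algebra:

#include <complex>

// v1 = (ar, ar) * (br, bi) = (ar*br, ar*bi)
// v2 = (ai, ai) * (br, bi) = (ai*br, ai*bi); conj -> (ai*br, -ai*bi);
//      rev64 -> (-ai*bi, ai*br)
// v1 + v2 = (ar*br - ai*bi, ar*bi + ai*br), the complex product.
std::complex<float> cmul(std::complex<float> a, std::complex<float> b) {
  return std::complex<float>(a.real() * b.real() - a.imag() * b.imag(),
                             a.real() * b.imag() + a.imag() * b.real());
}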
-#if EIGEN_COMP_CLANG - static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; -#else - const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; - static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA ); -#endif - -struct Packet1cd -{ - EIGEN_STRONG_INLINE Packet1cd() {} - EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} - Packet2d v; -}; - -template<> struct packet_traits > : default_packet_traits -{ - typedef Packet1cd type; - typedef Packet1cd half; - enum { - Vectorizable = 1, - AlignedOnScalar = 0, - size = 1, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0 - }; -}; - -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; - -template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } - -template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) -{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } - -template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } - -template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) -{ - Packet2d v1, v2; - - // Get the real values of a - v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); - // Get the imag values of a - v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); - // Multiply the real a with b - v1 = vmulq_f64(v1, b.v); - // Multiply the imag a with b - v2 = vmulq_f64(v2, b.v); - // Conjugate v2 - v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); - // Swap real/imag elements in v2. 
- v2 = preverse(v2); - // Add and return the result - return Packet1cd(vaddq_f64(v1, v2)); -} - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} - -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } - -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } - -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } - -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) -{ - Packet2d res = pset1(0.0); - res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); - res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1); - return Packet1cd(res); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) -{ - to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); -} - - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) -{ - std::complex EIGEN_ALIGN16 res; - pstore >(&res, a); - - return res; -} - -template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } - -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } - -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
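pconj in this double-precision section needs no floating-point arithmetic at all: IEEE-754 negation is a sign-bit flip, so conjugation is one XOR of the imaginary lane against p2ul_CONJ_XOR. The same trick spelled out on a scalar double (a sketch, using memcpy for the bit-level view):

#include <cstdint>
#include <cstring>

// Flip the IEEE-754 sign bit with an integer XOR, as veorq_u64 with
// p2ul_CONJ_XOR does to the imaginary lane in pconj<Packet1cd> above.
double negate_via_xor(double x) {
  std::uint64_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  bits ^= 0x8000000000000000ull;
  std::memcpy(&x, &bits, sizeof bits);
  return x;
}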
- } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) - -template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) -{ - // TODO optimize it for NEON - Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = pmul(b.v, b.v); - Packet2d rev_s = preverse(s); - - return Packet1cd(pdiv(res.v, padd(s,rev_s))); -} - -EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v)); - kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); - kernel.packet[1].v = tmp; -} -#endif // EIGEN_ARCH_ARM64 - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_COMPLEX_NEON_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/MathFunctions.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/MathFunctions.h deleted file mode 100644 index 6bb05bb922a..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/MathFunctions.h +++ /dev/null @@ -1,91 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - -#ifndef EIGEN_MATH_FUNCTIONS_NEON_H -#define EIGEN_MATH_FUNCTIONS_NEON_H - -namespace Eigen { - -namespace internal { - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) -{ - Packet4f x = _x; - Packet4f tmp, fx; - - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - x = vminq_f32(x, p4f_exp_hi); - x = vmaxq_f32(x, p4f_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - Packet4ui mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, p4f_cephes_exp_C1); - Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); - z = vmulq_f32(x, x); - y = vaddq_f32(y, p4f_cephes_exp_p1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, p4f_1); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, p4i_0x7f); - mm = vshlq_n_s32(mm, 23); - Packet4f pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/PacketMath.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/PacketMath.h deleted file mode 100644 index 3d5ed0d240c..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/NEON/PacketMath.h +++ /dev/null @@ -1,760 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud -// Copyright (C) 2010 Konstantinos Margaritis -// Heavily based on Gael's SSE version. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
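The pexp kernel above is the classic Cephes scheme: clamp x to the representable range, split it as x = g + n*log(2) (with log(2) stored as the two constants C1 + C2 to limit cancellation), approximate exp(g) with a degree-5 polynomial, and rebuild 2^n by shifting n into the float exponent field. A plain scalar version with the same constants, offered as a cross-check (std::ldexp stands in for the vshlq_n_s32 exponent trick):

#include <algorithm>
#include <cmath>

float exp_cephes(float x) {
  x = std::min(std::max(x, -88.3762626647949f), 88.3762626647950f);
  float n = std::floor(x * 1.44269504088896341f + 0.5f);   // round(x / log 2)
  float g = x - n * 0.693359375f - n * (-2.12194440e-4f);  // two-part log(2)
  float p = 1.9875691500e-4f;                              // Horner for P(g)
  p = p * g + 1.3981999507e-3f;
  p = p * g + 8.3334519073e-3f;
  p = p * g + 4.1665795894e-2f;
  p = p * g + 1.6666665459e-1f;
  p = p * g + 5.0000001201e-1f;
  float y = p * g * g + g + 1.0f;                          // exp(g)
  return std::ldexp(y, static_cast<int>(n));               // exp(g) * 2^n
}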
- -#ifndef EIGEN_PACKET_MATH_NEON_H -#define EIGEN_PACKET_MATH_NEON_H - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 -#endif - -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#endif - -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#if EIGEN_ARCH_ARM64 -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 -#else -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 -#endif -#endif - -#if EIGEN_COMP_MSVC - -// In MSVC's arm_neon.h header file, all NEON vector types -// are aliases to the same underlying type __n128. -// We thus have to wrap them to make them different C++ types. -// (See also bug 1428) - -template -struct eigen_packet_wrapper -{ - operator T&() { return m_val; } - operator const T&() const { return m_val; } - eigen_packet_wrapper() {} - eigen_packet_wrapper(const T &v) : m_val(v) {} - eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } - - T m_val; -}; -typedef eigen_packet_wrapper Packet2f; -typedef eigen_packet_wrapper Packet4f; -typedef eigen_packet_wrapper Packet4i; -typedef eigen_packet_wrapper Packet2i; -typedef eigen_packet_wrapper Packet4ui; - -#else - -typedef float32x2_t Packet2f; -typedef float32x4_t Packet4f; -typedef int32x4_t Packet4i; -typedef int32x2_t Packet2i; -typedef uint32x4_t Packet4ui; - -#endif // EIGEN_COMP_MSVC - -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f p4f_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) - -#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i p4i_##NAME = pset1(X) - -#if EIGEN_ARCH_ARM64 - // __builtin_prefetch tends to do nothing on ARM64 compilers because the - // prefetch instructions there are too detailed for __builtin_prefetch to map - // meaningfully to them. 
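The MSVC note above is really about overload resolution: the whole packet layer dispatches on the C++ type of the packet, so if Packet4f and Packet4i were both aliases of the single builtin __n128, paired specializations would collide as redefinitions. A minimal illustration with hypothetical wrapper types (not Eigen's):

#include <arm_neon.h>
#include <cstdint>

// Distinct wrapper types keep the float and int overloads separate even
// when the underlying vector registers alias the same builtin type.
struct WrapF { float32x4_t v; };
struct WrapI { int32x4_t v; };
float        first(const WrapF& p) { return vgetq_lane_f32(p.v, 0); }
std::int32_t first(const WrapI& p) { return vgetq_lane_s32(p.v, 0); }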
- #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : ); -#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC - #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); -#elif defined __pld - #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif EIGEN_ARCH_ARM32 - #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); -#else - // by default no explicit prefetching - #define EIGEN_ARM_PREFETCH(ADDR) -#endif - -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4f type; - typedef Packet4f half; // Packet2f intrinsics not implemented yet - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket=0, // Packet2f intrinsics not implemented yet - - HasDiv = 1, - // FIXME check the Has* - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, - HasSqrt = 0 - }; -}; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4i type; - typedef Packet4i half; // Packet2i intrinsics not implemented yet - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket=0 // Packet2i intrinsics not implemented yet - // FIXME check the Has* - }; -}; - -#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM -// workaround gcc 4.2, 4.3 and 4.4 compilatin issue -EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } -EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } -#endif - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; - -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } - -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) -{ - const float f[] = {0, 1, 2, 3}; - Packet4f countdown = vld1q_f32(f); - return vaddq_f32(pset1(a), countdown); -} -template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) -{ - const int32_t i[] = {0, 1, 2, 3}; - Packet4i countdown = vld1q_s32(i); - return vaddq_s32(pset1(a), countdown); -} - -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } - -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const 
Packet4f& b) { return vmulq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ -#if EIGEN_ARCH_ARM64 - return vdivq_f32(a,b); -#else - Packet4f inv, restep, div; - - // NEON does not offer a divide instruction, we have to do a reciprocal approximation - // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers - // a reciprocal estimate AND a reciprocal step -which saves a few instructions - // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with - // Newton-Raphson and vrecpsq_f32() - inv = vrecpeq_f32(b); - - // This returns a differential, by which we will have to multiply inv to get a better - // approximation of 1/b. - restep = vrecpsq_f32(b, inv); - inv = vmulq_f32(restep, inv); - - // Finally, multiply a by 1/b and get the wanted result of the division. - div = vmulq_f32(a, inv); - - return div; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} - -// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, -// then implements a slow software scalar fallback calling fmaf()! -// Filed LLVM bug: -// https://llvm.org/bugs/show_bug.cgi?id=27216 -#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) -// See bug 936. -// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. -// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. -// MLA is not fused i.e. does 2 roundings. -// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): -// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } -#else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { -#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM - // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, - // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on - // -march=armv7-a, that is a very common case. - // See e.g. this thread: - // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html - // Filed LLVM bug: - // https://llvm.org/bugs/show_bug.cgi?id=27219 - Packet4f r = c; - asm volatile( - "vmla.f32 %q[r], %q[a], %q[b]" - : [r] "+w" (r) - : [a] "w" (a), - [b] "w" (b) - : ); - return r; -#else - return vmlaq_f32(c,a,b); -#endif -} -#endif - -// No FMA instruction for int, so use MLA unconditionally. 
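ARMv7 NEON has no vector divide, so pdiv above refines the rough vrecpeq_f32 estimate with a Newton-Raphson step built from vrecpsq_f32, which returns 2 - b*inv. A standalone sketch that adds a second refinement step (the code above performs one; the extra step trades a few cycles for accuracy closer to a true divide):

#include <arm_neon.h>

float32x4_t divide_nr(float32x4_t a, float32x4_t b) {
  float32x4_t inv = vrecpeq_f32(b);            // coarse estimate of 1/b
  inv = vmulq_f32(vrecpsq_f32(b, inv), inv);   // NR step: inv *= (2 - b*inv)
  inv = vmulq_f32(vrecpsq_f32(b, inv), inv);   // extra step (not done above)
  return vmulq_f32(a, inv);                    // a * (1/b)
}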
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } - -// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } - -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } - -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - float32x2_t lo, hi; - lo = vld1_dup_f32(from); - hi = vld1_dup_f32(from+1); - return vcombine_f32(lo, hi); -} -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) -{ - int32x2_t lo, hi; - lo = vld1_dup_s32(from); - hi = vld1_dup_s32(from+1); - return vcombine_s32(lo, hi); -} - -template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } - -template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } - -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(from[0*stride], res, 0); - res = vsetq_lane_f32(from[1*stride], res, 1); - res = vsetq_lane_f32(from[2*stride], res, 2); - res = 
vsetq_lane_f32(from[3*stride], res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) -{ - Packet4i res = pset1(0); - res = vsetq_lane_s32(from[0*stride], res, 0); - res = vsetq_lane_s32(from[1*stride], res, 1); - res = vsetq_lane_s32(from[2*stride], res, 2); - res = vsetq_lane_s32(from[3*stride], res, 3); - return res; -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - to[stride*0] = vgetq_lane_f32(from, 0); - to[stride*1] = vgetq_lane_f32(from, 1); - to[stride*2] = vgetq_lane_f32(from, 2); - to[stride*3] = vgetq_lane_f32(from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) -{ - to[stride*0] = vgetq_lane_s32(from, 0); - to[stride*1] = vgetq_lane_s32(from, 1); - to[stride*2] = vgetq_lane_s32(from, 2); - to[stride*3] = vgetq_lane_s32(from, 3); -} - -template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } - -// FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } - -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { - float32x2_t a_lo, a_hi; - Packet4f a_r64; - - a_r64 = vrev64q_f32(a); - a_lo = vget_low_f32(a_r64); - a_hi = vget_high_f32(a_r64); - return vcombine_f32(a_hi, a_lo); -} -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - int32x2_t a_lo, a_hi; - Packet4i a_r64; - - a_r64 = vrev64q_s32(a); - a_lo = vget_low_s32(a_r64); - a_hi = vget_high_s32(a_r64); - return vcombine_s32(a_hi, a_lo); -} - -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } - -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, sum; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - sum = vpadd_f32(a_lo, a_hi); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); -} - -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - float32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4f sum1, sum2, sum; - - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_f32(vecs[0], vecs[2]); - vtrn2 = vzipq_f32(vecs[1], vecs[3]); - res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); - - // Do the addition of the resulting vectors - sum1 = vaddq_f32(res1.val[0], res1.val[1]); - sum2 = vaddq_f32(res2.val[0], res2.val[1]); - sum = vaddq_f32(sum1, sum2); - - return sum; -} - -template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, sum; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - sum = vpadd_s32(a_lo, a_hi); - sum = vpadd_s32(sum, sum); - return vget_lane_s32(sum, 0); -} - -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - int32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4i sum1, sum2, sum; - - // NEON zip performs interleaving of the supplied vectors. 
- // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_s32(vecs[0], vecs[2]); - vtrn2 = vzipq_s32(vecs[1], vecs[3]); - res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); - - // Do the addition of the resulting vectors - sum1 = vaddq_s32(res1.val[0], res1.val[1]); - sum2 = vaddq_s32(res2.val[0], res2.val[1]); - sum = vaddq_s32(sum1, sum2); - - return sum; -} - -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, prod; - - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_f32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_f32(prod, vrev64_f32(prod)); - - return vget_lane_f32(prod, 0); -} -template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, prod; - - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_s32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_s32(prod, vrev64_s32(prod)); - - return vget_lane_s32(prod, 0); -} - -// min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, min; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - min = vpmin_f32(a_lo, a_hi); - min = vpmin_f32(min, min); - - return vget_lane_f32(min, 0); -} - -template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, min; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - min = vpmin_s32(a_lo, a_hi); - min = vpmin_s32(min, min); - - return vget_lane_s32(min, 0); -} - -// max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, max; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - max = vpmax_f32(a_lo, a_hi); - max = vpmax_f32(max, max); - - return vget_lane_f32(max, 0); -} - -template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, max; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - max = vpmax_s32(a_lo, a_hi); - max = vpmax_s32(max, max); - - return vget_lane_s32(max, 0); -} - -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -PALIGN_NEON(0,Packet4f,vextq_f32) -PALIGN_NEON(1,Packet4f,vextq_f32) -PALIGN_NEON(2,Packet4f,vextq_f32) -PALIGN_NEON(3,Packet4f,vextq_f32) -PALIGN_NEON(0,Packet4i,vextq_s32) -PALIGN_NEON(1,Packet4i,vextq_s32) -PALIGN_NEON(2,Packet4i,vextq_s32) -PALIGN_NEON(3,Packet4i,vextq_s32) - -#undef PALIGN_NEON - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); - - kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); - kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); - kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), 
vget_low_f32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
-  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
-  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
-}
-
-//---------- double ----------
-
-// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
-// Confirmed at least with __apple_build_version__ = 6000054.
-#ifdef __apple_build_version__
-// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
-// https://gist.github.com/yamaya/2924292 suggests that the first 3 digits are only updated with
-// major toolchain updates.
-#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
-#else
-#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
-#endif
-
-#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-
-// Bug 907: workaround missing declarations of the following two functions in the ADK.
-// Defining these functions as templates ensures that if these intrinsics are
-// already defined in arm_neon.h, then our workaround doesn't cause a conflict
-// and has lower priority in overload resolution.
-template <typename T>
-uint64x2_t vreinterpretq_u64_f64(T a)
-{
-  return (uint64x2_t) a;
-}
-
-template <typename T>
-float64x2_t vreinterpretq_f64_u64(T a)
-{
-  return (float64x2_t) a;
-}
-
-typedef float64x2_t Packet2d;
-typedef float64x1_t Packet1d;
-
-template<> struct packet_traits<double> : default_packet_traits
-{
-  typedef Packet2d type;
-  typedef Packet2d half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
-    HasHalfPacket = 0,
-
-    HasDiv  = 1,
-    // FIXME check the Has*
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 0,
-    HasSqrt = 0
-  };
-};
-
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
-
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
-{
-  const double countdown_raw[] = {0.0, 1.0};
-  const Packet2d countdown = vld1q_f64(countdown_raw);
-  return vaddq_f64(pset1<Packet2d>(a), countdown);
-}
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
-
-#ifdef __ARM_FEATURE_FMA
-// See bug 936. See above comment about FMA for float.
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
-#else
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
-
-// Logical operations are not supported for double, so we have to reinterpret-cast using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{
-  return vld1q_dup_f64(from);
-}
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
-  Packet2d res = pset1<Packet2d>(0.0);
-  res = vsetq_lane_f64(from[0*stride], res, 0);
-  res = vsetq_lane_f64(from[1*stride], res, 1);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
-  to[stride*0] = vgetq_lane_f64(from, 0);
-  to[stride*1] = vgetq_lane_f64(from, 1);
-}
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
-
-// FIXME only store the first 2 elements?
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
-
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
-
-#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
-// workaround ICE, see bug 907
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
-#else
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  float64x2_t trn1, trn2;
-
-  // NEON zip performs interleaving of the supplied vectors.
- // We perform two interleaves in a row to acquire the transposed vector - trn1 = vzip1q_f64(vecs[0], vecs[1]); - trn2 = vzip2q_f64(vecs[0], vecs[1]); - - // Do the addition of the resulting vectors - return vaddq_f64(trn1, trn2); -} -// Other reduction functions: -// mul -#if EIGEN_COMP_CLANG && defined(__apple_build_version__) -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } -#else -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } -#endif - -// min -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } - -// max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } - -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -PALIGN_NEON(0,Packet2d,vextq_f64) -PALIGN_NEON(1,Packet2d,vextq_f64) -#undef PALIGN_NEON - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); - float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); - - kernel.packet[0] = trn1; - kernel.packet[1] = trn2; -} -#endif // EIGEN_ARCH_ARM64 - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_PACKET_MATH_NEON_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/MathFunctions.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/MathFunctions.h deleted file mode 100644 index 7b5f948e119..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/MathFunctions.h +++ /dev/null @@ -1,562 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2007 Julien Pommier -// Copyright (C) 2009 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
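The Cephes-derived kernels in the file below evaluate their polynomial pieces as chains of pmadd calls, which is Horner's scheme: p(x) = (...((c_n*x + c_{n-1})*x + ...)*x + c_0), one multiply-add per coefficient. A scalar sketch of the pattern (illustrative only, not Eigen API; the function name is hypothetical):

    // Horner evaluation: one multiply-add per coefficient, highest degree first.
    // The packet versions below do exactly this with pmadd, four lanes at a time.
    float horner(const float* coeffs, int degree, float x) {
      float acc = coeffs[degree];
      for (int i = degree - 1; i >= 0; --i)
        acc = acc * x + coeffs[i];
      return acc;
    }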
- -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - -#ifndef EIGEN_MATH_FUNCTIONS_SSE_H -#define EIGEN_MATH_FUNCTIONS_SSE_H - -namespace Eigen { - -namespace internal { - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - - - Packet4i emm0; - - Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN - Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); - - /* keep only the fractional part */ - x = _mm_and_ps(x, p4f_inv_mant_mask); - x = _mm_or_ps(x, p4f_half); - - emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), - _mm_and_ps(iszero_mask, p4f_minus_inf)); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - - 
_EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-  Packet4f tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_ps(fx);
-#else
-  emm0 = _mm_cvttps_epi32(fx);
-  tmp  = _mm_cvtepi32_ps(emm0);
-  /* if greater, subtract 1 */
-  Packet4f mask = _mm_cmpgt_ps(tmp, fx);
-  mask = _mm_and_ps(mask, p4f_1);
-  fx = psub(tmp, mask);
-#endif
-
-  tmp = pmul(fx, p4f_cephes_exp_C1);
-  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  z = pmul(x,x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
-
-  // build 2^n
-  emm0 = _mm_cvttps_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_0x7f);
-  emm0 = _mm_slli_epi32(emm0, 23);
-  return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
-}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
-{
-  Packet2d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
-  Packet2d tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_pd(fx);
-#else
-  emm0 = _mm_cvttpd_epi32(fx);
-  tmp  = _mm_cvtepi32_pd(emm0);
-  /* if greater, subtract 1 */
-  Packet2d mask = _mm_cmpgt_pd(tmp, fx);
-  mask = _mm_and_pd(mask, p2d_1);
-  fx = psub(tmp, mask);
-#endif
-
-  tmp = pmul(fx, p2d_cephes_exp_C1);
-  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet2d x2 = pmul(x,x);
-
-  Packet2d px = p2d_cephes_exp_p0;
-  px = pmadd(px, x2, p2d_cephes_exp_p1);
-  px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul (px, x);
-
-  Packet2d qx = p2d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
-  x = pdiv(px,psub(qx,px));
-  x = pmadd(p2d_2,x,p2d_1);
-
-  // build 2^n
-  emm0 = _mm_cvttpd_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
-  emm0 = _mm_slli_epi32(emm0, 20);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
-}
-
-/* evaluation of 4 sines at once, using SSE2 intrinsics.
-
-   The code is the exact rewriting of the cephes sinf function.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-*/
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psin<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
-  Packet4i emm0, emm2;
-  sign_bit = x;
-  /* take the absolute value */
-  x = pabs(x);
-
-  /* take the modulo */
-
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* store the integer part of y in mm0 */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-  /* get the swap sign flag */
-  emm0 = _mm_and_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynomial selection mask:
-     there is one polynomial for 0 <= x <= Pi/4
-     and another one for Pi/4 < x <= Pi/2.
-     Both branches will be computed. */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = _mm_mul_ps(x,x);
-
-  y = pmadd(y, z, p4f_coscof_p1);
-  y = pmadd(y, z, p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = pmul(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmul(y2, x);
-  y2 = padd(y2, x);
-
-  /* select the correct result from the two polynomials */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y  = _mm_andnot_ps(poly_mask, y);
-  y  = _mm_or_ps(y,y2);
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
-}
-
-/* almost the same as psin */
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pcos<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3,
-3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, y; - Packet4i emm0, emm2; - - x = pabs(x); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* get the integer part of y */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - - emm2 = _mm_sub_epi32(emm2, p4i_2); - - /* get the swap sign flag */ - emm0 = _mm_andnot_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask */ - emm2 = _mm_and_si128(emm2, p4i_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - - Packet4f sign_bit = _mm_castsi128_ps(emm0); - Packet4f poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = pmul(y, p4f_minus_cephes_DP1); - xmm2 = pmul(y, p4f_minus_cephes_DP2); - xmm3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, xmm1); - x = padd(x, xmm2); - x = padd(x, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = p4f_coscof_p0; - Packet4f z = pmul(x,x); - - y = pmadd(y,z,p4f_coscof_p1); - y = pmadd(y,z,p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - Packet4f tmp = _mm_mul_ps(z, p4f_half); - y = psub(y, tmp); - y = padd(y, p4f_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmadd(y2, x, x); - - /* select the correct result from the two polynoms */ - y2 = _mm_and_ps(poly_mask, y2); - y = _mm_andnot_ps(poly_mask, y); - y = _mm_or_ps(y,y2); - - /* update the sign */ - return _mm_xor_ps(y, sign_bit); -} - -#if EIGEN_FAST_MATH - -// Functions for sqrt. -// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step -// of Newton's method, at a cost of 1-2 bits of precision as opposed to the -// exact solution. It does not handle +inf, or denormalized numbers correctly. -// The main advantage of this approach is not just speed, but also the fact that -// it can be inlined and pipelined with other computations, further reducing its -// effective latency. This is similar to Quake3's fast inverse square root. -// For detail see here: http://www.beyond3d.com/content/articles/8/ -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psqrt(const Packet4f& _x) -{ - Packet4f half = pmul(_x, pset1(.5f)); - Packet4f denormal_mask = _mm_and_ps( - _mm_cmpge_ps(_x, _mm_setzero_ps()), - _mm_cmplt_ps(_x, pset1((std::numeric_limits::min)()))); - - // Compute approximate reciprocal sqrt. - Packet4f x = _mm_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); - // Flush results for denormals to zero. 
-  return _mm_andnot_ps(denormal_mask, pmul(_x,x));
-}
-
-#else
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
-
-#endif
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
-
-#if EIGEN_FAST_MATH
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
-  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
-
-  Packet4f neg_half = pmul(_x, p4f_minus_half);
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
-  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
-
-  // Fill in NaNs and Infs for the negative/zero entries.
-  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
-  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
-  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
-                                     _mm_and_ps(zero_mask, p4f_inf));
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
-
-  // Insert NaNs and Infs in all the right places.
-  return _mm_or_ps(x, infs_and_nans);
-}
-
-#else
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
-  // Unfortunately we can't use the much faster _mm_rsqrt_ps since it only provides an approximation.
-  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
-}
-
-#endif
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d prsqrt<Packet2d>(const Packet2d& x) {
-  // Unfortunately we can't use the much faster _mm_rsqrt_pd since it only provides an approximation.
-  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
-}
-
-// Hyperbolic Tangent function.
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& x) {
-  return internal::generic_fast_tanh_float(x);
-}
-
-} // end namespace internal
-
-namespace numext {
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-float sqrt(const float &x)
-{
-  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-double sqrt(const double &x)
-{
-#if EIGEN_COMP_GNUC_STRICT
-  // This works around a GCC bug generating poor code for _mm_sqrt_pd
-  // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b
-  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
-#else
-  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
-#endif
-}
-
-} // end namespace numext
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/PacketMath.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/PacketMath.h
deleted file mode 100755
index 60e2517e4bd..00000000000
--- a/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/PacketMath.h
+++ /dev/null
@@ -1,895 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-// -// Copyright (C) 2008-2009 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_SSE_H -#define EIGEN_PACKET_MATH_SSE_H - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 -#endif - -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) -#endif - -#ifdef __FMA__ -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 -#endif -#endif - -#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX -// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot -// have overloads for both types without linking error. -// One solution is to increase ABI version using -fabi-version=4 (or greater). -// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper -// structure: -template -struct eigen_packet_wrapper -{ - EIGEN_ALWAYS_INLINE operator T&() { return m_val; } - EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } - EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } - - T m_val; -}; -typedef eigen_packet_wrapper<__m128> Packet4f; -typedef eigen_packet_wrapper<__m128i> Packet4i; -typedef eigen_packet_wrapper<__m128d> Packet2d; -#else -typedef __m128 Packet4f; -typedef __m128i Packet4i; -typedef __m128d Packet2d; -#endif - -template<> struct is_arithmetic<__m128> { enum { value = true }; }; -template<> struct is_arithmetic<__m128i> { enum { value = true }; }; -template<> struct is_arithmetic<__m128d> { enum { value = true }; }; - -#define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) - -#define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) - -#define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - -#define vec4f_swizzle2(a,b,p,q,r,s) \ - (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) - -#define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) - -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f p4f_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ - const Packet2d p2d_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) - -#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i p4i_##NAME = pset1(X) - - -// Use the packet_traits defined in AVX/PacketMath.h instead if we're going -// to leverage AVX instructions. 
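The swizzle macros above pack four 2-bit lane selectors into the 8-bit immediate expected by _mm_shuffle_ps/_mm_shuffle_epi32: (s<<6 | r<<4 | q<<2 | p) selects source lanes p,q,r,s for result lanes 0..3, exactly like the standard _MM_SHUFFLE(s,r,q,p) helper. A small standalone illustration, not Eigen code:

    #include <xmmintrin.h>

    // Reverse the four lanes of v: result lanes 0..3 take source lanes 3,2,1,0,
    // i.e. immediate (0<<6 | 1<<4 | 2<<2 | 3) == _MM_SHUFFLE(0,1,2,3) == 0x1B,
    // the same 0x1B pattern preverse uses later in this file.
    static inline __m128 reverse_ps(__m128 v) {
      return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
    }

The packet_traits specializations just below are compiled only when AVX is not enabled, as the preceding comment explains; the AVX header provides its own.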
-#ifndef EIGEN_VECTORIZE_AVX -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4f type; - typedef Packet4f half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 0, - - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasBlend = 1 - -#ifdef EIGEN_VECTORIZE_SSE4_1 - , - HasRound = 1, - HasFloor = 1, - HasCeil = 1 -#endif - }; -}; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet2d type; - typedef Packet2d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - - HasDiv = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasBlend = 1 - -#ifdef EIGEN_VECTORIZE_SSE4_1 - , - HasRound = 1, - HasFloor = 1, - HasCeil = 1 -#endif - }; -}; -#endif -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4i type; - typedef Packet4i half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - - HasBlend = 1 - }; -}; - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; - -#ifndef EIGEN_VECTORIZE_AVX -template<> struct scalar_div_cost { enum { value = 7 }; }; -template<> struct scalar_div_cost { enum { value = 8 }; }; -#endif - -#if EIGEN_COMP_MSVC==1500 -// Workaround MSVC 9 internal compiler error. -// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode -// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)). -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps(from,from,from,from); } -template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); } -#else -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } -template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } -#endif - -// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. -// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) -// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. -// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. -// Also note that with AVX, we want it to generate a vbroadcastss. 
-#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
-template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
-  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
-}
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
-  return _mm_xor_ps(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
-  return _mm_xor_pd(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
-  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_mullo_epi32(a,b);
-#else
-  // this version is slightly faster than 4 scalar products
-  return vec4i_swizzle1(
-            vec4i_swizzle2(
-              _mm_mul_epu32(a,b),
-              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
-                            vec4i_swizzle1(b,1,0,3,2)),
-              0,2,0,2),
-            0,2,1,3);
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
-
-// for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef __FMA__
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return
_mm_min_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_min_epi32(a,b); -#else - // after some bench, this version *is* faster than a scalar implementation - Packet4i mask = _mm_cmplt_epi32(a,b); - return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_max_epi32(a,b); -#else - // after some bench, this version *is* faster than a scalar implementation - Packet4i mask = _mm_cmpgt_epi32(a,b); - return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); -#endif -} - -#ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } - -template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } -template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } - -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } -template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } -#endif - -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } - -#if EIGEN_COMP_MSVC - template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - #if (EIGEN_COMP_MSVC==1600) - // NOTE Some version of MSVC10 generates bad code when using 
_mm_loadu_ps - // (i.e., it does not generate an unaligned load!! - __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); - res = _mm_loadh_pi(res, (const __m64*)(from+2)); - return res; - #else - return _mm_loadu_ps(from); - #endif - } -#else -// NOTE: with the code below, MSVC's compiler crashes! - -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return _mm_loadu_ps(from); -} -#endif - -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return _mm_loadu_pd(from); -} -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return _mm_loadu_si128(reinterpret_cast(from)); -} - - -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast(from))), 0, 0, 1, 1); -} -template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) -{ return pset1(from[0]); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) -{ - Packet4i tmp; - tmp = _mm_loadl_epi64(reinterpret_cast(from)); - return vec4i_swizzle1(tmp, 0, 0, 1, 1); -} - -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } - -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } - -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); -} -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) -{ - return _mm_set_pd(from[1*stride], from[0*stride]); -} -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) -{ - return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); - } - -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - to[stride*0] = _mm_cvtss_f32(from); - to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); - to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); - to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) -{ - to[stride*0] = _mm_cvtsd_f64(from); - to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) -{ - to[stride*0] = _mm_cvtsi128_si32(from); - to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); - to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); - to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); -} - -// some compilers might be tempted to perform multiple moves instead of using a vector path. 
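The strided pgather/pscatter specializations above are the packet-wide form of a plain strided copy. A scalar sketch of the same access pattern (illustrative only, not Eigen API; the function names are hypothetical):

    #include <cstddef>

    // What pgather does, one lane at a time: read n values spaced `stride` apart.
    void gather(float* dst, const float* src, std::size_t n, std::ptrdiff_t stride) {
      for (std::size_t i = 0; i < n; ++i) dst[i] = src[i * stride];
    }
    // What pscatter does: write n contiguous values back out with a stride.
    void scatter(float* dst, const float* src, std::size_t n, std::ptrdiff_t stride) {
      for (std::size_t i = 0; i < n; ++i) dst[i * stride] = src[i];
    }

The pstore1 overloads that follow keep scalar broadcasts on the vector path, which, as the preceding comment warns, some compilers would otherwise turn into multiple moves.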
-template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) -{ - Packet4f pa = _mm_set_ss(a); - pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); -} -// some compilers might be tempted to perform multiple moves instead of using a vector path. -template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) -{ - Packet2d pa = _mm_set_sd(a); - pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); -} - -#if EIGEN_COMP_PGI -typedef const void * SsePrefetchPtrType; -#else -typedef const char * SsePrefetchPtrType; -#endif - -#ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } -#endif - -#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 -// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 -// Direct of the struct members fixed bug #62. -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return a.m128_f32[0]; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return a.m128d_f64[0]; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } -#elif EIGEN_COMP_MSVC_STRICT -// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } -#else -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return _mm_cvtss_f32(a); } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } -#endif - -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ return _mm_shuffle_ps(a,a,0x1B); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ return _mm_shuffle_pd(a,a,0x1); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ return _mm_shuffle_epi32(a,0x1B); } - -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); - return _mm_and_ps(a,mask); -} -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) -{ - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); - return _mm_and_pd(a,mask); -} -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) -{ - #ifdef EIGEN_VECTORIZE_SSSE3 - return _mm_abs_epi32(a); - #else - Packet4i aux = _mm_srai_epi32(a,31); - return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); - #endif -} - -// with AVX, the default implementations based on pload1 are faster -#ifndef __AVX__ -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec4f_swizzle1(a3, 0,0,0,0); - a1 = vec4f_swizzle1(a3, 1,1,1,1); - a2 = vec4f_swizzle1(a3, 2,2,2,2); - a3 = vec4f_swizzle1(a3, 3,3,3,3); -} -template<> 
EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
-                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
-#ifdef EIGEN_VECTORIZE_SSE3
-  a0 = _mm_loaddup_pd(a+0);
-  a1 = _mm_loaddup_pd(a+1);
-  a2 = _mm_loaddup_pd(a+2);
-  a3 = _mm_loaddup_pd(a+3);
-#else
-  a1 = pload<Packet2d>(a);
-  a0 = vec2d_swizzle1(a1, 0,0);
-  a1 = vec2d_swizzle1(a1, 1,1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec2d_swizzle1(a3, 0,0);
-  a3 = vec2d_swizzle1(a3, 1,1);
-#endif
-}
-#endif
-
-EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
-{
-  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
-  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
-  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
-  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
-}
-
-#ifdef EIGEN_VECTORIZE_SSE3
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_hadd_pd(vecs[0], vecs[1]);
-}
-
-#else
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  Packet4f tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
-  tmp0 = _mm_add_ps(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
-  tmp1 = _mm_add_ps(tmp1, tmp2);
-  tmp2 = _mm_movehl_ps(tmp1, tmp0);
-  tmp0 = _mm_movelh_ps(tmp0, tmp1);
-  return _mm_add_ps(tmp0, tmp2);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
-}
-#endif // SSE3
-
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel architectures
-  // (from Nehalem to Haswell)
-// #ifdef EIGEN_VECTORIZE_SSE3
-//   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
-//   return pfirst(_mm_hadd_ps(tmp, tmp));
-// #else
-  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-// #endif
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
-  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel architectures
-  // (from Nehalem to Haswell)
-// #ifdef EIGEN_VECTORIZE_SSE3
-//   return pfirst(_mm_hadd_pd(a, a));
-// #else
-  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
-// #endif
-}
-
-#ifdef EIGEN_VECTORIZE_SSSE3
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-}
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  Packet4i tmp0 = _mm_hadd_epi32(a,a);
-  return pfirst(_mm_hadd_epi32(tmp0,tmp0));
-}
-#else
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
-  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  Packet4i tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-  tmp0 = _mm_add_epi32(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
-  tmp1 = _mm_add_epi32(tmp1, tmp2);
-  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
-  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
-  return _mm_add_epi32(tmp0, tmp2);
-}
-#endif
-// Other reduction functions:
-
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
-  // after some experiments, it seems this is the fastest way to implement it
-  // for GCC (e.g., reusing pmul is very slow!)
-  // TODO try to call _mm_mul_epu32 directly
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
-}
-
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
-#else
-  // after some experiments, it seems this is the fastest way to implement it
-  // for GCC (e.g., it does not like using std::min after the pstore!)
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
-  return aux0<aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
-}
-
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
-#else
-  // after some experiments, it seems this is the fastest way to implement it
-  // for GCC (e.g., it does not like using std::max after the pstore!)
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
-  return aux0>aux2 ?
aux0 : aux2; -#endif // EIGEN_VECTORIZE_SSE4_1 -} - -#if EIGEN_COMP_GNUC -// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -// { -// Packet4f res = b; -// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); -// return res; -// } -// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) -// { -// Packet4i res = a; -// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); -// return res; -// } -#endif - -#ifdef EIGEN_VECTORIZE_SSSE3 -// SSSE3 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset!=0) - first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset!=0) - first = _mm_alignr_epi8(second,first, Offset*4); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); - } -}; -#else -// SSE2 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset==1) - { - first = _mm_move_ss(first,second); - first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); - } - else if (Offset==2) - { - first = _mm_movehl_ps(first,first); - first = _mm_movelh_ps(first,second); - } - else if (Offset==3) - { - first = _mm_move_ss(first,second); - first = _mm_shuffle_ps(first,second,0x93); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset==1) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_shuffle_epi32(first,0x39); - } - else if (Offset==2) - { - first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); - first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - } - else if (Offset==3) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - { - first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); - first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); - } - } -}; -#endif - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); - kernel.packet[1] = tmp; -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); - __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); - __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); - __m128i T3 = 
_mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); - - kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); - kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); - kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); - kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); -} - -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - const __m128i zero = _mm_setzero_si128(); - const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m128i false_mask = _mm_cmpeq_epi32(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); -#else - return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); -#endif -} -template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - const __m128 zero = _mm_setzero_ps(); - const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); - __m128 false_mask = _mm_cmpeq_ps(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_ps(thenPacket, elsePacket, false_mask); -#else - return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); -#endif -} -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - const __m128d zero = _mm_setzero_pd(); - const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); - __m128d false_mask = _mm_cmpeq_pd(select, zero); -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_pd(thenPacket, elsePacket, false_mask); -#else - return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),1); -#else - return _mm_move_ss(a, _mm_load_ss(&b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_pd(a,pset1(b),1); -#else - return _mm_move_sd(a, _mm_load_sd(&b)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),(1<<3)); -#else - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); - return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1(b))); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_pd(a,pset1(b),(1<<1)); -#else - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); - return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1(b))); -#endif -} - -// Scalar path for pmadd with FMA to ensure consistency with vectorized path. -#ifdef __FMA__ -template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { - return ::fmaf(a,b,c); -} -template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { - return ::fma(a,b,c); -} -#endif - -} // end namespace internal - -} // end namespace Eigen - -#if EIGEN_COMP_PGI -// PGI++ does not define the following intrinsics in C++ mode. 
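// --- [Editorial aside, not part of the Eigen sources] --------------------
// The pblend/pinsert kernels above emulate a per-lane select on plain SSE2
// with the andnot/and/or idiom, and switch to a single blend instruction when
// SSE4.1 is available. A minimal standalone sketch of that pattern, assuming
// a mask whose lanes are all-ones or all-zeros (the helper name is ours):
#include <emmintrin.h>   // SSE2
#ifdef __SSE4_1__
#include <smmintrin.h>   // _mm_blendv_ps
#endif
static inline __m128 select_ps(__m128 mask, __m128 a, __m128 b)
{
#ifdef __SSE4_1__
  return _mm_blendv_ps(b, a, mask);           // a where mask is set, else b
#else
  return _mm_or_ps(_mm_and_ps(mask, a),       // a-lanes selected by the mask
                   _mm_andnot_ps(mask, b));   // b-lanes everywhere else
#endif
}
// e.g. select_ps(_mm_cmpgt_ps(a, b), a, b) computes a per-lane max(a, b).
// --------------------------------------------------------------------------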
-static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } -static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } -static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); } -static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); } -static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); } -static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); } -#endif - -#endif // EIGEN_PACKET_MATH_SSE_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/TypeCasting.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/TypeCasting.h deleted file mode 100644 index c6ca8c716c0..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/SSE/TypeCasting.h +++ /dev/null @@ -1,77 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_TYPE_CASTING_SSE_H -#define EIGEN_TYPE_CASTING_SSE_H - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_VECTORIZE_AVX -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 2, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 2 - }; -}; -#endif - -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return _mm_cvttps_epi32(a); -} - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return _mm_cvtepi32_ps(a); -} - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { - return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); -} - -template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { - // Simply discard the second half of the input - return _mm_cvtps_pd(a); -} - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_SSE_H diff --git a/lib/eigen_3.3.9/Eigen/src/Core/arch/ZVector/MathFunctions.h b/lib/eigen_3.3.9/Eigen/src/Core/arch/ZVector/MathFunctions.h deleted file mode 100644 index 5c7aa725678..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ /dev/null @@ -1,137 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2007 Julien Pommier -// Copyright (C) 2009 Gael Guennebaud -// Copyright (C) 2016 Konstantinos Margaritis -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
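// --- [Editorial aside, not part of the Eigen sources] --------------------
// The TypeCasting.h kernels deleted above map Eigen's vectorized pcast onto
// raw SSE conversions: float -> int uses _mm_cvttps_epi32, which truncates
// toward zero like a C cast, and int -> float uses _mm_cvtepi32_ps. A small
// self-contained illustration of those two intrinsics:
#include <emmintrin.h>
#include <cstdio>
int main()
{
  __m128  f = _mm_setr_ps(1.9f, -1.9f, 2.5f, -2.5f);
  __m128i i = _mm_cvttps_epi32(f);   // truncation: 1, -1, 2, -2
  __m128  g = _mm_cvtepi32_ps(i);    // exact int -> float conversion
  alignas(16) int   vi[4]; _mm_store_si128(reinterpret_cast<__m128i*>(vi), i);
  alignas(16) float vf[4]; _mm_store_ps(vf, g);
  std::printf("%d %d %d %d\n", vi[0], vi[1], vi[2], vi[3]);
  std::printf("%g %g %g %g\n", vf[0], vf[1], vf[2], vf[3]);
}
// --------------------------------------------------------------------------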
- -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - -#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H -#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H - -namespace Eigen { - -namespace internal { - -static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); -static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); -static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - -static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); -static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d pexp(const Packet2d& _x) -{ - Packet2d x = _x; - - Packet2d tmp, fx; - Packet2l emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); - - fx = vec_floor(fx); - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = vec_ctsl(fx, 0); - - static const Packet2l p2l_1023 = { 1023, 1023 }; - static const Packet2ul p2ul_52 = { 52, 52 }; - - emm0 = emm0 + p2l_1023; - emm0 = emm0 << reinterpret_cast(p2ul_52); - - // Altivec's max & min operators just drop silent NaNs. Check NaNs in - // inputs and return them unmodified. - Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), - isnumber_mask); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& x) -{ - Packet4f res; - res.v4f[0] = pexp(x.v4f[0]); - res.v4f[1] = pexp(x.v4f[1]); - return res; -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d psqrt(const Packet2d& x) -{ - return __builtin_s390_vfsqdb(x); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psqrt(const Packet4f& x) -{ - Packet4f res; - res.v4f[0] = psqrt(x.v4f[0]); - res.v4f[1] = psqrt(x.v4f[1]); - return res; -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. 
-  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
-  Packet4f res;
-  res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
-  res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
-  return res;
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/lib/eigen_3.3.9/Eigen/src/Core/products/GeneralMatrixVector.h b/lib/eigen_3.3.9/Eigen/src/Core/products/GeneralMatrixVector.h
deleted file mode 100644
index a597c1f4ee6..00000000000
--- a/lib/eigen_3.3.9/Eigen/src/Core/products/GeneralMatrixVector.h
+++ /dev/null
@@ -1,619 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
-#define EIGEN_GENERAL_MATRIX_VECTOR_H
-
-namespace Eigen {
-
-namespace internal {
-
-/* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at once, which both reduces
- * the number of loads/stores of the result by a factor of 4 and reduces
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
- *
- * Mixing type logic: C += alpha * A * B
- *  |  A  |  B  |alpha| comments
- *  |real |cplx |cplx | no vectorization
- *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
- *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
- *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
- *
- * Accesses to the matrix coefficients follow the following logic:
- *
- * - if all columns have the same alignment then
- *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
- *   - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise
- *   - if even columns have the same alignment then
- *     // odd columns are guaranteed to have the same alignment too
- *     - if even or odd columns have the same alignment as the result, then
- *       // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
- *       - perform half aligned and half unaligned loads (-> EvenAligned case)
- *     - otherwise perform unaligned loads only (-> NoneAligned case)
- *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
- *     - one out of 4 consecutive columns is guaranteed to be aligned with the result vector;
- *       perform simple aligned loads for this column and aligned loads plus re-alignment for the others. (-> FirstAligned case)
- *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
- *   - otherwise,
- *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
- *     // we currently fall back to the NoneAligned case
- *
- * The same reasoning applies to the transposed case.
- *
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
- * strategy as in the FirstAligned case.
The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow - * compared to unaligned loads on a 4 byte boundary. - * - */ -template -struct general_matrix_vector_product -{ - typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1 -}; - -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; - -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; - -EIGEN_DONT_INLINE static void run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - ResScalar* res, Index resIncr, - RhsScalar alpha); -}; - -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - ResScalar* res, Index resIncr, - RhsScalar alpha) -{ - EIGEN_UNUSED_VARIABLE(resIncr); - eigen_internal_assert(resIncr==1); - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ - pstore(&res[j], \ - padd(pload(&res[j]), \ - padd( \ - padd(pcj.pmul(lhs0.template load(j), ptmp0), \ - pcj.pmul(lhs1.template load(j), ptmp1)), \ - padd(pcj.pmul(lhs2.template load(j), ptmp2), \ - pcj.pmul(lhs3.template load(j), ptmp3)) ))) - - typedef typename LhsMapper::VectorMapper LhsScalars; - - conj_helper cj; - conj_helper pcj; - if(ConjugateRhs) - alpha = numext::conj(alpha); - - enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; - const Index columnsAtOnce = 4; - const Index peels = 2; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index ResPacketAlignedMask = ResPacketSize-1; -// const Index PeelAlignedMask = ResPacketSize*peels-1; - const Index size = rows; - - const Index lhsStride = lhs.stride(); - - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type. - Index alignedStart = internal::first_default_aligned(res,size); - Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(size); - - // find how many columns do we have to skip to be aligned with the result (if possible) - Index skipColumns = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) ) - { - alignedSize = 0; - alignedStart = 0; - alignmentPattern = NoneAligned; - } - else if(LhsPacketSize > 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. - // Currently, it seems to be better to perform unaligned loads anyway - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) - { - // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size= cols) - || LhsPacketSize > size - || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ - } - else if(Vectorizable) - { - alignedStart = 0; - alignedSize = size; - alignmentPattern = AllAligned; - } - - const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1; - const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3; - - Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; - for (Index i=skipColumns; i(alpha*rhs(i, 0)), - ptmp1 = pset1(alpha*rhs(i+offset1, 0)), - ptmp2 = pset1(alpha*rhs(i+2, 0)), - ptmp3 = pset1(alpha*rhs(i+offset3, 0)); - - // this helps a lot generating better binary code - const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), - lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); - - if (Vectorizable) - { - /* explicit vectorization */ - // process initial unaligned coeffs - for (Index j=0; jalignedStart) - { - switch(alignmentPattern) - { - case AllAligned: - for (Index j = alignedStart; j1) - { - LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; - ResPacket T0, T1; - - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - A00 = lhs0.template load(j); - A10 = lhs0.template load(j+LhsPacketSize); - T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); - T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); - - T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - T0 = pcj.pmadd(A03, ptmp3, T0); - pstore(&res[j],T0); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - T1 = pcj.pmadd(A11, ptmp1, T1); - T1 = pcj.pmadd(A12, ptmp2, T1); - T1 = pcj.pmadd(A13, ptmp3, T1); - pstore(&res[j+ResPacketSize],T1); - } - } - for (; j(alpha*rhs(k, 0)); - const LhsScalars lhs0 = lhs.getVectorMapper(0, k); - - if (Vectorizable) - { - /* explicit vectorization */ - // process first unaligned result's coeffs - for (Index j=0; j(alignedStart)) - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); - else - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); - } - - // process remaining scalars (or all if no explicit vectorization) - for (Index i=alignedSize; i -struct general_matrix_vector_product -{ -typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? 
packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1 -}; - -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; - -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; - -EIGEN_DONT_INLINE static void run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - ResScalar* res, Index resIncr, - ResScalar alpha); -}; - -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - ResScalar* res, Index resIncr, - ResScalar alpha) -{ - eigen_internal_assert(rhs.stride()==1); - - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ - RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ - ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ - ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ - ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } - - conj_helper cj; - conj_helper pcj; - - typedef typename LhsMapper::VectorMapper LhsScalars; - - enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; - const Index rowsAtOnce = 4; - const Index peels = 2; - const Index RhsPacketAlignedMask = RhsPacketSize-1; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index depth = cols; - const Index lhsStride = lhs.stride(); - - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type - // if that's not the case then vectorization is discarded, see below. - Index alignedStart = rhs.firstAligned(depth); - Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(depth); - const Index rhsAlignmentOffset = rhs.firstAligned(rows); - - // find how many rows do we have to skip to be aligned with rhs (if possible) - Index skipRows = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || - (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || - (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) - { - alignedSize = 0; - alignedStart = 0; - alignmentPattern = NoneAligned; - } - else if(LhsPacketSize > 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. 
- alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) - { - // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth= rows) - || LhsPacketSize > depth - || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ - } - else if(Vectorizable) - { - alignedStart = 0; - alignedSize = depth; - alignmentPattern = AllAligned; - } - - const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1; - const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3; - - Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; - for (Index i=skipRows; i(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), - ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); - - // process initial unaligned coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; jalignedStart) - { - switch(alignmentPattern) - { - case AllAligned: - for (Index j = alignedStart; j1) - { - /* Here we proccess 4 rows with with two peeled iterations to hide - * the overhead of unaligned loads. Moreover unaligned loads are handled - * using special shift/move operations between the two aligned packets - * overlaping the desired unaligned packet. This is *much* more efficient - * than basic unaligned loads. - */ - LhsPacket A01, A02, A03, A11, A12, A13; - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(0); - A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); - ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - - b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); - ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); - ptmp1 = pcj.pmadd(A11, b, ptmp1); - ptmp2 = pcj.pmadd(A12, b, ptmp2); - ptmp3 = pcj.pmadd(A13, b, ptmp3); - } - } - for (; j(tmp0); - const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); - // process first unaligned result's coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; jalignedStart) - { - // process aligned rhs coeffs - if (lhs0.template aligned(alignedStart)) - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - else - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - tmp0 += predux(ptmp0); - } - - // process remaining scalars - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j -// Copyright (C) 2009-2010 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
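// --- [Editorial aside, not part of the Eigen sources] --------------------
// The matrix-vector kernels above all follow the same three-phase loop
// structure: scalar work up to the first aligned result coefficient, an
// aligned packet body, and a scalar tail. A minimal standalone sketch of
// that structure (a plain array sum; the function name is ours), which also
// uses the movehl/shuffle horizontal add that the SSE predux kernels above
// prefer over the slow _mm_hadd_ps:
#include <emmintrin.h>
#include <cstdint>
float sum_aligned_phases(const float* x, long n)
{
  long i = 0;
  float acc = 0.f;
  // Phase 1: scalar head until x+i sits on a 16-byte boundary.
  while (i < n && reinterpret_cast<std::uintptr_t>(x + i) % 16 != 0)
    acc += x[i++];
  // Phase 2: aligned packet body, 4 floats per step.
  __m128 vacc = _mm_setzero_ps();
  for (; i + 4 <= n; i += 4)
    vacc = _mm_add_ps(vacc, _mm_load_ps(x + i));
  __m128 t = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); // lanes {0+2, 1+3}
  t = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));             // + lane 1
  acc += _mm_cvtss_f32(t);
  // Phase 3: scalar tail for the remaining 0..3 elements.
  for (; i < n; ++i)
    acc += x[i];
  return acc;
}
// --------------------------------------------------------------------------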
- -#ifndef EIGEN_GEOMETRY_SSE_H -#define EIGEN_GEOMETRY_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct quat_product -{ - enum { - AAlignment = traits::Alignment, - BAlignment = traits::Alignment, - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - Quaternion res; - const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - __m128 a = _a.coeffs().template packet(0); - __m128 b = _b.coeffs().template packet(0); - __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); - __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); - pstoret( - &res.x(), - _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), - _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), - vec4f_swizzle1(b,1,2,0,0))), - _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); - - return res; - } -}; - -template -struct quat_conj -{ - enum { - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& q) - { - Quaternion res; - const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstoret(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet::Alignment>(0))); - return res; - } -}; - - -template -struct cross3_impl -{ - enum { - ResAlignment = traits::type>::Alignment - }; - static inline typename plain_matrix_type::type - run(const VectorLhs& lhs, const VectorRhs& rhs) - { - __m128 a = lhs.template packet::Alignment>(0); - __m128 b = rhs.template packet::Alignment>(0); - __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); - __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); - typename plain_matrix_type::type res; - pstoret(&res.x(),_mm_sub_ps(mul1,mul2)); - return res; - } -}; - - - - -template -struct quat_product -{ - enum { - BAlignment = traits::Alignment, - ResAlignment = traits >::Alignment - }; - - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - - Quaternion res; - - const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); - Packet2d a_xx = pset1(a[0]); - Packet2d a_yy = pset1(a[1]); - Packet2d a_zz = pset1(a[2]); - Packet2d a_ww = pset1(a[3]); - - // two temporaries: - Packet2d t1, t2; - - /* - * t1 = ww*xy + yy*zw - * t2 = zz*xy - xx*zw - * res.xy = t1 +/- swap(t2) - */ - t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); - t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret(&res.x(), _mm_addsub_pd(t1, preverse(t2))); -#else - pstoret(&res.x(), padd(t1, pxor(mask,preverse(t2)))); -#endif - - /* - * t1 = ww*zw - yy*xy - * t2 = zz*zw + xx*xy - * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) - */ - t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); - t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); -#else - pstoret(&res.z(), psub(t1, pxor(mask,preverse(t2)))); -#endif - - return res; -} -}; - -template -struct quat_conj -{ - enum { - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& q) - { - Quaternion res; - const __m128d mask0 = _mm_setr_pd(-0.,-0.); - const __m128d mask2 = _mm_setr_pd(-0.,0.); - pstoret(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet::Alignment>(0))); - pstoret(&res.z(), 
_mm_xor_pd(mask2, q.coeffs().template packet<traits<Quaternion<double> >::Alignment>(2)));
-    return res;
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GEOMETRY_SSE_H
diff --git a/lib/eigen_3.3.9/Eigen/src/LU/arch/Inverse_SSE.h b/lib/eigen_3.3.9/Eigen/src/LU/arch/Inverse_SSE.h
deleted file mode 100644
index 4dce2ef20ee..00000000000
--- a/lib/eigen_3.3.9/Eigen/src/LU/arch/Inverse_SSE.h
+++ /dev/null
@@ -1,338 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2001 Intel Corporation
-// Copyright (C) 2010 Gael Guennebaud
-// Copyright (C) 2009 Benoit Jacob
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// The SSE code for the 4x4 float and double matrix inverse in this file
-// comes from the following Intel library:
-// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
-//
-// Here is the respective copyright and license statement:
-//
-// Copyright (c) 2001 Intel Corporation.
-//
-// Permission is granted to use, copy, distribute and prepare derivative works
-// of this library for any purpose and without fee, provided that the above
-// copyright notice and this statement appear in all copies.
-// Intel makes no representations about the suitability of this software for
-// any purpose, and specifically disclaims all warranties.
-// See LEGAL.TXT for all the legal information.
-
-#ifndef EIGEN_INVERSE_SSE_H
-#define EIGEN_INVERSE_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
-{
-  enum {
-    MatrixAlignment    = traits<MatrixType>::Alignment,
-    ResultAlignment    = traits<ResultType>::Alignment,
-    StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
-  };
-  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
-
-  static void run(const MatrixType& mat, ResultType& result)
-  {
-    ActualMatrixType matrix(mat);
-    const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000));
-
-    // Load the full matrix into registers
-    __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
-    __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
-    __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
-    __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
-
-    // The inverse is calculated using the "divide and conquer" technique. The
-    // original matrix is divided into four 2x2 sub-matrices. Since each
-    // register holds four matrix elements, the smaller matrices are
-    // represented as registers. Hence we get a better locality of the
-    // calculations.
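// --- [Editorial aside, not part of the Eigen sources] --------------------
// The identities this kernel evaluates, writing X# for the adjugate of a
// 2x2 block (so that X X# = |X| I), exactly as the step comments below state:
//
//   M = | A  B |      det(M) = |A||D| + |B||C| - tr((A# B)(D# C))
//       | C  D |
//
//   iA = |D| A - B (D# C)        iB = |B| C - D (B# A)
//   iC = |C| B - A (C# D)        iD = |A| D - C (A# B)
//
// M^{-1} is then assembled from the sign-adjusted, transposed iX blocks,
// each scaled by 1/det(M).
// --------------------------------------------------------------------------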
- - __m128 A, B, C, D; // the four sub-matrices - if(!StorageOrdersMatch) - { - A = _mm_unpacklo_ps(_L1, _L2); - B = _mm_unpacklo_ps(_L3, _L4); - C = _mm_unpackhi_ps(_L1, _L2); - D = _mm_unpackhi_ps(_L3, _L4); - } - else - { - A = _mm_movelh_ps(_L1, _L2); - B = _mm_movehl_ps(_L2, _L1); - C = _mm_movelh_ps(_L3, _L4); - D = _mm_movehl_ps(_L4, _L3); - } - - __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices - DC, AB; - __m128 dA, dB, dC, dD; // determinant of the sub-matrices - __m128 det, d, d1, d2; - __m128 rd; // reciprocal of the determinant - - // AB = A# * B - AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B); - AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E))); - // DC = D# * C - DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C); - DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E))); - - // dA = |A| - dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A); - dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA)); - // dB = |B| - dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B); - dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB)); - - // dC = |C| - dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C); - dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC)); - // dD = |D| - dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D); - dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD)); - - // d = trace(AB*DC) = trace(A#*B*D#*C) - d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB); - - // iD = C*A#*B - iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB)); - iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB))); - // iA = B*D#*C - iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC)); - iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC))); - - // d = trace(AB*DC) = trace(A#*B*D#*C) [continue] - d = _mm_add_ps(d, _mm_movehl_ps(d, d)); - d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1)); - d1 = _mm_mul_ss(dA,dD); - d2 = _mm_mul_ss(dB,dC); - - // iD = D*|A| - C*A#*B - iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD); - - // iA = A*|D| - B*D#*C; - iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_sub_ss(_mm_add_ss(d1,d2),d); - rd = _mm_div_ss(_mm_set_ss(1.0f), det); - -// #ifdef ZERO_SINGULAR -// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd); -// #endif - - // iB = D * (A#B)# = D*B#*A - iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33)); - iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66))); - // iC = A * (D#C)# = A*C#*D - iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33)); - iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); - - rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, p4f_sign_PNNP); - - // iB = C*|B| - D*B#*A - iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); - - // iC = B*|C| - A*C#*D; - iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC); - - // iX = iX / det - iA = _mm_mul_ps(rd,iA); - iB = _mm_mul_ps(rd,iB); - iC = _mm_mul_ps(rd,iC); - iD = _mm_mul_ps(rd,iD); - - Index res_stride = result.outerStride(); - float* res = result.data(); - pstoret(res+0, _mm_shuffle_ps(iA,iB,0x77)); - pstoret(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); - pstoret(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); - pstoret(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); - } - -}; - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == 
(ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register of the matrix holds two elements, the smaller matrices are - // consisted of two registers. Hence we get a better locality of the - // calculations. - - // the four sub-matrices - __m128d A1, A2, B1, B2, C1, C2, D1, D2; - - if(StorageOrdersMatch) - { - A1 = matrix.template packet( 0); B1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); B2 = matrix.template packet( 6); - C1 = matrix.template packet( 8); D1 = matrix.template packet(10); - C2 = matrix.template packet(12); D2 = matrix.template packet(14); - } - else - { - __m128d tmp; - A1 = matrix.template packet( 0); C1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); C2 = matrix.template packet( 6); - tmp = A1; - A1 = _mm_unpacklo_pd(A1,A2); - A2 = _mm_unpackhi_pd(tmp,A2); - tmp = C1; - C1 = _mm_unpacklo_pd(C1,C2); - C2 = _mm_unpackhi_pd(tmp,C2); - - B1 = matrix.template packet( 8); D1 = matrix.template packet(10); - B2 = matrix.template packet(12); D2 = matrix.template packet(14); - tmp = B1; - B1 = _mm_unpacklo_pd(B1,B2); - B2 = _mm_unpackhi_pd(tmp,B2); - tmp = D1; - D1 = _mm_unpacklo_pd(D1,D2); - D2 = _mm_unpackhi_pd(tmp,D2); - } - - __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices - DC1, DC2, AB1, AB2; - __m128d dA, dB, dC, dD; // determinant of the sub-matrices - __m128d det, d1, d2, rd; - - // dA = |A| - dA = _mm_shuffle_pd(A2, A2, 1); - dA = _mm_mul_pd(A1, dA); - dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3)); - // dB = |B| - dB = _mm_shuffle_pd(B2, B2, 1); - dB = _mm_mul_pd(B1, dB); - dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3)); - - // AB = A# * B - AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3)); - AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0)); - AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3))); - AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0))); - - // dC = |C| - dC = _mm_shuffle_pd(C2, C2, 1); - dC = _mm_mul_pd(C1, dC); - dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3)); - // dD = |D| - dD = _mm_shuffle_pd(D2, D2, 1); - dD = _mm_mul_pd(D1, dD); - dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3)); - - // DC = D# * C - DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3)); - DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0)); - DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3))); - DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0))); - - // rd = trace(AB*DC) = trace(A#*B*D#*C) - d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0)); - d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3)); - rd = _mm_add_pd(d1, d2); - rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3)); - - // iD = C*A#*B - iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0)); - iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0)); - iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3))); - iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3))); - - // iA = B*D#*C - iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0)); - iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0)); - iA1 = _mm_add_pd(iA1, 
_mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3))); - iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3))); - - // iD = D*|A| - C*A#*B - dA = _mm_shuffle_pd(dA,dA,0); - iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1); - iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2); - - // iA = A*|D| - B*D#*C; - dD = _mm_shuffle_pd(dD,dD,0); - iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1); - iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2); - - d1 = _mm_mul_sd(dA, dD); - d2 = _mm_mul_sd(dB, dC); - - // iB = D * (A#B)# = D*B#*A - iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1)); - iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1)); - iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2))); - iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2))); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_add_sd(d1, d2); - det = _mm_sub_sd(det, rd); - - // iC = A * (D#C)# = A*C#*D - iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1)); - iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1)); - iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2))); - iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2))); - - rd = _mm_div_sd(_mm_set_sd(1.0), det); -// #ifdef ZERO_SINGULAR -// rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd); -// #endif - rd = _mm_shuffle_pd(rd,rd,0); - - // iB = C*|B| - D*B#*A - dB = _mm_shuffle_pd(dB,dB,0); - iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1); - iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2); - - d1 = _mm_xor_pd(rd, _Sign_PN); - d2 = _mm_xor_pd(rd, _Sign_NP); - - // iC = B*|C| - A*C#*D; - dC = _mm_shuffle_pd(dC,dC,0); - iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); - iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - - Index res_stride = result.outerStride(); - double* res = result.data(); - pstoret(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); - pstoret(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - pstoret(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); - pstoret(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - pstoret(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); - pstoret(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - pstoret(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); - pstoret(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_INVERSE_SSE_H diff --git a/lib/eigen_3.3.9/Eigen/src/plugins/BlockMethods.h b/lib/eigen_3.3.9/Eigen/src/plugins/BlockMethods.h deleted file mode 100644 index ac35a0086cf..00000000000 --- a/lib/eigen_3.3.9/Eigen/src/plugins/BlockMethods.h +++ /dev/null @@ -1,1058 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2010 Gael Guennebaud -// Copyright (C) 2006-2010 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
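// --- [Editorial aside, not part of the Eigen sources] --------------------
// BlockMethods.h, deleted below, is the plugin that gives every dense Eigen
// expression its block API (block(), the corner helpers, row/column and
// segment accessors). For orientation, a typical use of the accessors
// documented below:
#include <Eigen/Dense>
#include <iostream>
int main()
{
  Eigen::Matrix4f m = Eigen::Matrix4f::Random();
  std::cout << m.block(1, 0, 2, 3)    << "\n\n"; // 2x3 block at row 1, col 0
  std::cout << m.topRightCorner(2, 2) << "\n";   // same as m.block(0, 2, 2, 2)
}
// --------------------------------------------------------------------------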
- -#ifndef EIGEN_PARSED_BY_DOXYGEN - -/// \internal expression type of a column */ -typedef Block::RowsAtCompileTime, 1, !IsRowMajor> ColXpr; -typedef const Block::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr; -/// \internal expression type of a row */ -typedef Block::ColsAtCompileTime, IsRowMajor> RowXpr; -typedef const Block::ColsAtCompileTime, IsRowMajor> ConstRowXpr; -/// \internal expression type of a block of whole columns */ -typedef Block::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr; -typedef const Block::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr; -/// \internal expression type of a block of whole rows */ -typedef Block::ColsAtCompileTime, IsRowMajor> RowsBlockXpr; -typedef const Block::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr; -/// \internal expression type of a block of whole columns */ -template struct NColsBlockXpr { typedef Block::RowsAtCompileTime, N, !IsRowMajor> Type; }; -template struct ConstNColsBlockXpr { typedef const Block::RowsAtCompileTime, N, !IsRowMajor> Type; }; -/// \internal expression type of a block of whole rows */ -template struct NRowsBlockXpr { typedef Block::ColsAtCompileTime, IsRowMajor> Type; }; -template struct ConstNRowsBlockXpr { typedef const Block::ColsAtCompileTime, IsRowMajor> Type; }; -/// \internal expression of a block */ -typedef Block BlockXpr; -typedef const Block ConstBlockXpr; -/// \internal expression of a block of fixed sizes */ -template struct FixedBlockXpr { typedef Block Type; }; -template struct ConstFixedBlockXpr { typedef Block Type; }; - -typedef VectorBlock SegmentReturnType; -typedef const VectorBlock ConstSegmentReturnType; -template struct FixedSegmentReturnType { typedef VectorBlock Type; }; -template struct ConstFixedSegmentReturnType { typedef const VectorBlock Type; }; - -#endif // not EIGEN_PARSED_BY_DOXYGEN - -/// \returns a dynamic-size expression of a block in *this. -/// -/// \param startRow the first row in the block -/// \param startCol the first column in the block -/// \param blockRows the number of rows in the block -/// \param blockCols the number of columns in the block -/// -/// Example: \include MatrixBase_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_block_int_int_int_int.out -/// -/// \note Even though the returned expression has dynamic size, in the case -/// when it is applied to a fixed-size matrix, it inherits a fixed maximal size, -/// which means that evaluating it does not cause a dynamic memory allocation. -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index) -/// -EIGEN_DEVICE_FUNC -inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) -{ - return BlockXpr(derived(), startRow, startCol, blockRows, blockCols); -} - -/// This is the const version of block(Index,Index,Index,Index). */ -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const -{ - return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols); -} - - - - -/// \returns a dynamic-size expression of a top-right corner of *this. 
-/// -/// \param cRows the number of rows in the corner -/// \param cCols the number of columns in the corner -/// -/// Example: \include MatrixBase_topRightCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_topRightCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline BlockXpr topRightCorner(Index cRows, Index cCols) -{ - return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols); -} - -/// This is the const version of topRightCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const -{ - return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols); -} - -/// \returns an expression of a fixed-size top-right corner of *this. -/// -/// \tparam CRows the number of rows in the corner -/// \tparam CCols the number of columns in the corner -/// -/// Example: \include MatrixBase_template_int_int_topRightCorner.cpp -/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topRightCorner() -{ - return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); -} - -/// This is the const version of topRightCorner(). -template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topRightCorner() const -{ - return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); -} - -/// \returns an expression of a top-right corner of *this. -/// -/// \tparam CRows number of rows in corner as specified at compile-time -/// \tparam CCols number of columns in corner as specified at compile-time -/// \param cRows number of rows in corner as specified at run-time -/// \param cCols number of columns in corner as specified at run-time -/// -/// This function is mainly useful for corners where the number of rows is specified at compile-time -/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time -/// information should not contradict. In other words, \a cRows should equal \a CRows unless -/// \a CRows is \a Dynamic, and the same for the number of columns. -/// -/// Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block -/// -template -inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) -{ - return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); -} - -/// This is the const version of topRightCorner(Index, Index). -template -inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const -{ - return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); -} - - - -/// \returns a dynamic-size expression of a top-left corner of *this. 
-/// -/// \param cRows the number of rows in the corner -/// \param cCols the number of columns in the corner -/// -/// Example: \include MatrixBase_topLeftCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline BlockXpr topLeftCorner(Index cRows, Index cCols) -{ - return BlockXpr(derived(), 0, 0, cRows, cCols); -} - -/// This is the const version of topLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const -{ - return ConstBlockXpr(derived(), 0, 0, cRows, cCols); -} - -/// \returns an expression of a fixed-size top-left corner of *this. -/// -/// The template parameters CRows and CCols are the number of rows and columns in the corner. -/// -/// Example: \include MatrixBase_template_int_int_topLeftCorner.cpp -/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topLeftCorner() -{ - return typename FixedBlockXpr::Type(derived(), 0, 0); -} - -/// This is the const version of topLeftCorner(). -template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topLeftCorner() const -{ - return typename ConstFixedBlockXpr::Type(derived(), 0, 0); -} - -/// \returns an expression of a top-left corner of *this. -/// -/// \tparam CRows number of rows in corner as specified at compile-time -/// \tparam CCols number of columns in corner as specified at compile-time -/// \param cRows number of rows in corner as specified at run-time -/// \param cCols number of columns in corner as specified at run-time -/// -/// This function is mainly useful for corners where the number of rows is specified at compile-time -/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time -/// information should not contradict. In other words, \a cRows should equal \a CRows unless -/// \a CRows is \a Dynamic, and the same for the number of columns. -/// -/// Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block -/// -template -inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) -{ - return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); -} - -/// This is the const version of topLeftCorner(Index, Index). -template -inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const -{ - return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); -} - - - -/// \returns a dynamic-size expression of a bottom-right corner of *this. -/// -/// \param cRows the number of rows in the corner -/// \param cCols the number of columns in the corner -/// -/// Example: \include MatrixBase_bottomRightCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomRightCorner(Index cRows, Index cCols) -{ - return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); -} - -/// This is the const version of bottomRightCorner(Index, Index). 
-EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const -{ - return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); -} - -/// \returns an expression of a fixed-size bottom-right corner of *this. -/// -/// The template parameters CRows and CCols are the number of rows and columns in the corner. -/// -/// Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp -/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomRightCorner() -{ - return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); -} - -/// This is the const version of bottomRightCorner(). -template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const -{ - return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); -} - -/// \returns an expression of a bottom-right corner of *this. -/// -/// \tparam CRows number of rows in corner as specified at compile-time -/// \tparam CCols number of columns in corner as specified at compile-time -/// \param cRows number of rows in corner as specified at run-time -/// \param cCols number of columns in corner as specified at run-time -/// -/// This function is mainly useful for corners where the number of rows is specified at compile-time -/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time -/// information should not contradict. In other words, \a cRows should equal \a CRows unless -/// \a CRows is \a Dynamic, and the same for the number of columns. -/// -/// Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block -/// -template -inline typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) -{ - return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); -} - -/// This is the const version of bottomRightCorner(Index, Index). -template -inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const -{ - return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); -} - - - -/// \returns a dynamic-size expression of a bottom-left corner of *this. -/// -/// \param cRows the number of rows in the corner -/// \param cCols the number of columns in the corner -/// -/// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomLeftCorner(Index cRows, Index cCols) -{ - return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols); -} - -/// This is the const version of bottomLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const -{ - return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols); -} - -/// \returns an expression of a fixed-size bottom-left corner of *this. -/// -/// The template parameters CRows and CCols are the number of rows and columns in the corner. 
-/// -/// Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp -/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomLeftCorner() -{ - return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); -} - -/// This is the const version of bottomLeftCorner(). -template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const -{ - return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); -} - -/// \returns an expression of a bottom-left corner of *this. -/// -/// \tparam CRows number of rows in corner as specified at compile-time -/// \tparam CCols number of columns in corner as specified at compile-time -/// \param cRows number of rows in corner as specified at run-time -/// \param cCols number of columns in corner as specified at run-time -/// -/// This function is mainly useful for corners where the number of rows is specified at compile-time -/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time -/// information should not contradict. In other words, \a cRows should equal \a CRows unless -/// \a CRows is \a Dynamic, and the same for the number of columns. -/// -/// Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block -/// -template -inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) -{ - return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); -} - -/// This is the const version of bottomLeftCorner(Index, Index). -template -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const -{ - return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); -} - - - -/// \returns a block consisting of the top rows of *this. -/// -/// \param n the number of rows in the block -/// -/// Example: \include MatrixBase_topRows_int.cpp -/// Output: \verbinclude MatrixBase_topRows_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr topRows(Index n) -{ - return RowsBlockXpr(derived(), 0, 0, n, cols()); -} - -/// This is the const version of topRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr topRows(Index n) const -{ - return ConstRowsBlockXpr(derived(), 0, 0, n, cols()); -} - -/// \returns a block consisting of the top rows of *this. -/// -/// \tparam N the number of rows in the block as specified at compile-time -/// \param n the number of rows in the block as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include MatrixBase_template_int_topRows.cpp -/// Output: \verbinclude MatrixBase_template_int_topRows.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type topRows(Index n = N) -{ - return typename NRowsBlockXpr::Type(derived(), 0, 0, n, cols()); -} - -/// This is the const version of topRows(). 
-template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type topRows(Index n = N) const -{ - return typename ConstNRowsBlockXpr::Type(derived(), 0, 0, n, cols()); -} - - - -/// \returns a block consisting of the bottom rows of *this. -/// -/// \param n the number of rows in the block -/// -/// Example: \include MatrixBase_bottomRows_int.cpp -/// Output: \verbinclude MatrixBase_bottomRows_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr bottomRows(Index n) -{ - return RowsBlockXpr(derived(), rows() - n, 0, n, cols()); -} - -/// This is the const version of bottomRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr bottomRows(Index n) const -{ - return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols()); -} - -/// \returns a block consisting of the bottom rows of *this. -/// -/// \tparam N the number of rows in the block as specified at compile-time -/// \param n the number of rows in the block as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include MatrixBase_template_int_bottomRows.cpp -/// Output: \verbinclude MatrixBase_template_int_bottomRows.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type bottomRows(Index n = N) -{ - return typename NRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); -} - -/// This is the const version of bottomRows(). -template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const -{ - return typename ConstNRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); -} - - - -/// \returns a block consisting of a range of rows of *this. -/// -/// \param startRow the index of the first row in the block -/// \param n the number of rows in the block -/// -/// Example: \include DenseBase_middleRows_int.cpp -/// Output: \verbinclude DenseBase_middleRows_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr middleRows(Index startRow, Index n) -{ - return RowsBlockXpr(derived(), startRow, 0, n, cols()); -} - -/// This is the const version of middleRows(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const -{ - return ConstRowsBlockXpr(derived(), startRow, 0, n, cols()); -} - -/// \returns a block consisting of a range of rows of *this. -/// -/// \tparam N the number of rows in the block as specified at compile-time -/// \param startRow the index of the first row in the block -/// \param n the number of rows in the block as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include DenseBase_template_int_middleRows.cpp -/// Output: \verbinclude DenseBase_template_int_middleRows.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) -{ - return typename NRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); -} - -/// This is the const version of middleRows(). 
-template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const -{ - return typename ConstNRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); -} - - - -/// \returns a block consisting of the left columns of *this. -/// -/// \param n the number of columns in the block -/// -/// Example: \include MatrixBase_leftCols_int.cpp -/// Output: \verbinclude MatrixBase_leftCols_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr leftCols(Index n) -{ - return ColsBlockXpr(derived(), 0, 0, rows(), n); -} - -/// This is the const version of leftCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr leftCols(Index n) const -{ - return ConstColsBlockXpr(derived(), 0, 0, rows(), n); -} - -/// \returns a block consisting of the left columns of *this. -/// -/// \tparam N the number of columns in the block as specified at compile-time -/// \param n the number of columns in the block as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include MatrixBase_template_int_leftCols.cpp -/// Output: \verbinclude MatrixBase_template_int_leftCols.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type leftCols(Index n = N) -{ - return typename NColsBlockXpr::Type(derived(), 0, 0, rows(), n); -} - -/// This is the const version of leftCols(). -template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type leftCols(Index n = N) const -{ - return typename ConstNColsBlockXpr::Type(derived(), 0, 0, rows(), n); -} - - - -/// \returns a block consisting of the right columns of *this. -/// -/// \param n the number of columns in the block -/// -/// Example: \include MatrixBase_rightCols_int.cpp -/// Output: \verbinclude MatrixBase_rightCols_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr rightCols(Index n) -{ - return ColsBlockXpr(derived(), 0, cols() - n, rows(), n); -} - -/// This is the const version of rightCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr rightCols(Index n) const -{ - return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n); -} - -/// \returns a block consisting of the right columns of *this. -/// -/// \tparam N the number of columns in the block as specified at compile-time -/// \param n the number of columns in the block as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include MatrixBase_template_int_rightCols.cpp -/// Output: \verbinclude MatrixBase_template_int_rightCols.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type rightCols(Index n = N) -{ - return typename NColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); -} - -/// This is the const version of rightCols(). 
-template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type rightCols(Index n = N) const -{ - return typename ConstNColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); -} - - - -/// \returns a block consisting of a range of columns of *this. -/// -/// \param startCol the index of the first column in the block -/// \param numCols the number of columns in the block -/// -/// Example: \include DenseBase_middleCols_int.cpp -/// Output: \verbinclude DenseBase_middleCols_int.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr middleCols(Index startCol, Index numCols) -{ - return ColsBlockXpr(derived(), 0, startCol, rows(), numCols); -} - -/// This is the const version of middleCols(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const -{ - return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols); -} - -/// \returns a block consisting of a range of columns of *this. -/// -/// \tparam N the number of columns in the block as specified at compile-time -/// \param startCol the index of the first column in the block -/// \param n the number of columns in the block as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include DenseBase_template_int_middleCols.cpp -/// Output: \verbinclude DenseBase_template_int_middleCols.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) -{ - return typename NColsBlockXpr::Type(derived(), 0, startCol, rows(), n); -} - -/// This is the const version of middleCols(). -template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const -{ - return typename ConstNColsBlockXpr::Type(derived(), 0, startCol, rows(), n); -} - - - -/// \returns a fixed-size expression of a block in *this. -/// -/// The template parameters \a NRows and \a NCols are the number of -/// rows and columns in the block. -/// -/// \param startRow the first row in the block -/// \param startCol the first column in the block -/// -/// Example: \include MatrixBase_block_int_int.cpp -/// Output: \verbinclude MatrixBase_block_int_int.out -/// -/// \note since block is a templated member, the keyword template has to be used -/// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) -{ - return typename FixedBlockXpr::Type(derived(), startRow, startCol); -} - -/// This is the const version of block<>(Index, Index). */ -template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const -{ - return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); -} - -/// \returns an expression of a block in *this. 
-/// -/// \tparam NRows number of rows in block as specified at compile-time -/// \tparam NCols number of columns in block as specified at compile-time -/// \param startRow the first row in the block -/// \param startCol the first column in the block -/// \param blockRows number of rows in block as specified at run-time -/// \param blockCols number of columns in block as specified at run-time -/// -/// This function is mainly useful for blocks where the number of rows is specified at compile-time -/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time -/// information should not contradict. In other words, \a blockRows should equal \a NRows unless -/// \a NRows is \a Dynamic, and the same for the number of columns. -/// -/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp -/// -EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL -/// -/// \sa class Block, block(Index,Index,Index,Index) -/// -template -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, - Index blockRows, Index blockCols) -{ - return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); -} - -/// This is the const version of block<>(Index, Index, Index, Index). -template -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, - Index blockRows, Index blockCols) const -{ - return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); -} - -/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0. -/// -/// Example: \include MatrixBase_col.cpp -/// Output: \verbinclude MatrixBase_col.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) -/** - * \sa row(), class Block */ -EIGEN_DEVICE_FUNC -inline ColXpr col(Index i) -{ - return ColXpr(derived(), i); -} - -/// This is the const version of col(). -EIGEN_DEVICE_FUNC -inline ConstColXpr col(Index i) const -{ - return ConstColXpr(derived(), i); -} - -/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0. -/// -/// Example: \include MatrixBase_row.cpp -/// Output: \verbinclude MatrixBase_row.out -/// -EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) -/** - * \sa col(), class Block */ -EIGEN_DEVICE_FUNC -inline RowXpr row(Index i) -{ - return RowXpr(derived(), i); -} - -/// This is the const version of row(). */ -EIGEN_DEVICE_FUNC -inline ConstRowXpr row(Index i) const -{ - return ConstRowXpr(derived(), i); -} - -/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this. -/// -/// \only_for_vectors -/// -/// \param start the first coefficient in the segment -/// \param n the number of coefficients in the segment -/// -/// Example: \include MatrixBase_segment_int_int.cpp -/// Output: \verbinclude MatrixBase_segment_int_int.out -/// -/// \note Even though the returned expression has dynamic size, in the case -/// when it is applied to a fixed-size vector, it inherits a fixed maximal size, -/// which means that evaluating it does not cause a dynamic memory allocation. -/// -/// \sa class Block, segment(Index) -/// -EIGEN_DEVICE_FUNC -inline SegmentReturnType segment(Index start, Index n) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), start, n); -} - - -/// This is the const version of segment(Index,Index). 
-EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType segment(Index start, Index n) const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), start, n); -} - -/// \returns a dynamic-size expression of the first coefficients of *this. -/// -/// \only_for_vectors -/// -/// \param n the number of coefficients in the segment -/// -/// Example: \include MatrixBase_start_int.cpp -/// Output: \verbinclude MatrixBase_start_int.out -/// -/// \note Even though the returned expression has dynamic size, in the case -/// when it is applied to a fixed-size vector, it inherits a fixed maximal size, -/// which means that evaluating it does not cause a dynamic memory allocation. -/// -/// \sa class Block, block(Index,Index) -/// -EIGEN_DEVICE_FUNC -inline SegmentReturnType head(Index n) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), 0, n); -} - -/// This is the const version of head(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType head(Index n) const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), 0, n); -} - -/// \returns a dynamic-size expression of the last coefficients of *this. -/// -/// \only_for_vectors -/// -/// \param n the number of coefficients in the segment -/// -/// Example: \include MatrixBase_end_int.cpp -/// Output: \verbinclude MatrixBase_end_int.out -/// -/// \note Even though the returned expression has dynamic size, in the case -/// when it is applied to a fixed-size vector, it inherits a fixed maximal size, -/// which means that evaluating it does not cause a dynamic memory allocation. -/// -/// \sa class Block, block(Index,Index) -/// -EIGEN_DEVICE_FUNC -inline SegmentReturnType tail(Index n) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), this->size() - n, n); -} - -/// This is the const version of tail(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType tail(Index n) const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), this->size() - n, n); -} - -/// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this -/// -/// \only_for_vectors -/// -/// \tparam N the number of coefficients in the segment as specified at compile-time -/// \param start the index of the first element in the segment -/// \param n the number of coefficients in the segment as specified at compile-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include MatrixBase_template_int_segment.cpp -/// Output: \verbinclude MatrixBase_template_int_segment.out -/// -/// \sa class Block -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename FixedSegmentReturnType::Type(derived(), start, n); -} - -/// This is the const version of segment(Index). -template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename ConstFixedSegmentReturnType::Type(derived(), start, n); -} - -/// \returns a fixed-size expression of the first coefficients of *this. 
-/// -/// \only_for_vectors -/// -/// \tparam N the number of coefficients in the segment as specified at compile-time -/// \param n the number of coefficients in the segment as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include MatrixBase_template_int_start.cpp -/// Output: \verbinclude MatrixBase_template_int_start.out -/// -/// \sa class Block -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type head(Index n = N) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename FixedSegmentReturnType::Type(derived(), 0, n); -} - -/// This is the const version of head(). -template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename ConstFixedSegmentReturnType::Type(derived(), 0, n); -} - -/// \returns a fixed-size expression of the last coefficients of *this. -/// -/// \only_for_vectors -/// -/// \tparam N the number of coefficients in the segment as specified at compile-time -/// \param n the number of coefficients in the segment as specified at run-time -/// -/// The compile-time and run-time information should not contradict. In other words, -/// \a n should equal \a N unless \a N is \a Dynamic. -/// -/// Example: \include MatrixBase_template_int_end.cpp -/// Output: \verbinclude MatrixBase_template_int_end.out -/// -/// \sa class Block -/// -template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type tail(Index n = N) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename FixedSegmentReturnType::Type(derived(), size() - n); -} - -/// This is the const version of tail. -template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); -} diff --git a/lib/eigen_3.3.9/bench/btl/cmake/FindPackageHandleStandardArgs.cmake b/lib/eigen_3.3.9/bench/btl/cmake/FindPackageHandleStandardArgs.cmake deleted file mode 100644 index 7f122edcddd..00000000000 --- a/lib/eigen_3.3.9/bench/btl/cmake/FindPackageHandleStandardArgs.cmake +++ /dev/null @@ -1,60 +0,0 @@ -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(NAME (DEFAULT_MSG|"Custom failure message") VAR1 ... ) -# -# This macro is intended to be used in FindXXX.cmake modules files. -# It handles the REQUIRED and QUIET argument to FIND_PACKAGE() and -# it also sets the _FOUND variable. -# The package is found if all variables listed are TRUE. -# Example: -# -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibXml2 DEFAULT_MSG LIBXML2_LIBRARIES LIBXML2_INCLUDE_DIR) -# -# LibXml2 is considered to be found, if both LIBXML2_LIBRARIES and -# LIBXML2_INCLUDE_DIR are valid. Then also LIBXML2_FOUND is set to TRUE. -# If it is not found and REQUIRED was used, it fails with FATAL_ERROR, -# independent whether QUIET was used or not. -# -# If it is found, the location is reported using the VAR1 argument, so -# here a message "Found LibXml2: /usr/lib/libxml2.so" will be printed out. -# If the second argument is DEFAULT_MSG, the message in the failure case will -# be "Could NOT find LibXml2", if you don't like this message you can specify -# your own custom failure message there. 
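The deleted doc comments above describe Eigen's block, corner, row/column, and segment accessors. For orientation, a minimal usage sketch of that API (the matrix sizes and printed slices are illustrative, not taken from the deleted examples):

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(4, 4);
  std::cout << m.block(1, 1, 2, 2) << "\n\n";  // dynamic-size 2x2 block at (1,1)
  std::cout << m.block<2, 2>(1, 1) << "\n\n";  // fixed-size variant, no allocation
  std::cout << m.topRows(2) << "\n\n";         // whole-row panel

  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(6, 0.0, 5.0);
  std::cout << v.segment(2, 3).transpose() << "\n";  // coefficients 2, 3, 4
}
```

The fixed-size variants trade run-time flexibility for compile-time sizes, which is what lets evaluation avoid dynamic memory allocation, as the deleted notes point out.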
- -MACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FAIL_MSG _VAR1 ) - - IF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG") - IF (${_NAME}_FIND_REQUIRED) - SET(_FAIL_MESSAGE "Could not find REQUIRED package ${_NAME}") - ELSE (${_NAME}_FIND_REQUIRED) - SET(_FAIL_MESSAGE "Could not find OPTIONAL package ${_NAME}") - ENDIF (${_NAME}_FIND_REQUIRED) - ELSE("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG") - SET(_FAIL_MESSAGE "${_FAIL_MSG}") - ENDIF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG") - - STRING(TOUPPER ${_NAME} _NAME_UPPER) - - SET(${_NAME_UPPER}_FOUND TRUE) - IF(NOT ${_VAR1}) - SET(${_NAME_UPPER}_FOUND FALSE) - ENDIF(NOT ${_VAR1}) - - FOREACH(_CURRENT_VAR ${ARGN}) - IF(NOT ${_CURRENT_VAR}) - SET(${_NAME_UPPER}_FOUND FALSE) - ENDIF(NOT ${_CURRENT_VAR}) - ENDFOREACH(_CURRENT_VAR) - - IF (${_NAME_UPPER}_FOUND) - IF (NOT ${_NAME}_FIND_QUIETLY) - MESSAGE(STATUS "Found ${_NAME}: ${${_VAR1}}") - ENDIF (NOT ${_NAME}_FIND_QUIETLY) - ELSE (${_NAME_UPPER}_FOUND) - IF (${_NAME}_FIND_REQUIRED) - MESSAGE(FATAL_ERROR "${_FAIL_MESSAGE}") - ELSE (${_NAME}_FIND_REQUIRED) - IF (NOT ${_NAME}_FIND_QUIETLY) - MESSAGE(STATUS "${_FAIL_MESSAGE}") - ENDIF (NOT ${_NAME}_FIND_QUIETLY) - ENDIF (${_NAME}_FIND_REQUIRED) - ENDIF (${_NAME_UPPER}_FOUND) -ENDMACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS) diff --git a/lib/eigen_3.3.9/bench/perf_monitoring/gemm/changesets.txt b/lib/eigen_3.3.9/bench/perf_monitoring/gemm/changesets.txt deleted file mode 100644 index af8eb9b8f7b..00000000000 --- a/lib/eigen_3.3.9/bench/perf_monitoring/gemm/changesets.txt +++ /dev/null @@ -1,61 +0,0 @@ -#3.0.1 -#3.1.1 -#3.2.0 -3.2.4 -#5745:37f59e65eb6c -5891:d8652709345d # introduce AVX -#5893:24b4dc92c6d3 # merge -5895:997c2ef9fc8b # introduce FMA -#5904:e1eafd14eaa1 # complex and AVX -5908:f8ee3c721251 # improve packing with ptranspose -#5921:ca808bb456b0 # merge -#5927:8b1001f9e3ac -5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks -#5949:f3488f4e45b2 # merge -#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec -#5992:4a429f5e0483 # merge -before-evaluators -#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products -#6639:c9121c60b5c7 -#6655:06f163b5221f # Properly detect FMA support on ARM -#6677:700e023044e7 # FMA has been wrongly disabled -#6681:11d31dafb0e3 -#6699:5e6e8e10aad1 # merge default to tensors -#6726:ff2d2388e7b9 # merge default to tensors -#6742:0cbd6195e829 # merge default to tensors -#6747:853d2bafeb8f # Generalized the gebp apis -6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation -#6781:9cc5a931b2c6 # generalized gemv -#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product -#6844:039efd86b75c # merge tensor -6845:7333ed40c6ef # change prefetching in gebp -#6856:b5be5e10eb7f # merge index conversion -#6893:c3a64aba7c70 # clean blocking size computation -#6898:6fb31ebe6492 # rotating kernel for ARM -6899:877facace746 # rotating kernel for ARM only -#6904:c250623ae9fa # result_of -6921:915f1b1fc158 # fix prefetching change for ARM -6923:9ff25f6dacc6 # prefetching -6933:52572e60b5d3 # blocking size strategy -6937:c8c042f286b2 # avoid redundant pack_rhs -6981:7e5d6f78da59 # dynamic loop swapping -6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache -6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 
-7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) -7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables -7016:a58d253e8c91 # Polish lookup tables generation -7018:9b27294a8186 # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment -7019:c758b1e2c073 # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now. -7085:627e039fba68 # Bug 986: add support for coefficient-based product with 0 depth. -7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code -7591:09a8e2186610 # 3.3-alpha1 -7650:b0f3c8f43025 # help clang inlining -#8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs) -8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes -8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path -8985:d935df21a082 # Remove the rotating kernel. -8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores. -9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators -9174:d228bc282ac9 # merge -9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955 -9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775 diff --git a/lib/eigen_3.3.9/bench/perf_monitoring/gemm/make_plot.sh b/lib/eigen_3.3.9/bench/perf_monitoring/gemm/make_plot.sh deleted file mode 100755 index cd3214ac915..00000000000 --- a/lib/eigen_3.3.9/bench/perf_monitoring/gemm/make_plot.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# base name of the bench -# it reads $1.out -# and generates $1.pdf -WHAT=$1 -bench=$2 - -header="rev " -while read line -do - if [ ! 
-z '$line' ]; then - header="$header \"$line\"" - fi -done < $bench"_settings.txt" - -echo $header > $WHAT.out.header -cat $WHAT.out >> $WHAT.out.header - - -echo "set title '$WHAT'" > $WHAT.gnuplot -echo "set key autotitle columnhead outside " >> $WHAT.gnuplot -echo "set xtics rotate 1" >> $WHAT.gnuplot - -echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot -echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot - -col=`cat $bench"_settings.txt" | wc -l` -echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot -echo " " >> $WHAT.gnuplot - -gnuplot -persist < $WHAT.gnuplot - -# generate a png file -# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten .$WHAT.png - -# clean -rm $WHAT.out.header $WHAT.gnuplot \ No newline at end of file diff --git a/lib/eigen_3.3.9/bench/tensors/tensor_benchmarks_sycl.cc b/lib/eigen_3.3.9/bench/tensors/tensor_benchmarks_sycl.cc deleted file mode 100644 index 7eca4d96607..00000000000 --- a/lib/eigen_3.3.9/bench/tensors/tensor_benchmarks_sycl.cc +++ /dev/null @@ -1,37 +0,0 @@ -#define EIGEN_USE_SYCL - -#include -#include - -#include "tensor_benchmarks.h" - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; -// Simple functions -template -cl::sycl::queue sycl_queue() { - return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - } - } - }); -} - -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::queue q = sycl_queue(); \ - Eigen::SyclDevice device(q); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ - BENCHMARK_RANGE(BM_##FUNC, 10, 5000); - -BM_FuncGPU(broadcasting); -BM_FuncGPU(coeffWiseOp); diff --git a/lib/eigen_3.3.9/cmake/FindAdolc.cmake b/lib/eigen_3.3.9/cmake/FindAdolc.cmake deleted file mode 100644 index 937e549904e..00000000000 --- a/lib/eigen_3.3.9/cmake/FindAdolc.cmake +++ /dev/null @@ -1,20 +0,0 @@ - -if (ADOLC_INCLUDES AND ADOLC_LIBRARIES) - set(ADOLC_FIND_QUIETLY TRUE) -endif (ADOLC_INCLUDES AND ADOLC_LIBRARIES) - -find_path(ADOLC_INCLUDES - NAMES - adolc/adtl.h - PATHS - $ENV{ADOLCDIR} - ${INCLUDE_INSTALL_DIR} -) - -find_library(ADOLC_LIBRARIES adolc PATHS $ENV{ADOLCDIR} ${LIB_INSTALL_DIR}) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(ADOLC DEFAULT_MSG - ADOLC_INCLUDES ADOLC_LIBRARIES) - -mark_as_advanced(ADOLC_INCLUDES ADOLC_LIBRARIES) diff --git a/lib/eigen_3.3.9/cmake/FindComputeCpp.cmake b/lib/eigen_3.3.9/cmake/FindComputeCpp.cmake deleted file mode 100644 index 07ebed61b9b..00000000000 --- a/lib/eigen_3.3.9/cmake/FindComputeCpp.cmake +++ /dev/null @@ -1,245 +0,0 @@ -#.rst: -# FindComputeCpp -#--------------- -# -# Copyright 2016 Codeplay Software Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use these files except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -######################### -# FindComputeCpp.cmake -######################### -# -# Tools for finding and building with ComputeCpp. -# -# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp -# installation. -# -# Latest version of this file can be found at: -# https://github.com/codeplaysoftware/computecpp-sdk - -# Require CMake version 3.2.2 or higher -cmake_minimum_required(VERSION 3.2.2) - -# Check that a supported host compiler can be found -if(CMAKE_COMPILER_IS_GNUCXX) - # Require at least gcc 4.8 - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) - message(FATAL_ERROR - "host compiler - Not found! (gcc version must be at least 4.8)") - # Require the GCC dual ABI to be disabled for 5.1 or higher - elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1) - set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True") - message(STATUS - "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)") - else() - message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") - endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - # Require at least clang 3.6 - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6) - message(FATAL_ERROR - "host compiler - Not found! (clang version must be at least 3.6)") - else() - message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}") - endif() -else() - message(WARNING - "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)") -endif() - -set(COMPUTECPP_64_BIT_DEFAULT ON) -option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode" - ${COMPUTECPP_64_BIT_DEFAULT}) -mark_as_advanced(COMPUTECPP_64_BIT_CODE) - -# Find OpenCL package -find_package(OpenCL REQUIRED) - -# Find ComputeCpp packagee -if(NOT COMPUTECPP_PACKAGE_ROOT_DIR) - message(FATAL_ERROR - "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR") -else() - message(STATUS "ComputeCpp package - Found") -endif() -option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package") - -# Obtain the path to compute++ -find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_DEVICE_COMPILER}) - mark_as_advanced(COMPUTECPP_DEVICE_COMPILER) - message(STATUS "compute++ - Found") -else() - message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})") -endif() - -# Obtain the path to computecpp_info -find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_INFO_TOOL}) - mark_as_advanced(${COMPUTECPP_INFO_TOOL}) - message(STATUS "computecpp_info - Found") -else() - message(FATAL_ERROR "computecpp_info - Not found! 
(${COMPUTECPP_INFO_TOOL})") -endif() - -# Obtain the path to the ComputeCpp runtime library -find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR} - HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib - DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH) - -if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY}) - mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY) - message(STATUS "libComputeCpp.so - Found") -else() - message(FATAL_ERROR "libComputeCpp.so - Not found!") -endif() - -# Obtain the ComputeCpp include directory -set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/) -if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY}) - message(FATAL_ERROR "ComputeCpp includes - Not found!") -else() - message(STATUS "ComputeCpp includes - Found") -endif() - -# Obtain the package version -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version" - OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "Package version - Error obtaining version!") -else() - mark_as_advanced(COMPUTECPP_PACKAGE_VERSION) - message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}") -endif() - -# Obtain the device compiler flags -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags" - OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!") -else() - mark_as_advanced(COMPUTECPP_COMPILER_FLAGS) - message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") -endif() - -set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) - -# Check if the platform is supported -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" - OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "platform - Error checking platform support!") -else() - mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) - if (COMPUTECPP_PLATFORM_IS_SUPPORTED) - message(STATUS "platform - your system can support ComputeCpp") - else() - message(STATUS "platform - your system CANNOT support ComputeCpp") - endif() -endif() - -#################### -# __build_sycl -#################### -# -# Adds a custom target for running compute++ and adding a dependency for the -# resulting integration header. -# -# targetName : Name of the target. -# sourceFile : Source file to be compiled. -# binaryDir : Intermediate directory to output the integration header. -# -function(__build_spir targetName sourceFile binaryDir) - - # Retrieve source file name. - get_filename_component(sourceFileName ${sourceFile} NAME) - - # Set the path to the Sycl file. 
- set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) - - # Add any user-defined include to the device compiler - get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY - INCLUDE_DIRECTORIES) - set(device_compiler_includes "") - foreach(directory ${includeDirectories}) - set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) - endforeach() - if (CMAKE_INCLUDE_PATH) - foreach(directory ${CMAKE_INCLUDE_PATH}) - set(device_compiler_includes "-I${directory}" - ${device_compiler_includes}) - endforeach() - endif() - - # Convert argument list format - separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS) - - # Add custom command for running compute++ - add_custom_command( - OUTPUT ${outputSyclFile} - COMMAND ${COMPUTECPP_DEVICE_COMPILER} - ${COMPUTECPP_DEVICE_COMPILER_FLAGS} - -isystem ${COMPUTECPP_INCLUDE_DIRECTORY} - ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS} - ${device_compiler_includes} - -o ${outputSyclFile} - -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} - DEPENDS ${sourceFile} - WORKING_DIRECTORY ${binaryDir} - COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") - - # Add a custom target for the generated integration header - add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile}) - - # Add a dependency on the integration header - add_dependencies(${targetName} ${targetName}_integration_header) - - # Set the host compiler C++ standard to C++11 - set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11) - - # Disable GCC dual ABI on GCC 5.1 and higher - if(COMPUTECPP_DISABLE_GCC_DUAL_ABI) - set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS - "_GLIBCXX_USE_CXX11_ABI=0") - endif() - -endfunction() - -####################### -# add_sycl_to_target -####################### -# -# Adds a SYCL compilation custom command associated with an existing -# target and sets a dependancy on that new command. -# -# targetName : Name of the target to add a SYCL to. -# sourceFile : Source file to be compiled for SYCL. -# binaryDir : Intermediate directory to output the integration header. 
-# -function(add_sycl_to_target targetName sourceFile binaryDir) - - # Add custom target to run compute++ and generate the integration header - __build_spir(${targetName} ${sourceFile} ${binaryDir}) - - # Link with the ComputeCpp runtime library - target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} - PUBLIC ${OpenCL_LIBRARIES}) - -endfunction(add_sycl_to_target) diff --git a/lib/eigen_3.3.9/cmake/RegexUtils.cmake b/lib/eigen_3.3.9/cmake/RegexUtils.cmake deleted file mode 100644 index b59dfc340f5..00000000000 --- a/lib/eigen_3.3.9/cmake/RegexUtils.cmake +++ /dev/null @@ -1,19 +0,0 @@ -function(escape_string_as_regex _str_out _str_in) - STRING(REGEX REPLACE "\\\\" "\\\\\\\\" FILETEST2 "${_str_in}") - STRING(REGEX REPLACE "([.$+*?|-])" "\\\\\\1" FILETEST2 "${FILETEST2}") - STRING(REGEX REPLACE "\\^" "\\\\^" FILETEST2 "${FILETEST2}") - STRING(REGEX REPLACE "\\(" "\\\\(" FILETEST2 "${FILETEST2}") - STRING(REGEX REPLACE "\\)" "\\\\)" FILETEST2 "${FILETEST2}") - STRING(REGEX REPLACE "\\[" "\\\\[" FILETEST2 "${FILETEST2}") - STRING(REGEX REPLACE "\\]" "\\\\]" FILETEST2 "${FILETEST2}") - SET(${_str_out} "${FILETEST2}" PARENT_SCOPE) -endfunction() - -function(test_escape_string_as_regex) - SET(test1 "\\.^$-+*()[]?|") - escape_string_as_regex(test2 "${test1}") - SET(testRef "\\\\\\.\\^\\$\\-\\+\\*\\(\\)\\[\\]\\?\\|") - if(NOT test2 STREQUAL testRef) - message("Error in the escape_string_for_regex function : \n ${test1} was escaped as ${test2}, should be ${testRef}") - endif(NOT test2 STREQUAL testRef) -endfunction() \ No newline at end of file diff --git a/lib/eigen_3.3.9/cmake/language_support.cmake b/lib/eigen_3.3.9/cmake/language_support.cmake deleted file mode 100644 index ddba509459f..00000000000 --- a/lib/eigen_3.3.9/cmake/language_support.cmake +++ /dev/null @@ -1,67 +0,0 @@ -# cmake/modules/language_support.cmake -# -# Temporary additional general language support is contained within this -# file. - -# This additional function definition is needed to provide a workaround for -# CMake bug 9220. - -# On debian testing (cmake 2.6.2), I get return code zero when calling -# cmake the first time, but cmake crashes when running a second time -# as follows: -# -# -- The Fortran compiler identification is unknown -# CMake Error at /usr/share/cmake-2.6/Modules/CMakeFortranInformation.cmake:7 (GET_FILENAME_COMPONENT): -# get_filename_component called with incorrect number of arguments -# Call Stack (most recent call first): -# CMakeLists.txt:3 (enable_language) -# -# My workaround is to invoke cmake twice. If both return codes are zero, -# it is safe to invoke ENABLE_LANGUAGE(Fortran OPTIONAL) - -function(workaround_9220 language language_works) - #message("DEBUG: language = ${language}") - set(text - "project(test NONE) - cmake_minimum_required(VERSION 2.8.0) - set (CMAKE_Fortran_FLAGS \"${CMAKE_Fortran_FLAGS}\") - set (CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS}\") - enable_language(${language}) - ") - file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/language_tests/${language}) - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}) - file(WRITE ${CMAKE_BINARY_DIR}/language_tests/${language}/CMakeLists.txt - ${text}) - execute_process( - COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}" - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language} - RESULT_VARIABLE return_code - OUTPUT_QUIET - ERROR_QUIET - ) - - if(return_code EQUAL 0) - # Second run - execute_process ( - COMMAND ${CMAKE_COMMAND} . 
-G "${CMAKE_GENERATOR}" - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language} - RESULT_VARIABLE return_code - OUTPUT_QUIET - ERROR_QUIET - ) - if(return_code EQUAL 0) - set(${language_works} ON PARENT_SCOPE) - else(return_code EQUAL 0) - set(${language_works} OFF PARENT_SCOPE) - endif(return_code EQUAL 0) - else(return_code EQUAL 0) - set(${language_works} OFF PARENT_SCOPE) - endif(return_code EQUAL 0) -endfunction(workaround_9220) - -# Temporary tests of the above function. -#workaround_9220(CXX CXX_language_works) -#message("CXX_language_works = ${CXX_language_works}") -#workaround_9220(CXXp CXXp_language_works) -#message("CXXp_language_works = ${CXXp_language_works}") - diff --git a/lib/eigen_3.3.9/doc/A05_PortingFrom2To3.dox b/lib/eigen_3.3.9/doc/A05_PortingFrom2To3.dox deleted file mode 100644 index 51555f9967e..00000000000 --- a/lib/eigen_3.3.9/doc/A05_PortingFrom2To3.dox +++ /dev/null @@ -1,299 +0,0 @@ -namespace Eigen { - -/** \page Eigen2ToEigen3 Porting from Eigen2 to Eigen3 - -This page lists the most important API changes between Eigen2 and Eigen3, -and gives tips to help porting your application from Eigen2 to Eigen3. - -\eigenAutoToc - -\section CompatibilitySupport Eigen2 compatibility support - -Up to version 3.2 %Eigen provides Eigen2 support modes. These are removed now, because they were barely used anymore and became hard to maintain after internal re-designs. -You can still use them by first porting your code to Eigen 3.2. - -\section Using The USING_PART_OF_NAMESPACE_EIGEN macro - -The USING_PART_OF_NAMESPACE_EIGEN macro has been removed. In Eigen 3, just do: -\code -using namespace Eigen; -\endcode - -\section ComplexDot Dot products over complex numbers - -This is the single trickiest change between Eigen 2 and Eigen 3. It only affects code using \c std::complex numbers as scalar type. - -Eigen 2's dot product was linear in the first variable. Eigen 3's dot product is linear in the second variable. In other words, the Eigen 2 code \code x.dot(y) \endcode is equivalent to the Eigen 3 code \code y.dot(x) \endcode In yet other words, dot products are complex-conjugated in Eigen 3 compared to Eigen 2. The switch to the new convention was commanded by common usage, especially with the notation \f$ x^Ty \f$ for dot products of column-vectors. - -\section VectorBlocks Vector blocks - - - - -
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th></tr>
-<tr><td>\code
-vector.start(length)
-vector.start<length>()
-vector.end(length)
-vector.end<length>()
-\endcode</td><td>\code
-vector.head(length)
-vector.head<length>()
-vector.tail(length)
-vector.tail<length>()
-\endcode</td></tr>
-</table>
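To make the migration concrete, a minimal sketch of the renamed vector-block methods (the six-element vector is illustrative):

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::VectorXf v(6);
  v << 1, 2, 3, 4, 5, 6;
  std::cout << v.head(2).transpose() << "\n";    // first two coefficients: 1 2
  std::cout << v.tail<3>().transpose() << "\n";  // last three, fixed size: 4 5 6
}
```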
-
-
-\section Corners Matrix Corners
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th></tr>
-<tr><td>\code
-matrix.corner(TopLeft,r,c)
-matrix.corner(TopRight,r,c)
-matrix.corner(BottomLeft,r,c)
-matrix.corner(BottomRight,r,c)
-matrix.corner<r,c>(TopLeft)
-matrix.corner<r,c>(TopRight)
-matrix.corner<r,c>(BottomLeft)
-matrix.corner<r,c>(BottomRight)
-\endcode</td><td>\code
-matrix.topLeftCorner(r,c)
-matrix.topRightCorner(r,c)
-matrix.bottomLeftCorner(r,c)
-matrix.bottomRightCorner(r,c)
-matrix.topLeftCorner<r,c>()
-matrix.topRightCorner<r,c>()
-matrix.bottomLeftCorner<r,c>()
-matrix.bottomRightCorner<r,c>()
-\endcode</td></tr>
-</table>
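A minimal sketch of the renamed corner accessors (the 4x4 integer matrix is illustrative):

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix4i m = Eigen::Matrix4i::Random();
  std::cout << m.topLeftCorner(2, 2) << "\n\n";      // run-time sizes
  std::cout << m.bottomRightCorner<2, 2>() << "\n";  // compile-time sizes
}
```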
- -Notice that Eigen3 also provides these new convenience methods: topRows(), bottomRows(), leftCols(), rightCols(). See in class DenseBase. - -\section CoefficientWiseOperations Coefficient wise operations - -In Eigen2, coefficient wise operations which have no proper mathematical definition (as a coefficient wise product) -were achieved using the .cwise() prefix, e.g.: -\code a.cwise() * b \endcode -In Eigen3 this .cwise() prefix has been superseded by a new kind of matrix type called -Array for which all operations are performed coefficient wise. You can easily view a matrix as an array and vice versa using -the MatrixBase::array() and ArrayBase::matrix() functions respectively. Here is an example: -\code -Vector4f a, b, c; -c = a.array() * b.array(); -\endcode -Note that the .array() function is not at all a synonym of the deprecated .cwise() prefix. -While the .cwise() prefix changed the behavior of the following operator, the array() function performs -a permanent conversion to the array world. Therefore, for binary operations such as the coefficient wise product, -both sides must be converted to an \em array as in the above example. On the other hand, when you -concatenate multiple coefficient wise operations you only have to do the conversion once, e.g.: -\code -Vector4f a, b, c; -c = a.array().abs().pow(3) * b.array().abs().sin(); -\endcode -With Eigen2 you would have written: -\code -c = (a.cwise().abs().cwise().pow(3)).cwise() * (b.cwise().abs().cwise().sin()); -\endcode - -\section PartAndExtract Triangular and self-adjoint matrices - -In Eigen 2 you had to play with the part, extract, and marked functions to deal with triangular and selfadjoint matrices. In Eigen 3, all these functions have been removed in favor of the concept of \em views: - - - - - - - - - - - - - - - -
Eigen 2 calls on the left, with the Eigen 3 replacement in comments:
\code
A.part<UpperTriangular>();                // Eigen 3: A.triangularView<Upper>()
A.part<StrictlyLowerTriangular>();        // Eigen 3: A.triangularView<StrictlyLower>()
A.extract<UpperTriangular>();             // Eigen 3: A.triangularView<Upper>()
A.extract<UnitLowerTriangular>();         // Eigen 3: A.triangularView<UnitLower>()
A.marked<UpperTriangular>();              // Eigen 3: A.triangularView<Upper>()
A.marked<UnitLowerTriangular>();          // Eigen 3: A.triangularView<UnitLower>()
A.part<SelfAdjoint|UpperTriangular>();    // Eigen 3: A.selfadjointView<Upper>()
A.extract<SelfAdjoint|LowerTriangular>(); // Eigen 3: A.selfadjointView<Lower>()
\endcode
The triangular-mode enumerators were renamed accordingly:
\code
UpperTriangular         // Eigen 3: Upper
LowerTriangular         // Eigen 3: Lower
UnitUpperTriangular     // Eigen 3: UnitUpper
UnitLowerTriangular     // Eigen 3: UnitLower
StrictlyUpperTriangular // Eigen 3: StrictlyUpper
StrictlyLowerTriangular // Eigen 3: StrictlyLower
\endcode
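A minimal usage sketch of the view-based API (our example, with an arbitrary 3x3 matrix):
\code
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  Matrix3f A = Matrix3f::Random();
  Matrix3f U = A.triangularView<Upper>();  // dense copy of the upper triangular part
  Matrix3f S = A.selfadjointView<Lower>(); // symmetric matrix assembled from the lower part
  return 0;
}
\endcode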
\sa class TriangularView, class SelfAdjointView

\section TriangularSolveInPlace Triangular in-place solving

Eigen 2:
\code
A.triangularSolveInPlace<XxxTriangular>(Y);
\endcode
Eigen 3:
\code
A.triangularView<Xxx>().solveInPlace(Y);
\endcode
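For instance (a sketch of ours, assuming an invertible upper triangular part):
\code
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  Matrix3f A = Matrix3f::Identity();
  Vector3f b = Vector3f::Ones();
  // Solves triangular_part(A) * x = b in place; b is overwritten with x.
  A.triangularView<Upper>().solveInPlace(b);
  return 0;
}
\endcode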
\section Decompositions Matrix decompositions

Some of Eigen 2's matrix decompositions have been renamed in Eigen 3, while some others have been removed and are replaced by other decompositions in Eigen 3.

\li \c LU is now \c FullPivLU. See also the new PartialPivLU, it's much faster.
\li \c QR is now \c HouseholderQR. See also the new ColPivHouseholderQR, it's more reliable.
\li \c SVD is now \c JacobiSVD. We currently don't have a bidiagonalizing SVD; of course this is planned.
\li \c EigenSolver and friends moved to a separate module: \code #include <Eigen/Eigenvalues> \endcode
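A small sketch (ours) of the renamed decompositions in Eigen 3:
\code
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXd A = MatrixXd::Random(4, 4);
  FullPivLU<MatrixXd> lu(A);                               // formerly LU
  HouseholderQR<MatrixXd> qr(A);                           // formerly QR
  JacobiSVD<MatrixXd> svd(A, ComputeThinU | ComputeThinV); // formerly SVD
  EigenSolver<MatrixXd> eig(A);                            // now in the Eigenvalues module
  return 0;
}
\endcode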
\section LinearSolvers Linear solvers

\li Eigen 2: \code A.lu(); \endcode Eigen 3: \code A.fullPivLu(); \endcode Now A.lu() returns a PartialPivLU.
\li Eigen 2: \code A.lu().solve(B,&X); \endcode Eigen 3: \code
X = A.lu().solve(B);
X = A.fullPivLu().solve(B);
\endcode The solution returned by value is fully optimized.
\li Eigen 2: \code A.llt().solve(B,&X); \endcode Eigen 3: \code
X = A.llt().solve(B);
X = A.selfadjointView<Lower>().llt().solve(B);
X = A.selfadjointView<Upper>().llt().solve(B);
\endcode The solution returned by value is fully optimized, and the selfadjointView API allows you to select the triangular part to work on (default is the lower part).
\li Eigen 2: \code A.llt().solveInPlace(B); \endcode Eigen 3: \code
B = A.llt().solve(B);
B = A.selfadjointView<Lower>().llt().solve(B);
B = A.selfadjointView<Upper>().llt().solve(B);
\endcode In-place solving.
\li Eigen 2: \code A.ldlt().solve(B,&X); \endcode Eigen 3: \code
X = A.ldlt().solve(B);
X = A.selfadjointView<Lower>().ldlt().solve(B);
X = A.selfadjointView<Upper>().ldlt().solve(B);
\endcode The solution returned by value is fully optimized, and the selfadjointView API allows you to select the triangular part to work on.
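Putting it together (our sketch; the matrix is made symmetric positive definite so that an LLT factorization applies):
\code
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXd A = MatrixXd::Random(3, 3);
  MatrixXd S = A * A.transpose() + 3.0 * MatrixXd::Identity(3, 3);
  VectorXd b = VectorXd::Random(3);
  // Solve S x = b with a Cholesky factorization of the lower triangular part.
  VectorXd x = S.selfadjointView<Lower>().llt().solve(b);
  return 0;
}
\endcode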
\section GeometryModule Changes in the Geometry module

The Geometry module is the one that changed the most. If you rely heavily on it, it's probably a good idea to use the "Eigen 2 support modes" to perform your migration.

\section Transform The Transform class

In Eigen 2, the Transform class didn't really know whether it was a projective or affine transformation. In Eigen 3, it takes a new \a Mode template parameter, which indicates whether it's a \a Projective or an \a Affine transform. There is no default value.

The Transform3f (etc) typedefs are no more. In Eigen 3, the Transform typedefs explicitly refer to the \a Projective and \a Affine modes:

\li Eigen 2: \c Transform3f. Eigen 3: \c Affine3f or \c Projective3f. Of course, 3f is just an example here.
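For example (our sketch of the new typedefs):
\code
#include <Eigen/Geometry>
using namespace Eigen;

int main()
{
  Affine3f T = Affine3f::Identity();
  T.translate(Vector3f(1, 2, 3))
   .rotate(AngleAxisf(0.5f, Vector3f::UnitZ()));
  Vector3f p = T * Vector3f::Ones(); // apply the affine transform to a point
  return 0;
}
\endcode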
\section LazyVsNoalias Lazy evaluation and noalias

In Eigen all operations are performed in a lazy fashion, except the matrix products, which are always evaluated into a temporary by default.
In Eigen2, lazy evaluation could be enforced by tagging a product using the .lazy() function. However, in complex expressions it was not
easy to determine where to put the lazy() function. In Eigen3, the lazy() feature has been superseded by the MatrixBase::noalias() function,
which can be used on the left hand side of an assignment when no aliasing can occur. Here is an example:
\code
MatrixXf a, b, c;
...
c.noalias() += 2 * a.transpose() * b;
\endcode
However, the noalias mechanism does not cover all the features of the old .lazy(). Indeed, in some extremely rare cases,
it might be useful to explicitly request a lazy product, i.e., a product which will be evaluated one coefficient at a time, on demand,
just like any other expression. To this end you can use the MatrixBase::lazyProduct() function, but we strongly discourage you from
using it unless you are sure of what you are doing, i.e., you have rigorously measured a speed improvement.

\section AlignMacros Alignment-related macros

The EIGEN_ALIGN_128 macro has been renamed to EIGEN_ALIGN16. Don't be surprised, it's just that we switched to counting in bytes ;-)

The \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN \endlink option still exists in Eigen 3, but it has a new cousin: \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN_STATICALLY.\endlink It allows you to get rid of all static alignment issues while keeping the alignment of dynamic-size heap-allocated arrays. Vectorization of statically allocated arrays is still preserved (unless you define \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink =0), at the cost of unaligned memory stores.

\section AlignedMap Aligned Map objects

A common issue with Eigen 2 was that when mapping an array with Map, there was no way to tell Eigen that your array was aligned. There was a ForceAligned option, but it didn't mean that; it was just confusing and has been removed.

New in Eigen3 is the #Aligned option. See the documentation of class Map. Use it like this:
\code
Map<Vector4f, Aligned> myMappedVector(some_aligned_array);
\endcode
There are also related convenience static methods, which actually are the preferred way as they take care of such things as constness:
\code
result = Vector4f::MapAligned(some_aligned_array);
\endcode

\section StdContainers STL Containers

In Eigen2, \#include <Eigen/StdVector> tweaked std::vector to automatically align elements. The problem was that this was quite invasive. In Eigen3, we only override standard behavior if you use Eigen::aligned_allocator as your allocator type. So for example, if you use std::vector<Eigen::Vector4f>, you need to do the following change (note that aligned_allocator is under namespace Eigen):
Eigen 2:
\code std::vector<Eigen::Vector4f> \endcode
Eigen 3:
\code std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > \endcode
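In full (our sketch):
\code
#include <vector>
#include <Eigen/StdVector>

int main()
{
  std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > vec;
  vec.push_back(Eigen::Vector4f::Ones());
  return 0;
}
\endcode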
\section eiPrefix Internal ei_ prefix

In Eigen2, global internal functions and structures were prefixed by \c ei_. In Eigen3, they have all been moved into the more explicit \c internal namespace. So, e.g., \c ei_sqrt(x) now becomes \c internal::sqrt(x). Of course it is not recommended to rely on Eigen's internal features.

*/

}
diff --git a/lib/eigen_3.3.9/doc/FixedSizeVectorizable.dox b/lib/eigen_3.3.9/doc/FixedSizeVectorizable.dox
deleted file mode 100644
index 49e38af7683..00000000000
--- a/lib/eigen_3.3.9/doc/FixedSizeVectorizable.dox
+++ /dev/null
@@ -1,38 +0,0 @@
namespace Eigen {

/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable Eigen objects

The goal of this page is to explain what we mean by "fixed-size vectorizable".

\section FixedSizeVectorizable_summary Executive Summary

An Eigen object is called "fixed-size vectorizable" if it has fixed size and that size is a multiple of 16 bytes.

Examples include:
\li Eigen::Vector2d
\li Eigen::Vector4d
\li Eigen::Vector4f
\li Eigen::Matrix2d
\li Eigen::Matrix2f
\li Eigen::Matrix4d
\li Eigen::Matrix4f
\li Eigen::Affine3d
\li Eigen::Affine3f
\li Eigen::Quaterniond
\li Eigen::Quaternionf

\section FixedSizeVectorizable_explanation Explanation

First, "fixed-size" should be clear: an Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example Matrix3f has fixed size, but MatrixXf doesn't (the opposite of fixed-size is dynamic-size).

The array of coefficients of a fixed-size Eigen object is a plain "static array"; it is not dynamically allocated. For example, the data behind a Matrix4f is just a "float array[16]".

Fixed-size objects are typically very small, which means that we want to handle them with zero runtime overhead -- both in terms of memory usage and of speed.

Now, vectorization (both SSE and AltiVec) works with 128-bit packets. Moreover, for performance reasons, these packets need to have 128-bit alignment.

So it turns out that the only way that fixed-size Eigen objects can be vectorized is if their size is a multiple of 128 bits, or 16 bytes. Eigen will then request 16-byte alignment for these objects, and henceforth rely on these objects being aligned, so no runtime check for alignment is performed.

*/

}
diff --git a/lib/eigen_3.3.9/doc/StlContainers.dox b/lib/eigen_3.3.9/doc/StlContainers.dox
deleted file mode 100644
index e0f8714a95a..00000000000
--- a/lib/eigen_3.3.9/doc/StlContainers.dox
+++ /dev/null
@@ -1,62 +0,0 @@
namespace Eigen {

/** \eigenManualPage TopicStlContainers Using STL Containers with Eigen

\eigenAutoToc

\section StlContainers_summary Executive summary

Using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires taking the following two steps:

\li A 16-byte-aligned allocator must be used. Eigen does provide one ready for use: aligned_allocator.
\li If you want to use the std::vector container, you need to \#include <Eigen/StdVector>.

These issues arise only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member". For other Eigen types, such as Vector3f or MatrixXd, no special care is needed when using STL containers.

\section allocator Using an aligned allocator

STL containers take an optional template parameter, the allocator type.
When using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you need to tell the container to use an allocator that will always allocate memory at 16-byte-aligned locations. Fortunately, Eigen does provide such an allocator: Eigen::aligned_allocator.

For example, instead of
\code
std::map<int, Eigen::Vector4f>
\endcode
you need to use
\code
std::map<int, Eigen::Vector4f, std::less<int>,
         Eigen::aligned_allocator<std::pair<const int, Eigen::Vector4f> > >
\endcode
Note that the third parameter "std::less<int>" is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type.

\section StlContainers_vector The case of std::vector

The situation with std::vector was even worse (explanation below), so we had to specialize it for the Eigen::aligned_allocator type. In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>.

Here is an example:
\code
#include <Eigen/StdVector>
/* ... */
std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> >
\endcode

\subsection vector_spec An alternative - specializing std::vector for Eigen types

As an alternative to the recommended approach described above, you have the option to specialize std::vector for Eigen types requiring alignment.
The advantage is that you won't need to declare std::vector all over with Eigen::aligned_allocator. One drawback, on the other hand, is that
the specialization needs to be defined before all code pieces in which e.g. std::vector<Eigen::Matrix2d> is used. Otherwise, without knowing the specialization,
the compiler will compile that particular instance with the default std::allocator, and your program is most likely to crash.

Here is an example:
\code
#include <Eigen/StdVector>
/* ... */
EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Matrix2d)
std::vector<Eigen::Matrix2d>
\endcode

\b Explanation: The resize() method of std::vector takes a value_type argument (defaulting to value_type()). So with std::vector<Eigen::Vector4f>, some Eigen::Vector4f objects will be passed by value, which discards any alignment modifiers, so an Eigen::Vector4f can be created at an unaligned location. In order to avoid that, the only solution we saw was to specialize std::vector to make it work on a slight modification of, here, Eigen::Vector4f, that is able to deal properly with this situation.

*/

}
diff --git a/lib/eigen_3.3.9/doc/StructHavingEigenMembers.dox b/lib/eigen_3.3.9/doc/StructHavingEigenMembers.dox
deleted file mode 100644
index 7fbed0eb01c..00000000000
--- a/lib/eigen_3.3.9/doc/StructHavingEigenMembers.dox
+++ /dev/null
@@ -1,190 +0,0 @@
namespace Eigen {

/** \eigenManualPage TopicStructHavingEigenMembers Structures Having Eigen Members

\eigenAutoToc

\section StructHavingEigenMembers_summary Executive Summary

If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-byte-aligned pointers. Fortunately, %Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you.

\section StructHavingEigenMembers_what What kind of code needs to be changed?

The kind of code that needs to be changed is this:

\code
class Foo
{
  ...
  Eigen::Vector2d v;
  ...
};

...

Foo *foo = new Foo;
\endcode

In other words: you have a class that has as a member a \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen object", and then you dynamically create an object of that class.

\section StructHavingEigenMembers_how How should such code be modified?
It's very easy: you just need to put an EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro in a public part of your class, like this:

\code
class Foo
{
  ...
  Eigen::Vector2d v;
  ...
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
};

...

Foo *foo = new Foo;
\endcode

This macro makes "new Foo" always return an aligned pointer.

If this approach is too intrusive, see also the \ref StructHavingEigenMembers_othersolutions "other solutions".

\section StructHavingEigenMembers_why Why is this needed?

OK, let's say that your code looks like this:

\code
class Foo
{
  ...
  Eigen::Vector2d v;
  ...
};

...

Foo *foo = new Foo;
\endcode

An Eigen::Vector2d consists of 2 doubles, which is 128 bits, exactly the size of an SSE packet. This makes it possible to use SSE for all sorts of operations on this vector. But SSE instructions (at least the ones that %Eigen uses, which are the fast ones) require 128-bit alignment. Otherwise you get a segmentation fault.

For this reason, Eigen takes care by itself to require 128-bit alignment for Eigen::Vector2d, by doing two things:
\li Eigen requires 128-bit alignment for the Eigen::Vector2d's array (of 2 doubles). With GCC, this is done with a __attribute__ ((aligned(16))).
\li Eigen overloads the "operator new" of Eigen::Vector2d so it will always return 128-bit aligned pointers.

Thus, normally, you don't have to worry about anything: Eigen handles alignment for you...

... except in one case. When you have a class Foo like above, and you dynamically allocate a new Foo as above, then, since Foo doesn't have an aligned "operator new", the returned pointer foo is not necessarily 128-bit aligned.

The alignment attribute of the member v is then relative to the start of the class, foo. If the foo pointer wasn't aligned, then foo->v won't be aligned either!

The solution is to let class Foo have an aligned "operator new", as we showed in the previous section.

\section StructHavingEigenMembers_movetotop Should I then put all the members of Eigen types at the beginning of my class?

That's not required. Since Eigen takes care of declaring 128-bit alignment, all members that need it are automatically 128-bit aligned relative to the class. So code like this works fine:

\code
class Foo
{
  double x;
  Eigen::Vector2d v;
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
};
\endcode

\section StructHavingEigenMembers_dynamicsize What about dynamic-size matrices and vectors?

Dynamic-size matrices and vectors, such as Eigen::VectorXd, allocate their own array of coefficients dynamically, so they take care of requiring absolute alignment automatically. So they don't cause this issue. The issue discussed here is only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable matrices and vectors".

\section StructHavingEigenMembers_bugineigen So is this a bug in Eigen?

No, it's not our bug. It's more like an inherent problem of the C++98 language specification, and seems to be taken care of in the upcoming language revision: see this document.

\section StructHavingEigenMembers_conditional What if I want to do this conditionally (depending on template parameters)?

For this situation, we offer the macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign). It will generate aligned operators like EIGEN_MAKE_ALIGNED_OPERATOR_NEW if NeedsToAlign is true. It will generate operators with the default alignment if NeedsToAlign is false.
Example:

\code
template<int n> class Foo
{
  typedef Eigen::Matrix<float,n,1> Vector;
  enum { NeedsToAlign = (sizeof(Vector)%16)==0 };
  ...
  Vector v;
  ...
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
};

...

Foo<4> *foo4 = new Foo<4>; // foo4 is guaranteed to be 128bit-aligned
Foo<3> *foo3 = new Foo<3>; // foo3 has only the system default alignment guarantee
\endcode


\section StructHavingEigenMembers_othersolutions Other solutions

In case putting the EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro everywhere is too intrusive, there exist at least two other solutions.

\subsection othersolutions1 Disabling alignment

The first is to disable the alignment requirement for the fixed-size members:
\code
class Foo
{
  ...
  Eigen::Matrix<double,2,1,Eigen::DontAlign> v;
  ...
};
\endcode
This has the effect of disabling vectorization when using \c v.
If a function of Foo uses it several times, it is still possible to re-enable vectorization by copying it into an aligned temporary vector:
\code
void Foo::bar()
{
  Eigen::Vector2d av(v);
  // use av instead of v
  ...
  // if av changed, then do:
  v = av;
}
\endcode

\subsection othersolutions2 Private structure

The second consists in storing the fixed-size objects into a private struct which will be dynamically allocated at the construction time of the main object:

\code
struct Foo_d
{
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
  Vector2d v;
  ...
};


struct Foo {
  Foo() { init_d(); }
  ~Foo() { delete d; }
  void bar()
  {
    // use d->v instead of v
    ...
  }
private:
  void init_d() { d = new Foo_d; }
  Foo_d* d;
};
\endcode

The clear advantage here is that the class Foo remains unchanged regarding alignment issues. The drawback is that a heap allocation will be required in any case.

*/

}
diff --git a/lib/eigen_3.3.9/doc/TopicMultithreading.dox b/lib/eigen_3.3.9/doc/TopicMultithreading.dox
deleted file mode 100644
index a2855745b60..00000000000
--- a/lib/eigen_3.3.9/doc/TopicMultithreading.dox
+++ /dev/null
@@ -1,55 +0,0 @@
namespace Eigen {

/** \page TopicMultiThreading Eigen and multi-threading

\section TopicMultiThreading_MakingEigenMT Make Eigen run in parallel

Some of Eigen's algorithms can exploit the multiple cores present in your hardware. To this end, it is enough to enable OpenMP on your compiler, for instance:
 * GCC: \c -fopenmp
 * ICC: \c -openmp
 * MSVC: check the respective option in the build properties.
You can control the number of threads that will be used using either the OpenMP API or Eigen's API using the following priority:
\code
 OMP_NUM_THREADS=n ./my_program
 omp_set_num_threads(n);
 Eigen::setNbThreads(n);
\endcode
Unless setNbThreads has been called, Eigen uses the number of threads specified by OpenMP. You can restore this behavior by calling \code setNbThreads(0); \endcode
You can query the number of threads that will be used with:
\code
n = Eigen::nbThreads( );
\endcode
You can disable Eigen's multi-threading at compile time by defining the EIGEN_DONT_PARALLELIZE preprocessor token.

Currently, the following algorithms can make use of multi-threading:
 - general dense matrix - matrix products
 - PartialPivLU
 - row-major-sparse * dense vector/matrix products
 - ConjugateGradient with \c Lower|Upper as the \c UpLo template parameter.
 - BiCGSTAB with a row-major sparse matrix format.
 - LeastSquaresConjugateGradient

\section TopicMultiThreading_UsingEigenWithMT Using Eigen in a multi-threaded application

If your own application is multithreaded, and multiple threads make calls to Eigen, then you have to initialize Eigen by calling the following routine \b before creating the threads:
\code
#include <Eigen/Core>

int main(int argc, char** argv)
{
  Eigen::initParallel();

  ...
}
\endcode

\note With Eigen 3.3, and a fully C++11 compliant compiler (i.e., thread-safe static local variable initialization), calling \c initParallel() is optional.

\warning Note that all functions generating random matrices are \b neither re-entrant nor thread-safe. Those include DenseBase::Random() and DenseBase::setRandom(), despite a call to Eigen::initParallel(). This is because these functions are based on std::rand, which is not re-entrant. For a thread-safe random generator, we recommend the use of boost::random or the C++11 random feature.

In case your application is parallelized with OpenMP, you might want to disable Eigen's own parallelization as detailed in the previous section.

\warning Using OpenMP with custom scalar types that might throw exceptions can lead to unexpected behaviour in the event of throwing.
*/

}
diff --git a/lib/eigen_3.3.9/doc/TutorialReshapeSlicing.dox b/lib/eigen_3.3.9/doc/TutorialReshapeSlicing.dox
deleted file mode 100644
index 3730a5de6ec..00000000000
--- a/lib/eigen_3.3.9/doc/TutorialReshapeSlicing.dox
+++ /dev/null
@@ -1,65 +0,0 @@
namespace Eigen {

/** \eigenManualPage TutorialReshapeSlicing Reshape and Slicing

%Eigen does not expose convenient methods to take slices or to reshape a matrix yet.
Nonetheless, such features can easily be emulated using the Map class.

\eigenAutoToc

\section TutorialReshape Reshape

A reshape operation consists in modifying the sizes of a matrix while keeping the same coefficients.
Instead of modifying the input matrix itself, which is not possible for compile-time sizes, the approach consists in creating a different \em view on the storage using class Map.
Here is a typical example creating a 1D linear view of a matrix:
Example:
\include Tutorial_ReshapeMat2Vec.cpp
Output:
\verbinclude Tutorial_ReshapeMat2Vec.out
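Since the included listing is not reproduced here, a minimal inline equivalent (ours) looks like:
\code
#include <iostream>
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXf M(2, 3);
  M << 1, 2, 3,
       4, 5, 6;
  // 1D view of M's storage; column-major by default, so it prints 1 4 2 5 3 6.
  Map<VectorXf> v(M.data(), M.size());
  std::cout << v.transpose() << std::endl;
  return 0;
}
\endcode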
Note how the storage order of the input matrix determines the order of the coefficients in the linear view.
Here is another example, reshaping a 2x6 matrix to a 6x2 one:
Example:
\include Tutorial_ReshapeMat2Mat.cpp
Output:
\verbinclude Tutorial_ReshapeMat2Mat.out
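An inline equivalent (ours) of the 2x6 to 6x2 reshape:
\code
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXf M = MatrixXf::Random(2, 6);
  // Reinterpret the same (column-major) storage as a 6x2 matrix; no copy is made.
  Map<MatrixXf> R(M.data(), 6, 2);
  return 0;
}
\endcode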
\section TutorialSlicing Slicing

Slicing consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix.
Again, the class Map makes it easy to mimic this feature.

For instance, one can take every P-th element of a vector:
Example:
\include Tutorial_SlicingVec.cpp
Output:
\verbinclude Tutorial_SlicingVec.out
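A minimal inline version (ours), using a dynamic inner stride:
\code
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  VectorXf v = VectorXf::LinSpaced(12, 0, 11);
  const int P = 3;
  // View of every P-th coefficient of v: v(0), v(3), v(6), v(9).
  Map<VectorXf, 0, InnerStride<> > s(v.data(), v.size() / P, InnerStride<>(P));
  return 0;
}
\endcode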
One can also take one column out of every three, using an adequate outer stride or inner stride depending on the actual storage order:
Example:
\include Tutorial_SlicingCol.cpp
Output:
\verbinclude Tutorial_SlicingCol.out
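A minimal inline version (ours), using a dynamic outer stride on a column-major matrix:
\code
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXf M = MatrixXf::Random(3, 9);
  // View of one column out of every three; consecutive mapped columns are
  // 3 * M.rows() floats apart in memory.
  Map<MatrixXf, 0, OuterStride<> > cols(M.data(), M.rows(), M.cols() / 3,
                                        OuterStride<>(3 * M.rows()));
  return 0;
}
\endcode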
*/

}
diff --git a/lib/eigen_3.3.9/test/cholmod_support.cpp b/lib/eigen_3.3.9/test/cholmod_support.cpp
deleted file mode 100644
index a7eda28f79e..00000000000
--- a/lib/eigen_3.3.9/test/cholmod_support.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2011 Gael Guennebaud
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
#include "sparse_solver.h"

#include <Eigen/CholmodSupport>

template<typename T> void test_cholmod_T()
{
  CholmodDecomposition<SparseMatrix<T>, Lower> g_chol_colmajor_lower; g_chol_colmajor_lower.setMode(CholmodSupernodalLLt);
  CholmodDecomposition<SparseMatrix<T>, Upper> g_chol_colmajor_upper; g_chol_colmajor_upper.setMode(CholmodSupernodalLLt);
  CholmodDecomposition<SparseMatrix<T>, Lower> g_llt_colmajor_lower;  g_llt_colmajor_lower.setMode(CholmodSimplicialLLt);
  CholmodDecomposition<SparseMatrix<T>, Upper> g_llt_colmajor_upper;  g_llt_colmajor_upper.setMode(CholmodSimplicialLLt);
  CholmodDecomposition<SparseMatrix<T>, Lower> g_ldlt_colmajor_lower; g_ldlt_colmajor_lower.setMode(CholmodLDLt);
  CholmodDecomposition<SparseMatrix<T>, Upper> g_ldlt_colmajor_upper; g_ldlt_colmajor_upper.setMode(CholmodLDLt);

  CholmodSupernodalLLT<SparseMatrix<T>, Lower> chol_colmajor_lower;
  CholmodSupernodalLLT<SparseMatrix<T>, Upper> chol_colmajor_upper;
  CholmodSimplicialLLT<SparseMatrix<T>, Lower> llt_colmajor_lower;
  CholmodSimplicialLLT<SparseMatrix<T>, Upper> llt_colmajor_upper;
  CholmodSimplicialLDLT<SparseMatrix<T>, Lower> ldlt_colmajor_lower;
  CholmodSimplicialLDLT<SparseMatrix<T>, Upper> ldlt_colmajor_upper;

  check_sparse_spd_solving(g_chol_colmajor_lower);
  check_sparse_spd_solving(g_chol_colmajor_upper);
  check_sparse_spd_solving(g_llt_colmajor_lower);
  check_sparse_spd_solving(g_llt_colmajor_upper);
  check_sparse_spd_solving(g_ldlt_colmajor_lower);
  check_sparse_spd_solving(g_ldlt_colmajor_upper);

  check_sparse_spd_solving(chol_colmajor_lower);
  check_sparse_spd_solving(chol_colmajor_upper);
  check_sparse_spd_solving(llt_colmajor_lower);
  check_sparse_spd_solving(llt_colmajor_upper);
  check_sparse_spd_solving(ldlt_colmajor_lower);
  check_sparse_spd_solving(ldlt_colmajor_upper);

  check_sparse_spd_determinant(chol_colmajor_lower);
  check_sparse_spd_determinant(chol_colmajor_upper);
  check_sparse_spd_determinant(llt_colmajor_lower);
  check_sparse_spd_determinant(llt_colmajor_upper);
  check_sparse_spd_determinant(ldlt_colmajor_lower);
  check_sparse_spd_determinant(ldlt_colmajor_upper);
}

void test_cholmod_support()
{
  CALL_SUBTEST_1(test_cholmod_T<double>());
  CALL_SUBTEST_2(test_cholmod_T<std::complex<double> >());
}
diff --git a/lib/eigen_3.3.9/test/cuda_basic.cu b/lib/eigen_3.3.9/test/cuda_basic.cu
deleted file mode 100644
index ce66c2c7868..00000000000
--- a/lib/eigen_3.3.9/test/cuda_basic.cu
+++ /dev/null
@@ -1,170 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015-2016 Gael Guennebaud
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
- -// workaround issue between gcc >= 4.7 and cuda 5.5 -#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7) - #undef _GLIBCXX_ATOMIC_BUILTINS - #undef _GLIBCXX_USE_INT128 -#endif - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cuda_basic -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int - -#include -#include -#include "main.h" -#include "cuda_common.h" - -// Check that dense modules can be properly parsed by nvcc -#include - -// struct Foo{ -// EIGEN_DEVICE_FUNC -// void operator()(int i, const float* mats, float* vecs) const { -// using namespace Eigen; -// // Matrix3f M(data); -// // Vector3f x(data+9); -// // Map(data+9) = M.inverse() * x; -// Matrix3f M(mats+i/16); -// Vector3f x(vecs+i*3); -// // using std::min; -// // using std::sqrt; -// Map(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x(); -// //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum(); -// } -// }; - -template -struct coeff_wise { - EIGEN_DEVICE_FUNC - void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const - { - using namespace Eigen; - T x1(in+i); - T x2(in+i+1); - T x3(in+i+2); - Map res(out+i*T::MaxSizeAtCompileTime); - - res.array() += (in[0] * x1 + x2).array() * x3.array(); - } -}; - -template -struct replicate { - EIGEN_DEVICE_FUNC - void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const - { - using namespace Eigen; - T x1(in+i); - int step = x1.size() * 4; - int stride = 3 * step; - - typedef Map > MapType; - MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2); - MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3); - MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3); - } -}; - -template -struct redux { - EIGEN_DEVICE_FUNC - void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const - { - using namespace Eigen; - int N = 10; - T x1(in+i); - out[i*N+0] = x1.minCoeff(); - out[i*N+1] = x1.maxCoeff(); - out[i*N+2] = x1.sum(); - out[i*N+3] = x1.prod(); - out[i*N+4] = x1.matrix().squaredNorm(); - out[i*N+5] = x1.matrix().norm(); - out[i*N+6] = x1.colwise().sum().maxCoeff(); - out[i*N+7] = x1.rowwise().maxCoeff().sum(); - out[i*N+8] = x1.matrix().colwise().squaredNorm().sum(); - } -}; - -template -struct prod_test { - EIGEN_DEVICE_FUNC - void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const - { - using namespace Eigen; - typedef Matrix T3; - T1 x1(in+i); - T2 x2(in+i+1); - Map res(out+i*T3::MaxSizeAtCompileTime); - res += in[i] * x1 * x2; - } -}; - -template -struct diagonal { - EIGEN_DEVICE_FUNC - void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const - { - using namespace Eigen; - T1 x1(in+i); - Map res(out+i*T2::MaxSizeAtCompileTime); - res += x1.diagonal(); - } -}; - -template -struct eigenvalues { - EIGEN_DEVICE_FUNC - void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const - { - using namespace Eigen; - typedef Matrix Vec; - T M(in+i); - Map res(out+i*Vec::MaxSizeAtCompileTime); - T A = M*M.adjoint(); - SelfAdjointEigenSolver eig; - eig.computeDirect(M); - res = eig.eigenvalues(); - } -}; - -void test_cuda_basic() -{ - ei_test_init_cuda(); - - int nthreads = 100; - Eigen::VectorXf in, out; - - #ifndef __CUDA_ARCH__ - int data_size = nthreads * 512; - in.setRandom(data_size); - out.setRandom(data_size); - #endif - - CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise(), nthreads, 
in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(replicate(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(replicate(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(redux(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(redux(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(prod_test(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(prod_test(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); - -} diff --git a/lib/eigen_3.3.9/test/cuda_common.h b/lib/eigen_3.3.9/test/cuda_common.h deleted file mode 100644 index 9737693acfa..00000000000 --- a/lib/eigen_3.3.9/test/cuda_common.h +++ /dev/null @@ -1,101 +0,0 @@ - -#ifndef EIGEN_TEST_CUDA_COMMON_H -#define EIGEN_TEST_CUDA_COMMON_H - -#include -#include -#include -#include - -#ifndef __CUDACC__ -dim3 threadIdx, blockDim, blockIdx; -#endif - -template -void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out) -{ - for(int i=0; i -__global__ -void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) -{ - int i = threadIdx.x + blockIdx.x*blockDim.x; - if(i -void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out) -{ - typename Input::Scalar* d_in; - typename Output::Scalar* d_out; - std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar); - std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); - - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); - - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice); - - // Simple and non-optimal 1D mapping assuming n is not too large - // That's only for unit testing! - dim3 Blocks(128); - dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); - - cudaThreadSynchronize(); - run_on_cuda_meta_kernel<<>>(ker, n, d_in, d_out); - cudaThreadSynchronize(); - - // check inputs have not been modified - cudaMemcpy(const_cast(in.data()), d_in, in_bytes, cudaMemcpyDeviceToHost); - cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); - - cudaFree(d_in); - cudaFree(d_out); -} - - -template -void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out) -{ - Input in_ref, in_cuda; - Output out_ref, out_cuda; - #ifndef __CUDA_ARCH__ - in_ref = in_cuda = in; - out_ref = out_cuda = out; - #endif - run_on_cpu (ker, n, in_ref, out_ref); - run_on_cuda(ker, n, in_cuda, out_cuda); - #ifndef __CUDA_ARCH__ - VERIFY_IS_APPROX(in_ref, in_cuda); - VERIFY_IS_APPROX(out_ref, out_cuda); - #endif -} - - -void ei_test_init_cuda() -{ - int device = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, device); - std::cout << "CUDA device info:\n"; - std::cout << " name: " << deviceProp.name << "\n"; - std::cout << " capability: " << deviceProp.major << "." 
<< deviceProp.minor << "\n"; - std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; - std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"; - std::cout << " warpSize: " << deviceProp.warpSize << "\n"; - std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n"; - std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n"; - std::cout << " clockRate: " << deviceProp.clockRate << "\n"; - std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n"; - std::cout << " computeMode: " << deviceProp.computeMode << "\n"; -} - -#endif // EIGEN_TEST_CUDA_COMMON_H diff --git a/lib/eigen_3.3.9/test/dense_storage.cpp b/lib/eigen_3.3.9/test/dense_storage.cpp deleted file mode 100644 index e63712b1a49..00000000000 --- a/lib/eigen_3.3.9/test/dense_storage.cpp +++ /dev/null @@ -1,76 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Hauke Heibel -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#include "main.h" - -#include - -template -void dense_storage_copy() -{ - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; - - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; - const int size = rows*cols; - DenseStorageType reference(size, rows, cols); - T* raw_reference = reference.data(); - for (int i=0; i(i); - - DenseStorageType copied_reference(reference); - const T* raw_copied_reference = copied_reference.data(); - for (int i=0; i -void dense_storage_assignment() -{ - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; - - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; - const int size = rows*cols; - DenseStorageType reference(size, rows, cols); - T* raw_reference = reference.data(); - for (int i=0; i(i); - - DenseStorageType copied_reference; - copied_reference = reference; - const T* raw_copied_reference = copied_reference.data(); - for (int i=0; i(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); -} diff --git a/lib/eigen_3.3.9/test/exceptions.cpp b/lib/eigen_3.3.9/test/exceptions.cpp deleted file mode 100644 index 015b9fd33f3..00000000000 --- a/lib/eigen_3.3.9/test/exceptions.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2011 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - - -// Various sanity tests with exceptions: -// - no memory leak when a custom scalar type trow an exceptions -// - todo: complete the list of tests! 
- -#define EIGEN_STACK_ALLOCATION_LIMIT 100000000 - -#include "main.h" - -struct my_exception -{ - my_exception() {} - ~my_exception() {} -}; - -class ScalarWithExceptions -{ - public: - ScalarWithExceptions() { init(); } - ScalarWithExceptions(const float& _v) { init(); *v = _v; } - ScalarWithExceptions(const ScalarWithExceptions& other) { init(); *v = *(other.v); } - ~ScalarWithExceptions() { - delete v; - instances--; - } - - void init() { - v = new float; - instances++; - } - - ScalarWithExceptions operator+(const ScalarWithExceptions& other) const - { - countdown--; - if(countdown<=0) - throw my_exception(); - return ScalarWithExceptions(*v+*other.v); - } - - ScalarWithExceptions operator-(const ScalarWithExceptions& other) const - { return ScalarWithExceptions(*v-*other.v); } - - ScalarWithExceptions operator*(const ScalarWithExceptions& other) const - { return ScalarWithExceptions((*v)*(*other.v)); } - - ScalarWithExceptions& operator+=(const ScalarWithExceptions& other) - { *v+=*other.v; return *this; } - ScalarWithExceptions& operator-=(const ScalarWithExceptions& other) - { *v-=*other.v; return *this; } - ScalarWithExceptions& operator=(const ScalarWithExceptions& other) - { *v = *(other.v); return *this; } - - bool operator==(const ScalarWithExceptions& other) const - { return *v==*other.v; } - bool operator!=(const ScalarWithExceptions& other) const - { return *v!=*other.v; } - - float* v; - static int instances; - static int countdown; -}; - -ScalarWithExceptions real(const ScalarWithExceptions &x) { return x; } -ScalarWithExceptions imag(const ScalarWithExceptions & ) { return 0; } -ScalarWithExceptions conj(const ScalarWithExceptions &x) { return x; } - -int ScalarWithExceptions::instances = 0; -int ScalarWithExceptions::countdown = 0; - - -#define CHECK_MEMLEAK(OP) { \ - ScalarWithExceptions::countdown = 100; \ - int before = ScalarWithExceptions::instances; \ - bool exception_thrown = false; \ - try { OP; } \ - catch (my_exception) { \ - exception_thrown = true; \ - VERIFY(ScalarWithExceptions::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \ - } \ - VERIFY(exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)); \ - } - -void memoryleak() -{ - typedef Eigen::Matrix VectorType; - typedef Eigen::Matrix MatrixType; - - { - int n = 50; - VectorType v0(n), v1(n); - MatrixType m0(n,n), m1(n,n), m2(n,n); - v0.setOnes(); v1.setOnes(); - m0.setOnes(); m1.setOnes(); m2.setOnes(); - CHECK_MEMLEAK(v0 = m0 * m1 * v1); - CHECK_MEMLEAK(m2 = m0 * m1 * m2); - CHECK_MEMLEAK((v0+v1).dot(v0+v1)); - } - VERIFY(ScalarWithExceptions::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP)); \ -} - -void test_exceptions() -{ - EIGEN_TRY { - CALL_SUBTEST( memoryleak() ); - } EIGEN_CATCH(...) {} -} diff --git a/lib/eigen_3.3.9/test/geo_alignedbox.cpp b/lib/eigen_3.3.9/test/geo_alignedbox.cpp deleted file mode 100644 index 4cf51aafb58..00000000000 --- a/lib/eigen_3.3.9/test/geo_alignedbox.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#include "main.h" -#include -#include -#include - -#include -using namespace std; - -// TODO not sure if this is actually still necessary anywhere ... 
-template EIGEN_DONT_INLINE -void kill_extra_precision(T& ) { } - - -template void alignedbox(const BoxType& _box) -{ - /* this test covers the following files: - AlignedBox.h - */ - typedef typename BoxType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Matrix VectorType; - - const Index dim = _box.dim(); - - VectorType p0 = VectorType::Random(dim); - VectorType p1 = VectorType::Random(dim); - while( p1 == p0 ){ - p1 = VectorType::Random(dim); } - RealScalar s1 = internal::random(0,1); - - BoxType b0(dim); - BoxType b1(VectorType::Random(dim),VectorType::Random(dim)); - BoxType b2; - - kill_extra_precision(b1); - kill_extra_precision(p0); - kill_extra_precision(p1); - - b0.extend(p0); - b0.extend(p1); - VERIFY(b0.contains(p0*s1+(Scalar(1)-s1)*p1)); - VERIFY(b0.contains(b0.center())); - VERIFY_IS_APPROX(b0.center(),(p0+p1)/Scalar(2)); - - (b2 = b0).extend(b1); - VERIFY(b2.contains(b0)); - VERIFY(b2.contains(b1)); - VERIFY_IS_APPROX(b2.clamp(b0), b0); - - // intersection - BoxType box1(VectorType::Random(dim)); - box1.extend(VectorType::Random(dim)); - BoxType box2(VectorType::Random(dim)); - box2.extend(VectorType::Random(dim)); - - VERIFY(box1.intersects(box2) == !box1.intersection(box2).isEmpty()); - - // alignment -- make sure there is no memory alignment assertion - BoxType *bp0 = new BoxType(dim); - BoxType *bp1 = new BoxType(dim); - bp0->extend(*bp1); - delete bp0; - delete bp1; - - // sampling - for( int i=0; i<10; ++i ) - { - VectorType r = b0.sample(); - VERIFY(b0.contains(r)); - } - -} - - - -template -void alignedboxCastTests(const BoxType& _box) -{ - // casting - typedef typename BoxType::Scalar Scalar; - typedef Matrix VectorType; - - const Index dim = _box.dim(); - - VectorType p0 = VectorType::Random(dim); - VectorType p1 = VectorType::Random(dim); - - BoxType b0(dim); - - b0.extend(p0); - b0.extend(p1); - - const int Dim = BoxType::AmbientDimAtCompileTime; - typedef typename GetDifferentType::type OtherScalar; - AlignedBox hp1f = b0.template cast(); - VERIFY_IS_APPROX(hp1f.template cast(),b0); - AlignedBox hp1d = b0.template cast(); - VERIFY_IS_APPROX(hp1d.template cast(),b0); -} - - -void specificTest1() -{ - Vector2f m; m << -1.0f, -2.0f; - Vector2f M; M << 1.0f, 5.0f; - - typedef AlignedBox2f BoxType; - BoxType box( m, M ); - - Vector2f sides = M-m; - VERIFY_IS_APPROX(sides, box.sizes() ); - VERIFY_IS_APPROX(sides[1], box.sizes()[1] ); - VERIFY_IS_APPROX(sides[1], box.sizes().maxCoeff() ); - VERIFY_IS_APPROX(sides[0], box.sizes().minCoeff() ); - - VERIFY_IS_APPROX( 14.0f, box.volume() ); - VERIFY_IS_APPROX( 53.0f, box.diagonal().squaredNorm() ); - VERIFY_IS_APPROX( std::sqrt( 53.0f ), box.diagonal().norm() ); - - VERIFY_IS_APPROX( m, box.corner( BoxType::BottomLeft ) ); - VERIFY_IS_APPROX( M, box.corner( BoxType::TopRight ) ); - Vector2f bottomRight; bottomRight << M[0], m[1]; - Vector2f topLeft; topLeft << m[0], M[1]; - VERIFY_IS_APPROX( bottomRight, box.corner( BoxType::BottomRight ) ); - VERIFY_IS_APPROX( topLeft, box.corner( BoxType::TopLeft ) ); -} - - -void specificTest2() -{ - Vector3i m; m << -1, -2, 0; - Vector3i M; M << 1, 5, 3; - - typedef AlignedBox3i BoxType; - BoxType box( m, M ); - - Vector3i sides = M-m; - VERIFY_IS_APPROX(sides, box.sizes() ); - VERIFY_IS_APPROX(sides[1], box.sizes()[1] ); - VERIFY_IS_APPROX(sides[1], box.sizes().maxCoeff() ); - VERIFY_IS_APPROX(sides[0], box.sizes().minCoeff() ); - - VERIFY_IS_APPROX( 42, box.volume() ); - VERIFY_IS_APPROX( 62, box.diagonal().squaredNorm() ); - - VERIFY_IS_APPROX( m, 
box.corner( BoxType::BottomLeftFloor ) ); - VERIFY_IS_APPROX( M, box.corner( BoxType::TopRightCeil ) ); - Vector3i bottomRightFloor; bottomRightFloor << M[0], m[1], m[2]; - Vector3i topLeftFloor; topLeftFloor << m[0], M[1], m[2]; - VERIFY_IS_APPROX( bottomRightFloor, box.corner( BoxType::BottomRightFloor ) ); - VERIFY_IS_APPROX( topLeftFloor, box.corner( BoxType::TopLeftFloor ) ); -} - - -void test_geo_alignedbox() -{ - for(int i = 0; i < g_repeat; i++) - { - CALL_SUBTEST_1( alignedbox(AlignedBox2f()) ); - CALL_SUBTEST_2( alignedboxCastTests(AlignedBox2f()) ); - - CALL_SUBTEST_3( alignedbox(AlignedBox3f()) ); - CALL_SUBTEST_4( alignedboxCastTests(AlignedBox3f()) ); - - CALL_SUBTEST_5( alignedbox(AlignedBox4d()) ); - CALL_SUBTEST_6( alignedboxCastTests(AlignedBox4d()) ); - - CALL_SUBTEST_7( alignedbox(AlignedBox1d()) ); - CALL_SUBTEST_8( alignedboxCastTests(AlignedBox1d()) ); - - CALL_SUBTEST_9( alignedbox(AlignedBox1i()) ); - CALL_SUBTEST_10( alignedbox(AlignedBox2i()) ); - CALL_SUBTEST_11( alignedbox(AlignedBox3i()) ); - - CALL_SUBTEST_14( alignedbox(AlignedBox(4)) ); - } - CALL_SUBTEST_12( specificTest1() ); - CALL_SUBTEST_13( specificTest2() ); -} diff --git a/lib/eigen_3.3.9/test/numext.cpp b/lib/eigen_3.3.9/test/numext.cpp deleted file mode 100644 index beba9e911b3..00000000000 --- a/lib/eigen_3.3.9/test/numext.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#include "main.h" - -template -void check_abs() { - typedef typename NumTraits::Real Real; - Real zero(0); - - if(NumTraits::IsSigned) - VERIFY_IS_EQUAL(numext::abs(-T(1)), T(1)); - VERIFY_IS_EQUAL(numext::abs(T(0)), T(0)); - VERIFY_IS_EQUAL(numext::abs(T(1)), T(1)); - - for(int k=0; k(); - if(!internal::is_same::value) - x = x/Real(2); - if(NumTraits::IsSigned) - { - VERIFY_IS_EQUAL(numext::abs(x), numext::abs(-x)); - VERIFY( numext::abs(-x) >= zero ); - } - VERIFY( numext::abs(x) >= zero ); - VERIFY_IS_APPROX( numext::abs2(x), numext::abs2(numext::abs(x)) ); - } -} - -void test_numext() { - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs() ); - - CALL_SUBTEST( check_abs >() ); - CALL_SUBTEST( check_abs >() ); -} diff --git a/lib/eigen_3.3.9/test/packetmath.cpp b/lib/eigen_3.3.9/test/packetmath.cpp deleted file mode 100644 index 74ac435cf47..00000000000 --- a/lib/eigen_3.3.9/test/packetmath.cpp +++ /dev/null @@ -1,636 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud -// Copyright (C) 2006-2008 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#include "main.h" -#include "unsupported/Eigen/SpecialFunctions" - -#if defined __GNUC__ && __GNUC__>=6 - #pragma GCC diagnostic ignored "-Wignored-attributes" -#endif -// using namespace Eigen; - -namespace Eigen { -namespace internal { -template T negate(const T& x) { return -x; } -} -} - -// NOTE: we disbale inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU. -template EIGEN_DONT_INLINE -bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits::Real& refvalue) -{ - return internal::isMuchSmallerThan(a-b, refvalue); -} - -template bool areApproxAbs(const Scalar* a, const Scalar* b, int size, const typename NumTraits::Real& refvalue) -{ - for (int i=0; i >(a,size) << "]" << " != vec: [" << Map >(b,size) << "]\n"; - return false; - } - } - return true; -} - -template bool areApprox(const Scalar* a, const Scalar* b, int size) -{ - for (int i=0; i >(a,size) << "]" << " != vec: [" << Map >(b,size) << "]\n"; - return false; - } - } - return true; -} - -#define CHECK_CWISE1(REFOP, POP) { \ - for (int i=0; i(data1))); \ - VERIFY(areApprox(ref, data2, PacketSize) && #POP); \ -} - -template -struct packet_helper -{ - template - inline Packet load(const T* from) const { return internal::pload(from); } - - template - inline void store(T* to, const Packet& x) const { internal::pstore(to,x); } -}; - -template -struct packet_helper -{ - template - inline T load(const T* from) const { return *from; } - - template - inline void store(T* to, const T& x) const { *to = x; } -}; - -#define CHECK_CWISE1_IF(COND, REFOP, POP) if(COND) { \ - packet_helper h; \ - for (int i=0; i h; \ - for (int i=0; i void packetmath() -{ - using std::abs; - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; - typedef typename NumTraits::Real RealScalar; - - const int max_size = PacketSize > 4 ? 
PacketSize : 4; - const int size = PacketSize*max_size; - EIGEN_ALIGN_MAX Scalar data1[size]; - EIGEN_ALIGN_MAX Scalar data2[size]; - EIGEN_ALIGN_MAX Packet packets[PacketSize*2]; - EIGEN_ALIGN_MAX Scalar ref[size]; - RealScalar refvalue = 0; - for (int i=0; i()/RealScalar(PacketSize); - data2[i] = internal::random()/RealScalar(PacketSize); - refvalue = (std::max)(refvalue,abs(data1[i])); - } - - internal::pstore(data2, internal::pload(data1)); - VERIFY(areApprox(data1, data2, PacketSize) && "aligned load/store"); - - for (int offset=0; offset(data1+offset)); - VERIFY(areApprox(data1+offset, data2, PacketSize) && "internal::ploadu"); - } - - for (int offset=0; offset(data1)); - VERIFY(areApprox(data1, data2+offset, PacketSize) && "internal::pstoreu"); - } - - for (int offset=0; offset(data1); - packets[1] = internal::pload(data1+PacketSize); - if (offset==0) internal::palign<0>(packets[0], packets[1]); - else if (offset==1) internal::palign<1>(packets[0], packets[1]); - else if (offset==2) internal::palign<2>(packets[0], packets[1]); - else if (offset==3) internal::palign<3>(packets[0], packets[1]); - else if (offset==4) internal::palign<4>(packets[0], packets[1]); - else if (offset==5) internal::palign<5>(packets[0], packets[1]); - else if (offset==6) internal::palign<6>(packets[0], packets[1]); - else if (offset==7) internal::palign<7>(packets[0], packets[1]); - else if (offset==8) internal::palign<8>(packets[0], packets[1]); - else if (offset==9) internal::palign<9>(packets[0], packets[1]); - else if (offset==10) internal::palign<10>(packets[0], packets[1]); - else if (offset==11) internal::palign<11>(packets[0], packets[1]); - else if (offset==12) internal::palign<12>(packets[0], packets[1]); - else if (offset==13) internal::palign<13>(packets[0], packets[1]); - else if (offset==14) internal::palign<14>(packets[0], packets[1]); - else if (offset==15) internal::palign<15>(packets[0], packets[1]); - internal::pstore(data2, packets[0]); - - for (int i=0; i::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); - - CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd); - CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub); - CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul); - CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); - - CHECK_CWISE1(internal::negate, internal::pnegate); - CHECK_CWISE1(numext::conj, internal::pconj); - - for(int offset=0;offset<3;++offset) - { - for (int i=0; i(data1[offset])); - VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1"); - } - - { - for (int i=0; i(data1, A0, A1, A2, A3); - internal::pstore(data2+0*PacketSize, A0); - internal::pstore(data2+1*PacketSize, A1); - internal::pstore(data2+2*PacketSize, A2); - internal::pstore(data2+3*PacketSize, A3); - VERIFY(areApprox(ref, data2, 4*PacketSize) && "internal::pbroadcast4"); - } - - { - for (int i=0; i(data1, A0, A1); - internal::pstore(data2+0*PacketSize, A0); - internal::pstore(data2+1*PacketSize, A1); - VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2"); - } - - VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload(data1))) && "internal::pfirst"); - - if(PacketSize>1) - { - for(int offset=0;offset<4;++offset) - { - for(int i=0;i(data1+offset)); - VERIFY(areApprox(ref, data2, PacketSize) && "ploaddup"); - } - } - - if(PacketSize>2) - { - for(int offset=0;offset<4;++offset) - { - for(int i=0;i(data1+offset)); - VERIFY(areApprox(ref, data2, PacketSize) && "ploadquad"); - } - } - - ref[0] = 0; - for (int i=0; 
i(data1)), refvalue) && "internal::predux"); - - { - int newsize = PacketSize>4?PacketSize/2:PacketSize; - for (int i=0; i(data1))); - VERIFY(areApprox(ref, data2, newsize) && "internal::predux_downto4"); - } - - ref[0] = 1; - for (int i=0; i(data1))) && "internal::predux_mul"); - - for (int j=0; j(data1+j*PacketSize); - } - internal::pstore(data2, internal::preduxp(packets)); - VERIFY(areApproxAbs(ref, data2, PacketSize, refvalue) && "internal::preduxp"); - - for (int i=0; i(data1))); - VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse"); - - internal::PacketBlock kernel; - for (int i=0; i(data1+i*PacketSize); - } - ptranspose(kernel); - for (int i=0; i(data1); - Packet elsePacket = internal::pload(data2); - EIGEN_ALIGN_MAX internal::Selector selector; - for (int i = 0; i < PacketSize; ++i) { - selector.select[i] = i; - } - - Packet blend = internal::pblend(selector, thenPacket, elsePacket); - EIGEN_ALIGN_MAX Scalar result[size]; - internal::pstore(result, blend); - for (int i = 0; i < PacketSize; ++i) { - VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); - } - } - - if (PacketTraits::HasBlend) { - // pinsertfirst - for (int i=0; i(); - ref[0] = s; - internal::pstore(data2, internal::pinsertfirst(internal::pload(data1),s)); - VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertfirst"); - } - - if (PacketTraits::HasBlend) { - // pinsertlast - for (int i=0; i(); - ref[PacketSize-1] = s; - internal::pstore(data2, internal::pinsertlast(internal::pload(data1),s)); - VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertlast"); - } -} - -template void packetmath_real() -{ - using std::abs; - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; - - const int size = PacketSize*4; - EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4]; - - for (int i=0; i(-1,1) * std::pow(Scalar(10), internal::random(-3,3)); - data2[i] = internal::random(-1,1) * std::pow(Scalar(10), internal::random(-3,3)); - } - CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin); - CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos); - CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan); - - CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround); - CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); - CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); - - for (int i=0; i(-1,1); - data2[i] = internal::random(-1,1); - } - CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin); - CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos); - - for (int i=0; i(-87,88); - data2[i] = internal::random(-87,88); - } - CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp); - for (int i=0; i(-1,1) * std::pow(Scalar(10), internal::random(-6,6)); - data2[i] = internal::random(-1,1) * std::pow(Scalar(10), internal::random(-6,6)); - } - CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh); - if(PacketTraits::HasExp && PacketTraits::size>=2) - { - data1[0] = std::numeric_limits::quiet_NaN(); - data1[1] = std::numeric_limits::epsilon(); - packet_helper h; - h.store(data2, internal::pexp(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - VERIFY_IS_EQUAL(std::exp(std::numeric_limits::epsilon()), data2[1]); - - data1[0] = -std::numeric_limits::epsilon(); - 
data1[1] = 0; - h.store(data2, internal::pexp(h.load(data1))); - VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::epsilon()), data2[0]); - VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]); - - data1[0] = (std::numeric_limits::min)(); - data1[1] = -(std::numeric_limits::min)(); - h.store(data2, internal::pexp(h.load(data1))); - VERIFY_IS_EQUAL(std::exp((std::numeric_limits::min)()), data2[0]); - VERIFY_IS_EQUAL(std::exp(-(std::numeric_limits::min)()), data2[1]); - - data1[0] = std::numeric_limits::denorm_min(); - data1[1] = -std::numeric_limits::denorm_min(); - h.store(data2, internal::pexp(h.load(data1))); - VERIFY_IS_EQUAL(std::exp(std::numeric_limits::denorm_min()), data2[0]); - VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::denorm_min()), data2[1]); - } - - if (PacketTraits::HasTanh) { - // NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details. - data1[0] = std::numeric_limits::quiet_NaN(); - packet_helper::HasTanh,Packet> h; - h.store(data2, internal::ptanh(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - } - -#if EIGEN_HAS_C99_MATH - { - data1[0] = std::numeric_limits::quiet_NaN(); - packet_helper::HasLGamma,Packet> h; - h.store(data2, internal::plgamma(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - } - { - data1[0] = std::numeric_limits::quiet_NaN(); - packet_helper::HasErf,Packet> h; - h.store(data2, internal::perf(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - } - { - data1[0] = std::numeric_limits::quiet_NaN(); - packet_helper::HasErfc,Packet> h; - h.store(data2, internal::perfc(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - } -#endif // EIGEN_HAS_C99_MATH - - for (int i=0; i(0,1) * std::pow(Scalar(10), internal::random(-6,6)); - data2[i] = internal::random(0,1) * std::pow(Scalar(10), internal::random(-6,6)); - } - - if(internal::random(0,1)<0.1f) - data1[internal::random(0, PacketSize)] = 0; - CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); - CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); -#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L) - CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); - CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); - CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); - CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); -#endif - - if(PacketTraits::HasLog && PacketTraits::size>=2) - { - data1[0] = std::numeric_limits::quiet_NaN(); - data1[1] = std::numeric_limits::epsilon(); - packet_helper h; - h.store(data2, internal::plog(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - VERIFY_IS_EQUAL(std::log(std::numeric_limits::epsilon()), data2[1]); - - data1[0] = -std::numeric_limits::epsilon(); - data1[1] = 0; - h.store(data2, internal::plog(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]); - - data1[0] = (std::numeric_limits::min)(); - data1[1] = -(std::numeric_limits::min)(); - h.store(data2, internal::plog(h.load(data1))); - VERIFY_IS_EQUAL(std::log((std::numeric_limits::min)()), data2[0]); - VERIFY((numext::isnan)(data2[1])); - - data1[0] = std::numeric_limits::denorm_min(); - data1[1] = -std::numeric_limits::denorm_min(); - h.store(data2, internal::plog(h.load(data1))); - // VERIFY_IS_EQUAL(std::log(std::numeric_limits::denorm_min()), data2[0]); - VERIFY((numext::isnan)(data2[1])); - - data1[0] = Scalar(-1.0f); - h.store(data2, internal::plog(h.load(data1))); - 
VERIFY((numext::isnan)(data2[0])); - h.store(data2, internal::psqrt(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - VERIFY((numext::isnan)(data2[1])); - } -} - -template void packetmath_notcomplex() -{ - using std::abs; - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; - - EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4]; - - Array::Map(data1, PacketTraits::size*4).setRandom(); - - ref[0] = data1[0]; - for (int i=0; i(data1))) && "internal::predux_min"); - - VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin); - VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax); - - CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin); - CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax); - CHECK_CWISE1(abs, internal::pabs); - - ref[0] = data1[0]; - for (int i=0; i(data1))) && "internal::predux_max"); - - for (int i=0; i(data1[0])); - VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset"); -} - -template void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) -{ - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; - - internal::conj_if cj0; - internal::conj_if cj1; - internal::conj_helper cj; - internal::conj_helper pcj; - - for(int i=0;i(data1),internal::pload(data2))); - VERIFY(areApprox(ref, pval, PacketSize) && "conj_helper pmul"); - - for(int i=0;i(data1),internal::pload(data2),internal::pload(pval))); - VERIFY(areApprox(ref, pval, PacketSize) && "conj_helper pmadd"); -} - -template void packetmath_complex() -{ - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; - - const int size = PacketSize*4; - EIGEN_ALIGN_MAX Scalar data1[PacketSize*4]; - EIGEN_ALIGN_MAX Scalar data2[PacketSize*4]; - EIGEN_ALIGN_MAX Scalar ref[PacketSize*4]; - EIGEN_ALIGN_MAX Scalar pval[PacketSize*4]; - - for (int i=0; i() * Scalar(1e2); - data2[i] = internal::random() * Scalar(1e2); - } - - test_conj_helper (data1,data2,ref,pval); - test_conj_helper (data1,data2,ref,pval); - test_conj_helper (data1,data2,ref,pval); - test_conj_helper (data1,data2,ref,pval); - - { - for(int i=0;i(data1))); - VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip"); - } -} - -template void packetmath_scatter_gather() -{ - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - typedef typename NumTraits::Real RealScalar; - const int PacketSize = PacketTraits::size; - EIGEN_ALIGN_MAX Scalar data1[PacketSize]; - RealScalar refvalue = 0; - for (int i=0; i()/RealScalar(PacketSize); - } - - int stride = internal::random(1,20); - - EIGEN_ALIGN_MAX Scalar buffer[PacketSize*20]; - memset(buffer, 0, 20*PacketSize*sizeof(Scalar)); - Packet packet = internal::pload(data1); - internal::pscatter(buffer, packet, stride); - - for (int i = 0; i < PacketSize*20; ++i) { - if ((i%stride) == 0 && i()/RealScalar(PacketSize); - } - packet = internal::pgather(buffer, 7); - internal::pstore(data1, packet); - for (int i = 0; i < PacketSize; ++i) { - VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue) && "pgather"); - } -} - -void test_packetmath() -{ - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_1( packetmath() ); - CALL_SUBTEST_2( packetmath() ); - CALL_SUBTEST_3( packetmath() 
); - CALL_SUBTEST_4( packetmath >() ); - CALL_SUBTEST_5( packetmath >() ); - - CALL_SUBTEST_1( packetmath_notcomplex() ); - CALL_SUBTEST_2( packetmath_notcomplex() ); - CALL_SUBTEST_3( packetmath_notcomplex() ); - - CALL_SUBTEST_1( packetmath_real() ); - CALL_SUBTEST_2( packetmath_real() ); - - CALL_SUBTEST_4( packetmath_complex >() ); - CALL_SUBTEST_5( packetmath_complex >() ); - - CALL_SUBTEST_1( packetmath_scatter_gather() ); - CALL_SUBTEST_2( packetmath_scatter_gather() ); - CALL_SUBTEST_3( packetmath_scatter_gather() ); - CALL_SUBTEST_4( packetmath_scatter_gather >() ); - CALL_SUBTEST_5( packetmath_scatter_gather >() ); - } -} diff --git a/lib/eigen_3.3.9/test/unalignedassert.cpp b/lib/eigen_3.3.9/test/unalignedassert.cpp deleted file mode 100644 index 731a08977c6..00000000000 --- a/lib/eigen_3.3.9/test/unalignedassert.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008 Benoit Jacob -// Copyright (C) 2015 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_TEST_PART_1) - // default -#elif defined(EIGEN_TEST_PART_2) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 16 - #define EIGEN_MAX_ALIGN_BYTES 16 -#elif defined(EIGEN_TEST_PART_3) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 32 - #define EIGEN_MAX_ALIGN_BYTES 32 -#elif defined(EIGEN_TEST_PART_4) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 64 - #define EIGEN_MAX_ALIGN_BYTES 64 -#endif - -#include "main.h" - -typedef Matrix Vector6f; -typedef Matrix Vector8f; -typedef Matrix Vector12f; - -typedef Matrix Vector5d; -typedef Matrix Vector6d; -typedef Matrix Vector7d; -typedef Matrix Vector8d; -typedef Matrix Vector9d; -typedef Matrix Vector10d; -typedef Matrix Vector12d; - -struct TestNew1 -{ - MatrixXd m; // good: m will allocate its own array, taking care of alignment. 
- TestNew1() : m(20,20) {} -}; - -struct TestNew2 -{ - Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned, - // 8-byte alignment is good enough here, which we'll get automatically -}; - -struct TestNew3 -{ - Vector2f m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned -}; - -struct TestNew4 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - Vector2d m; - float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects -}; - -struct TestNew5 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - float f; // try the f at first -- the EIGEN_ALIGN_MAX attribute of m should make that still work - Matrix4f m; -}; - -struct TestNew6 -{ - Matrix m; // good: no alignment requested - float f; -}; - -template struct Depends -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(Align) - Vector2d m; - float f; -}; - -template -void check_unalignedassert_good() -{ - T *x, *y; - x = new T; - delete x; - y = new T[2]; - delete[] y; -} - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -template -void construct_at_boundary(int boundary) -{ - char buf[sizeof(T)+256]; - size_t _buf = reinterpret_cast(buf); - _buf += (EIGEN_MAX_ALIGN_BYTES - (_buf % EIGEN_MAX_ALIGN_BYTES)); // make 16/32/...-byte aligned - _buf += boundary; // make exact boundary-aligned - T *x = ::new(reinterpret_cast(_buf)) T; - x[0].setZero(); // just in order to silence warnings - x->~T(); -} -#endif - -void unalignedassert() -{ -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - construct_at_boundary(4); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); -#endif - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good >(); - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - if(EIGEN_MAX_ALIGN_BYTES>=16) - { - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - // Complexes are disabled because the compiler might aggressively vectorize - // the initialization of complex coeffs to 0 before we can check for alignedness - //VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - } - for(int b=8; b(b)); - if(b<64) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) 
VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - //if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - } -#endif -} - -void test_unalignedassert() -{ - CALL_SUBTEST(unalignedassert()); -} diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h deleted file mode 100644 index 4cfe300eb42..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ /dev/null @@ -1,392 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H -#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H - -namespace Eigen { - -/** \class TensorBroadcasting - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor broadcasting class. - * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorBroadcastingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorBroadcastingOp type; -}; - -template -struct is_input_scalar { - static const bool value = false; -}; -template <> -struct is_input_scalar > { - static const bool value = true; -}; -#ifndef EIGEN_EMULATE_CXX11_META_H -template -struct is_input_scalar > { - static const bool value = (Sizes::total_size == 1); -}; -#endif - -} // end namespace internal - - - -template -class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) - : m_xpr(expr), m_broadcast(broadcast) {} - - EIGEN_DEVICE_FUNC - const Broadcast& broadcast() const { return m_broadcast; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Broadcast m_broadcast; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorBroadcastingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions InputDimensions; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const 
int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
-  enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_broadcast(op.broadcast()), m_impl(op.expression(), device)
-  {
-    // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
-    // and store the result in a scalar. Instead one should reshape the scalar into an N-D
-    // tensor with N >= 1 of 1 element first and then broadcast.
-    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    const InputDimensions& input_dims = m_impl.dimensions();
-    const Broadcast& broadcast = op.broadcast();
-    for (int i = 0; i < NumDims; ++i) {
-      eigen_assert(input_dims[i] > 0);
-      m_dimensions[i] = input_dims[i] * broadcast[i];
-    }
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      m_inputStrides[0] = 1;
-      m_outputStrides[0] = 1;
-      for (int i = 1; i < NumDims; ++i) {
-        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
-        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
-      }
-    } else {
-      m_inputStrides[NumDims-1] = 1;
-      m_outputStrides[NumDims-1] = 1;
-      for (int i = NumDims-2; i >= 0; --i) {
-        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
-        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
-      }
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    return true;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_impl.cleanup();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
-  {
-    if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
-      return m_impl.coeff(0);
-    }
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return coeffColMajor(index);
-    } else {
-      return coeffRowMajor(index);
-    }
-  }
-
-  // TODO: attempt to speed this up.
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - inputIndex += index; - } else { - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[0]); - } - } - return m_impl.coeff(inputIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const - { - Index inputIndex = 0; - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); - inputIndex += index; - } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); - } - } - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const - { - if (internal::is_input_scalar::type>::value) { - return internal::pset1(m_impl.coeff(0)); - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); - } else { - return packetRowMajor(index); - } - } - - // Ignore the LoadMode and always use unaligned loads since we can't guarantee - // the alignment at compile time. 
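// Illustrative sketch (not part of the original header): the div/mod index
// mapping performed by coeffColMajor() above, and reused by the packet paths
// below, can be reproduced in isolation. All names here are hypothetical;
// this assumes col-major strides and N >= 1 dimensions.
#if 0
#include <array>
#include <cstddef>

// Map a linear output index to the linear input index it reads from.
template <std::size_t N>
std::size_t broadcast_input_index(std::size_t index,
                                  const std::array<std::size_t, N>& in_dims,
                                  const std::array<std::size_t, N>& out_strides,
                                  const std::array<std::size_t, N>& in_strides) {
  std::size_t input_index = 0;
  for (std::size_t i = N - 1; i > 0; --i) {
    const std::size_t idx = index / out_strides[i];     // coordinate in output dim i
    input_index += (idx % in_dims[i]) * in_strides[i];  // wrap into the input extent
    index -= idx * out_strides[i];                      // strip this dimension off
  }
  return input_index + (index % in_dims[0]);            // innermost dimension
}
#endif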
- template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - innermostLoc = index; - } else { - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[0]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. - if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { - return m_impl.template packet(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffColMajor(originalIndex+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); - innermostLoc = index; - } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[NumDims-1]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. 
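// The branch below is the fast path: when the whole packet lies within one
// copy of the input's innermost dimension, a single packet load from the
// input suffices. Otherwise the packet would straddle a broadcast boundary,
// so the coefficients are gathered one at a time via coeffRowMajor() and
// repacked with pload.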
- if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { - return m_impl.template packet(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffRowMajor(originalIndex+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - double compute_cost = TensorOpCost::AddCost(); - if (NumDims > 0) { - for (int i = NumDims - 1; i > 0; --i) { - compute_cost += TensorOpCost::DivCost(); - if (internal::index_statically_eq(i, 1)) { - compute_cost += - TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } else { - if (!internal::index_statically_eq(i, 1)) { - compute_cost += TensorOpCost::MulCost() + - TensorOpCost::ModCost() + - TensorOpCost::AddCost(); - } - } - compute_cost += - TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } - } - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - const TensorEvaluator& impl() const { return m_impl; } - - Broadcast functor() const { return m_broadcast; } - - protected: - const Broadcast m_broadcast; - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h deleted file mode 100644 index 20b29e5fde9..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ /dev/null @@ -1,628 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H - -namespace Eigen { - -/** \class TensorContraction - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor contraction class. - * - * - */ -namespace internal { - -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename gebp_traits::type, - typename remove_const::type>::ResScalar Scalar; - - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - - // From NumDims below. 
- static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; - static const int Layout = traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorContractionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorContractionOp type; -}; - -template -struct traits, Device_> > { - typedef Indices_ Indices; - typedef LeftArgType_ LeftArgType; - typedef RightArgType_ RightArgType; - typedef Device_ Device; - - // From NumDims below. - static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; -}; - -} // end namespace internal - -template -class TensorContractionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename internal::gebp_traits::ResScalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} - - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const Indices m_indices; -}; - - -template -struct TensorContractionEvaluatorBase -{ - typedef typename internal::traits::Indices Indices; - typedef typename internal::traits::LeftArgType LeftArgType; - typedef typename internal::traits::RightArgType RightArgType; - typedef typename internal::traits::Device Device; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - IsAligned = true, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
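// Why the swap is valid: a RowMajor result is the transpose of the same
// storage read as ColMajor, and (A * B)^T = B^T * A^T, so evaluating B * A
// in ColMajor produces exactly the RowMajor layout of A * B. A hypothetical
// stand-alone illustration (not part of this header):
#if 0
#include <Eigen/Dense>
#include <cassert>

void swap_trick_demo() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(3, 4);
  Eigen::MatrixXf B = Eigen::MatrixXf::Random(4, 2);
  // RowMajor product of interest.
  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> C = A * B;
  // Swapped ColMajor evaluation.
  Eigen::MatrixXf Ct = B.transpose() * A.transpose();
  // C's raw (row-major) buffer, reinterpreted as col-major, is exactly Ct.
  assert(Ct.isApprox(Eigen::Map<Eigen::MatrixXf>(C.data(), 2, 3)));
}
#endif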
- typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - typedef DSizes Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), - m_device(device), - m_result(NULL) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - - DSizes eval_left_dims; - DSizes eval_right_dims; - array, ContractDims> eval_op_indices; - if (static_cast(Layout) == static_cast(ColMajor)) { - // For ColMajor, we keep using the existing dimensions - for (int i = 0; i < LDims; i++) { - eval_left_dims[i] = m_leftImpl.dimensions()[i]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[i]; - } - // We keep the pairs of contracting indices. - for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = op.indices()[i].first; - eval_op_indices[i].second = op.indices()[i].second; - } - } else { - // For RowMajor, we need to reverse the existing dimensions - for (int i = 0; i < LDims; i++) { - eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; - } - // We need to flip all the pairs of contracting indices as well as - // reversing the dimensions. - for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; - eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; - } - } - - // Check for duplicate axes and make sure the first index in eval_op_indices - // is increasing. 
Using O(n^2) sorting is OK since ContractDims is small - for (int i = 0; i < ContractDims; i++) { - for (int j = i + 1; j < ContractDims; j++) { - eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && - eval_op_indices[j].second != eval_op_indices[i].second && - "contraction axes should be unique"); - if (eval_op_indices[j].first < eval_op_indices[i].first) { - numext::swap(eval_op_indices[j], eval_op_indices[i]); - } - } - } - - array lhs_strides; - lhs_strides[0] = 1; - for (int i = 0; i < LDims-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; - } - - array rhs_strides; - rhs_strides[0] = 1; - for (int i = 0; i < RDims-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; - } - - if (m_i_strides.size() > 0) m_i_strides[0] = 1; - if (m_j_strides.size() > 0) m_j_strides[0] = 1; - if (m_k_strides.size() > 0) m_k_strides[0] = 1; - - m_i_size = 1; - m_j_size = 1; - m_k_size = 1; - - // To compute the dimension, we simply concatenate the non-contracting - // dimensions of the left and then the right tensor. Additionally, we also - // compute the strides corresponding to the left non-contracting - // dimensions and right non-contracting dimensions. - m_lhs_inner_dim_contiguous = true; - int dim_idx = 0; - unsigned int nocontract_idx = 0; - - for (int i = 0; i < LDims; i++) { - // find if we are contracting on index i of left tensor - bool contracting = false; - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].first == i) { - contracting = true; - break; - } - } - if (!contracting) { - // add dimension size to output dimensions - m_dimensions[dim_idx] = eval_left_dims[i]; - m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; - if (dim_idx != i) { - m_lhs_inner_dim_contiguous = false; - } - if (nocontract_idx+1 < internal::array_size::value) { - m_i_strides[nocontract_idx+1] = - m_i_strides[nocontract_idx] * eval_left_dims[i]; - } else { - m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; - } - dim_idx++; - nocontract_idx++; - } - } - - nocontract_idx = 0; - for (int i = 0; i < RDims; i++) { - bool contracting = false; - // find if we are contracting on index i of right tensor - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].second == i) { - contracting = true; - break; - } - } - if (!contracting) { - m_dimensions[dim_idx] = eval_right_dims[i]; - if (nocontract_idx+1 < internal::array_size::value) { - m_j_strides[nocontract_idx+1] = - m_j_strides[nocontract_idx] * eval_right_dims[i]; - } else { - m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; - } - m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; - dim_idx++; - nocontract_idx++; - } - } - - // Now compute the strides corresponding to the contracting dimensions. We - // assumed above that non-contracting axes are represented in the same order - // in the matrix as they are in the tensor. This is not the case for - // contracting axes. As the contracting axes must be of the same size in - // each tensor, we'll only look at the first tensor here. 
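// Worked example of the bookkeeping above (illustrative): contracting
// dimension 1 of a (3, 4) lhs with dimension 0 of a (4, 5) rhs gives
// m_i_size = 3, m_j_size = 5, m_k_size = 4, i.e. a 3x5 GEMM, and the output
// dimensions are the concatenated non-contracting extents (3, 5). At the API
// level this corresponds to the following hypothetical usage sketch:
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void contraction_demo() {
  Eigen::Tensor<float, 2> lhs(3, 4), rhs(4, 5);
  lhs.setRandom();
  rhs.setRandom();
  // Contract lhs dim 1 against rhs dim 0.
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{Eigen::IndexPair<int>(1, 0)}};
  Eigen::Tensor<float, 2> out = lhs.contract(rhs, dims);  // 3 x 5 result
}
#endif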
- m_rhs_inner_dim_contiguous = true; - m_rhs_inner_dim_reordered = false; - for (int i = 0; i < ContractDims; i++) { - Index left = eval_op_indices[i].first; - Index right = eval_op_indices[i].second; - - Index size = eval_left_dims[left]; - eigen_assert(size == eval_right_dims[right] && - "Contraction axes must be same size"); - - if (i+1 < static_cast(internal::array_size::value)) { - m_k_strides[i+1] = m_k_strides[i] * size; - } else { - m_k_size = m_k_strides[i] * size; - } - m_left_contracting_strides[i] = lhs_strides[left]; - m_right_contracting_strides[i] = rhs_strides[right]; - - if (i > 0 && right < eval_op_indices[i-1].second) { - m_rhs_inner_dim_reordered = true; - } - if (right != i) { - m_rhs_inner_dim_contiguous = false; - } - } - - // If the layout is RowMajor, we need to reverse the m_dimensions - if (static_cast(Layout) == static_cast(RowMajor)) { - for (int i = 0, j = NumDims - 1; i < j; i++, j--) { - numext::swap(m_dimensions[i], m_dimensions[j]); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - } - } - - template - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { - const Index rows = m_i_size; - const Index cols = m_k_size; - - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; - const int rhs_alignment = RightEvaluator::IsAligned ? 
Aligned : Unaligned; - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, - m_left_contracting_strides, m_k_strides); - RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, - m_right_contracting_strides, m_k_strides); - - const Scalar alpha(1); - const Index resIncr(1); - - // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) - m_device.memset(buffer, 0, rows * sizeof(Scalar)); - - internal::general_matrix_vector_product::run( - rows, cols, lhs, rhs, - buffer, resIncr, alpha); - } - - template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; - internal::gemm_pack_rhs pack_rhs; - - internal::gebp_kernel gebp; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. 
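// The blocking below follows the classic Goto partitioning: kc x mc LHS
// panels and kc x nc RHS panels are packed and multiplied block by block.
// A scalar, non-packed sketch of the same loop structure (hypothetical
// helper, not the actual kernel):
#if 0
#include <algorithm>

// C (m x n) += A (m x k) * B (k x n), all col-major, blocked for cache.
void blocked_gemm(const float* A, const float* B, float* C,
                  int m, int n, int k, int mc, int nc, int kc) {
  for (int i2 = 0; i2 < m; i2 += mc)
    for (int k2 = 0; k2 < k; k2 += kc)    // Eigen packs A(i2.., k2..) here
      for (int j2 = 0; j2 < n; j2 += nc)  // ... and B(k2.., j2..) here
        for (int j = j2; j < std::min(j2 + nc, n); ++j)
          for (int kk = k2; kk < std::min(k2 + kc, k); ++kk)
            for (int i = i2; i < std::min(i2 + mc, m); ++i)
              C[i + j * m] += A[i + kk * m] * B[kk + j * k];
}
#endif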
- internal::TensorContractionBlocking blocking(k, m, n, 1); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); - - for(Index i2=0; i2m_device.deallocate(blockA); - this->m_device.deallocate(blockB); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } - - protected: - // Prevent assignment - TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); - Dimensions m_dimensions; - - contract_t m_k_strides; - contract_t m_left_contracting_strides; - contract_t m_right_contracting_strides; - - bool m_lhs_inner_dim_contiguous; - bool m_rhs_inner_dim_contiguous; - bool m_rhs_inner_dim_reordered; - - left_nocontract_t m_i_strides; - right_nocontract_t m_j_strides; - left_nocontract_t m_left_nocontract_strides; - right_nocontract_t m_right_nocontract_strides; - - Index m_i_size; - Index m_j_size; - Index m_k_size; - - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; - const Device& m_device; - Scalar* m_result; -}; - - -// evaluator for default device -template -struct TensorEvaluator, Device> : - public TensorContractionEvaluatorBase< - TensorEvaluator, Device> > { - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - // Could we use NumDimensions here? 
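// Note on the evaluator below: evalProduct() dispatches to a single GEMV
// call when the output has only one column (m_j_size == 1), where the full
// packing/blocking machinery would be wasted; everything else goes through
// the blocked evalGemm() path.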
- typedef DSizes<Index, NumDims> Dimensions;
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      Base(op, device) { }
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
-    if (this->m_j_size == 1) {
-      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-      return;
-    }
-
-    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-  }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
deleted file mode 100644
index 5cf7b4f7189..00000000000
--- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
-
-
-namespace Eigen {
-namespace internal {
-
-enum {
-  ShardByRow = 0,
-  ShardByCol = 1
-};
-
-
-// Default Blocking Strategy
-template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType = ShardByCol>
-class TensorContractionBlocking {
- public:
-
-  typedef typename LhsMapper::Scalar LhsScalar;
-  typedef typename RhsMapper::Scalar RhsScalar;
-
-  EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
-      kc_(k), mc_(m), nc_(n)
-  {
-    if (ShardingType == ShardByCol) {
-      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
-    }
-    else {
-      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-
- private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
-};
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
deleted file mode 100644
index c70dea05303..00000000000
--- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ /dev/null
@@ -1,1043 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H - -// evaluator for thread pool device -#ifdef EIGEN_USE_THREADS - -namespace Eigen { - -#ifdef EIGEN_USE_SIMPLE_THREAD_POOL -namespace internal { - -template -struct packLhsArg { - LhsScalar* blockA; - const LhsMapper& lhs; - const Index m_start; - const Index k_start; - const Index mc; - const Index kc; -}; - -template -struct packRhsAndKernelArg { - const MaxSizeVector* blockAs; - RhsScalar* blockB; - const RhsMapper& rhs; - OutputMapper& output; - const Index m; - const Index k; - const Index n; - const Index mc; - const Index kc; - const Index nc; - const Index num_threads; - const Index num_blockAs; - const Index max_m; - const Index k_block_idx; - const Index m_block_idx; - const Index n_block_idx; - const Index m_blocks; - const Index n_blocks; - MaxSizeVector* kernel_notifications; - const MaxSizeVector* lhs_notifications; - const bool need_to_pack; -}; - -} // end namespace internal -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - -template -struct TensorEvaluator, ThreadPoolDevice> : - public TensorContractionEvaluatorBase, ThreadPoolDevice> > { - - typedef ThreadPoolDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
- typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL - template - void evalProduct(Scalar* buffer) const { - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits::size, - lhs_inner_dim_contiguous, false, Unaligned> - LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, - contract_t, internal::packet_traits::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - typedef internal::blas_data_mapper OutputMapper; - typedef internal::gemm_pack_lhs - LhsPacker; - typedef internal::gemm_pack_rhs< - RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> - RhsPacker; - typedef internal::gebp_kernel - GebpKernel; - - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - if (m == 0 || n == 0 || k == 0) return; - - // Compute a set of algorithm parameters: - // - kernel block sizes (bm, bn, bk) - // - task grain sizes (number of kernels executed per task: gm, gn) - // - number of threads - // - sharding by row/column - // - parallel packing or first lhs then rhs - // and some derived parameters: - // - number of tasks (nm, nn, nk) - // - number of kernels (nm0, nn0) - // Unfortunately, all these parameters are tightly interdependent. - // So in some cases we first compute approximate values, then compute other - // values based on these approximations and then refine the approximations. - - // There are lots of heuristics here. There is some reasoning behind them, - // but ultimately they are just tuned on contraction benchmarks for - // different input configurations, thread counts and instruction sets. - // So feel free to question any of them. - - // Compute whether we want to shard by row or by column. - // This is a first approximation, it will be refined later. Since we don't - // know number of threads yet we use 2, because what's we are most - // interested in at this point is whether it makes sense to use - // parallelization at all or not. - bool shard_by_col = shardByCol(m, n, 2); - - // First approximation of kernel blocking sizes. - // Again, we don't know number of threads yet, so we use 2. 
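// The task-count arithmetic used throughout below, in isolation: with
// divup(x, y) == (x + y - 1) / y, m = 1000 and bm = 64 give
// nm0 = divup(1000, 64) = 16 kernels along m; a grain size gm = 4 then
// yields nm = divup(16, 4) = 4 tasks, each running up to 4 consecutive
// kernels against the same packed blocks.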
- Index bm, bn, bk; - if (shard_by_col) { - internal::TensorContractionBlocking - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Compute optimal number of threads. - // Note: we use bk instead of k here because we are interested in amount of - // _parallelizable_ computations, and computations are not parallelizable - // across k dimension. - const TensorOpCost cost = - contractionCost(m, n, bm, bn, bk, shard_by_col, false); - int num_threads = TensorCostModel::numThreads( - static_cast(n) * m, cost, this->m_device.numThreads()); - - // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost - // model is not tuned. Remove this when the cost model is tuned. - if (n == 1) num_threads = 1; - - if (num_threads == 1) { - // The single-threaded algorithm should be faster in this case. - if (n == 1) - this->template evalGemv(buffer); - else - this->template evalGemm(buffer); - return; - } - - // Now that we know number of threads, recalculate sharding and blocking. - shard_by_col = shardByCol(m, n, num_threads); - if (shard_by_col) { - internal::TensorContractionBlocking - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Number of kernels for each dimension. - Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index nk = divup(k, bk); - - // Calculate task grain size (number of kernels executed per task). - // This task size coarsening serves two purposes: - // 1. It reduces per-task overheads including synchronization overheads. - // 2. It allows to use caches better (reuse the same packed rhs in several - // consecutive kernels). - Index gm = 1; - Index gn = 1; - // If we are sharding by column, then we prefer to reduce rows first. - if (shard_by_col) { - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - } else { - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - } - // Number of tasks in each dimension. - Index nm = divup(nm0, gm); - Index nn = divup(nn0, gn); - - // Last by not least, decide whether we want to issue both lhs and rhs - // packing in parallel; or issue lhs packing first, and then issue rhs - // packing when lhs packing completes (for !shard_by_col lhs and rhs are - // swapped). Parallel packing allows more parallelism (for both packing and - // kernels), while sequential packing provides better locality (once - // a thread finishes rhs packing it proceed to kernels with that rhs). - // First, we are interested in parallel packing if there are few tasks. - bool parallel_pack = num_threads >= nm * nn; - // Also do parallel packing if all data fits into L2$. - if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= - l2CacheSize() * num_threads) - parallel_pack = true; - // But don't do it if we will use each rhs only once. Locality seems to be - // more important in this case. - if ((shard_by_col ? 
nm : nn) == 1) parallel_pack = false; - - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, - this->m_i_strides, this->m_left_contracting_strides, - this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, - this->m_j_strides, this->m_right_contracting_strides, - this->m_k_strides); - - Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, - k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, - shard_by_col, parallel_pack) - .run(); - } - - // Context coordinates a single parallel gemm operation. - template - class Context { - public: - Context(const Device& device, int num_threads, LhsMapper& lhs, - RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, - Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack) - : device_(device), - lhs_(lhs), - rhs_(rhs), - buffer_(buffer), - output_(buffer, tm), - num_threads_(num_threads), - shard_by_col_(shard_by_col), - parallel_pack_(parallel_pack), - m_(tm), - n_(tn), - k_(tk), - bm_(bm), - bn_(bn), - bk_(bk), - nm_(nm), - nn_(nn), - nk_(nk), - gm_(gm), - gn_(gn), - nm0_(nm0), - nn0_(nn0) - { - for (Index x = 0; x < P; x++) { - // Normal number of notifications for k slice switch is - // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only - // nm_ + nn_ notifications, because they will not receive notifications - // from preceeding kernels. - state_switch_[x] = - x == 0 - ? 1 - : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + - (x == P - 1 ? nm_ * nn_ : 0); - state_packing_ready_[x] = - parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); - state_kernel_[x] = new std::atomic*[nm_]; - for (Index m = 0; m < nm_; m++) { - state_kernel_[x][m] = new std::atomic[nn_]; - // Kernels generally receive 3 notifications (previous kernel + 2 - // packing), but the first slice won't get notifications from previous - // kernels. - for (Index n = 0; n < nn_; n++) - state_kernel_[x][m][n].store( - (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), - std::memory_order_relaxed); - } - } - - // Allocate memory for packed rhs/lhs matrices. - size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - size_t lhs_size = - divup(bm_ * bk_ * sizeof(LhsScalar), align) * align; - size_t rhs_size = - divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; - packed_mem_ = static_cast(internal::aligned_malloc( - (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); - char* mem = static_cast(packed_mem_); - for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { - packed_lhs_[x].resize(nm0_); - for (Index m = 0; m < nm0_; m++) { - packed_lhs_[x][m] = reinterpret_cast(mem); - mem += lhs_size; - } - packed_rhs_[x].resize(nn0_); - for (Index n = 0; n < nn0_; n++) { - packed_rhs_[x][n] = reinterpret_cast(mem); - mem += rhs_size; - } - } - } - - ~Context() { - for (Index x = 0; x < P; x++) { - for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; - delete[] state_kernel_[x]; - } - internal::aligned_free(packed_mem_); - } - - void run() { - // Kick off packing of the first slice. - signal_switch(0, 1); - // Wait for overall completion. - // TODO(dvyukov): this wait can lead to deadlock. - // If nthreads contractions are concurrently submitted from worker - // threads, this wait will block all worker threads and the system will - // deadlock. 
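// done_ is notified exactly once, when signal_switch() is reached for slice
// nk_ + 1 (i.e. after the kernels of the last k slice complete); until then
// all coordination happens through the atomic countdowns in state_switch_
// and state_kernel_ rather than through this Notification.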
- done_.Wait(); - } - - private: - Notification done_; - const Device& device_; - LhsMapper& lhs_; - RhsMapper& rhs_; - Scalar* const buffer_; - OutputMapper output_; - const int num_threads_; - const bool shard_by_col_; - const bool parallel_pack_; - // Matrix sizes. - const Index m_; - const Index n_; - const Index k_; - // Block sizes. - const Index bm_; - const Index bn_; - const Index bk_; - // Number of tasks. - const Index nm_; - const Index nn_; - const Index nk_; - // Task grain sizes (number of kernels executed per task). - const Index gm_; - const Index gn_; - // Number of blocks (this is different from ni_/nn_ because of task size - // coarsening). - const Index nm0_; - const Index nn0_; - - // Parallelization strategy. - // - // Blocks related to the same k block can run in parallel because they write - // to different output blocks. So we parallelize within k slices, this - // gives us parallelism level of m x n. Before we can start any kernels - // related to k-th slice, we need to issue m lhs packing tasks and n rhs - // packing tasks. - // - // However, there is a bottleneck when we are finishing kernels for k-th - // slice (at the very end there is only 1 runnable kernel). To mitigate this - // bottleneck we allow kernels from k-th and k+1-th slices to run in - // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same - // output block, so they must not run in parallel. - // - // This gives us the following dependency graph. - // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs - // packing tasks. - // Kernel (m, n, k) can start when: - // - kernel (m, n, k-1) has finished - // - lhs packing (m, k) has finished - // - rhs packing (n, k) has finished - // Lhs/rhs packing can start when: - // - all k-1 packing has finished (artificially imposed to limit amount of - // parallel packing) - // - // On top of that we limit runnable tasks to two consecutive k slices. - // This is done to limit amount of memory we need for packed lhs/rhs - // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). - // - // state_switch_ tracks when we are ready to switch to the next k slice. - // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). - // These variable are rolling over 3 consecutive k slices: first two we are - // actively executing + one to track completion of kernels in the second - // slice. - static const Index P = 3; - void* packed_mem_; - std::vector packed_lhs_[P - 1]; - std::vector packed_rhs_[P - 1]; - std::atomic** state_kernel_[P]; - // state_switch_ is frequently modified by worker threads, while other - // fields are read-only after constructor. Let's move it to a separate cache - // line to reduce cache-coherency traffic. - char pad_[128]; - std::atomic state_packing_ready_[P]; - std::atomic state_switch_[P]; - - void pack_lhs(Index m, Index k) { - const Index mend = m * gm_ + gm(m); - for (Index m1 = m * gm_; m1 < mend; m1++) - LhsPacker()(packed_lhs_[k % (P - 1)][m1], - lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); - - if (!parallel_pack_ && shard_by_col_) { - signal_packing(k); - } else { - signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); - } - } - - void pack_rhs(Index n, Index k) { - const Index nend = n * gn_ + gn(n); - for (Index n1 = n * gn_; n1 < nend; n1++) { - if (k == 0) { - // Zero the output memory in parallel. - // On 10000x2x10000 mm zeroing can easily take half of time. - // Zero (bn x m) row. 
Safe to do here because all kernels that will - // write to this memory depend on completion of this task. - // Note: don't call device_.memset() here. device_.memset() blocks on - // thread pool worker thread, which can lead to underutilization and - // deadlocks. - memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); - } - RhsPacker()(packed_rhs_[k % (P - 1)][n1], - rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); - } - - if (parallel_pack_ || shard_by_col_) { - signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); - } else { - signal_packing(k); - } - } - - void kernel(Index m, Index n, Index k) { - // Note: order of iteration matters here. Iteration over m is innermost - // because we want to reuse the same packed rhs in consequetive tasks - // (rhs fits into L2$ while lhs only into L3$). - const Index nend = n * gn_ + gn(n); - const Index mend = m * gm_ + gm(m); - if (shard_by_col_) { - for (Index n1 = n * gn_; n1 < nend; n1++) { - for (Index m1 = m * gm_; m1 < mend; m1++) - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); - } - } else { - for (Index m1 = m * gm_; m1 < mend; m1++) - for (Index n1 = n * gn_; n1 < nend; n1++) { - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); - } - } - signal_kernel(m, n, k + 1, false); - signal_switch(k + 2); - } - - void signal_packing(Index k) { - eigen_assert(!parallel_pack_); - Index s = state_packing_ready_[k % P].fetch_sub(1); - eigen_assert(s > 0); - if (s != 1) return; - state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; - enqueue_packing(k, shard_by_col_); - } - - void signal_kernel(Index m, Index n, Index k, bool sync) { - std::atomic* state = &state_kernel_[k % P][m][n]; - Index s = state->load(); - eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) return; - state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); - if (sync) - kernel(m, n, k); - else - device_.enqueueNoNotification([=]() { kernel(m, n, k); }); - } - - void signal_switch(Index k, Index v = 1) { - Index s = state_switch_[k % P].fetch_sub(v); - eigen_assert(s >= v); - if (s != v) return; - - // Ready to switch to the next k slice. - // Reset counter for the next iteration. - state_switch_[k % P] = - (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + - nm_ * nn_; - if (k < nk_) { - // Issue lhs/rhs packing. Their completion will in turn kick off - // kernels. - if (parallel_pack_) { - enqueue_packing(k, !shard_by_col_); - enqueue_packing(k, shard_by_col_); - } else if (shard_by_col_) { - enqueue_packing(k, false); - } else { - enqueue_packing(k, true); - } - - // Termination handling. - // Because kernel completion signals k + 2 switch, we need to finish nk - // + 2 slices without issuing any tasks on nk + 1 slice. So here we - // pretend that all nk + 1 packing tasks just finish instantly; so that - // nk + 2 switch only waits for completion of nk kernels. - } else if (k == nk_) { - signal_switch(k + 1, - parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); - } else { - done_.Notify(); - } - } - - // Enqueue all rhs/lhs packing for k-th slice. - void enqueue_packing(Index k, bool rhs) { - enqueue_packing_helper(0, rhs ? 
nn_ : nm_, k, rhs);
-    }
-
-    void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
-      if (end - start == 1) {
-        if (rhs)
-          pack_rhs(start, k);
-        else
-          pack_lhs(start, k);
-      } else {
-        Index mid = (start + end) / 2;
-        device_.enqueueNoNotification(
-            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
-        device_.enqueueNoNotification(
-            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
-      }
-    }
-
-    // Block sizes, accounting for a potentially incomplete last block.
-    Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
-    Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
-    Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
-    // Task grain sizes, accounting for a potentially incomplete last task.
-    Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
-    Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
-
-    Context(const Context&) = delete;
-    void operator=(const Context&) = delete;
-  };
-
-  // Decide whether we want to shard the m x n contraction by columns or by rows.
-  static bool shardByCol(Index m, Index n, Index num_threads) {
-    // Note: we are comparing both n and m against Traits::nr; this is not
-    // a mistake. We are trying to figure out how both n and m will fit into
-    // the main sharding dimension.
-
-    // Sharding by column is the default
-    // ... unless there is enough data for vectorization over rows
-    if (m / num_threads >= Traits::nr &&
-        // and not enough data for vectorization over columns
-        (n / num_threads < Traits::nr ||
-         // ... or barely enough data for vectorization over columns,
-         // but it is not evenly divisible across threads
-         (n / num_threads < 4 * Traits::nr &&
-          (n % (num_threads * Traits::nr)) != 0 &&
-          // ... and it is evenly divisible across threads for rows
-          ((m % (num_threads * Traits::nr)) == 0 ||
-           // .. or it is not evenly divisible for either dimension but
-           // there is much more data over rows so that corner effects are
-           // mitigated.
-           (m / n >= 6)))))
-      return false;
-    // Also shard by rows when the matrix is substantially elongated over the
-    // row dimension.
-    if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
-    return true;
-  }
-
-  Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
-                 int num_threads, bool shard_by_col) const {
-    Index gm = 1;
-    Index gm1 = 1;
-    Index nm0 = divup(m, bm);
-    Index nm1 = nm0;
-    for (;;) {
-      // Find the next candidate for the m grain size. It needs to result in a
-      // different number of blocks. E.g. if we have 10 kernels, we want to
-      // try grain sizes 5 and 10, but not 6, 7, 8 and 9.
-      while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
-      if (gm1 > nm0) break;
-      // Check the candidate.
-      int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
-                           shard_by_col);
-      if (res < 0) break;
-      nm1 = divup(nm0, gm1);
-      if (res == 0) continue;
-      // Commit the new grain size.
- gm = gm1; - } - return gm; - } - - Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - int num_threads, bool shard_by_col) const { - Index gn = 1; - Index gn1 = 1; - Index nn0 = divup(n, bn); - Index nn1 = nn0; - for (;;) { - while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; - if (gn1 > nn0) break; - int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, - shard_by_col); - if (res < 0) break; - nn1 = divup(nn0, gn1); - if (res == 0) continue; - gn = gn1; - } - return gn; - } - - // checkGrain checks whether grain (gm, gn) is suitable and is better than - // (oldgm, oldgn). - int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - Index gn, Index oldgm, Index oldgn, int num_threads, - bool shard_by_col) const { - const TensorOpCost cost = - contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); - double taskSize = TensorCostModel::taskSize( - static_cast(bm) * gm * bn * gn, cost); - // If the task is too small, then we agree on it regardless of anything - // else. Otherwise synchronization overheads will dominate. - if (taskSize < 1) return 1; - // If it is too large, then we reject it and all larger tasks. - if (taskSize > 2) return -1; - // Now we are in presumably good task size range. - // The main deciding factor here is parallelism. Consider that we have 12 - // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. - // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 - // of cores will be busy). While grain size 3 gives us 4 tasks, which gives - // us parallelism of 1 (we can load all cores). - Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index new_tasks = divup(nm0, gm) * divup(nn0, gn); - double new_parallelism = static_cast(new_tasks) / - (divup(new_tasks, num_threads) * num_threads); - Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); - double old_parallelism = static_cast(old_tasks) / - (divup(old_tasks, num_threads) * num_threads); - if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; - return 0; - } - -#else // EIGEN_USE_SIMPLE_THREAD_POOL - - template - void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - evalGemm(buffer); - } - - template - void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - - const int lhs_packet_size = internal::unpacket_traits::size; - const int rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // TODO: packing could be faster sometimes if we supported row major tensor mappers - typedef internal::gemm_pack_lhs LhsPacker; - typedef internal::gemm_pack_rhs RhsPacker; - - // TODO: replace false, false with conjugate values? 
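    // For orientation (an informal sketch of the contract, not a new API):
    // the gebp ("general block-panel") kernel instantiated below computes
    //   C_block += alpha * A_packed * B_packed
    // where A_packed is an (mc x kc) packed lhs block, B_packed a (kc x nc)
    // packed rhs panel, and alpha is Scalar(1) at the call sites in this
    // file.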
- typedef internal::gebp_kernel GebpKernel; - - typedef internal::packLhsArg packLArg; - typedef internal::packRhsAndKernelArg packRKArg; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // compute block sizes (which depend on number of threads) - const Index num_threads = this->m_device.numThreads(); - internal::TensorContractionBlocking blocking(k, m, n, num_threads); - Index mc = blocking.mc(); - Index nc = blocking.nc(); - Index kc = blocking.kc(); - eigen_assert(mc <= m); - eigen_assert(nc <= n); - eigen_assert(kc <= k); - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) - const Index k_blocks = CEIL_DIV(k, kc); - const Index n_blocks = CEIL_DIV(n, nc); - const Index m_blocks = CEIL_DIV(m, mc); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - /* cout << "m: " << m << " n: " << n << " k: " << k << endl; - cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; - cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; - cout << "num threads: " << num_threads << endl; - */ - - // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB - // aren't 16 byte aligned segfaults will happen due to SIMD instructions - // note: You can get away with allocating just a single blockA and offsets and meet the - // the alignment requirements with the assumption that - // (Traits::mr * sizeof(ResScalar)) % 16 == 0 - const Index numBlockAs = numext::mini(num_threads, m_blocks); - MaxSizeVector blockAs(num_threads); - for (int i = 0; i < num_threads; i++) { - blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); - } - - // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread - // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. - // Other options: (1) reuse memory when a thread finishes. con: tricky - // (2) allocate block B memory in each thread. 
con: overhead - MaxSizeVector blockBs(n_blocks); - for (int i = 0; i < n_blocks; i++) { - blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); - } - - // lhs_notifications starts with all null Notifications - MaxSizeVector lhs_notifications(num_threads, nullptr); - - // this should really be numBlockAs * n_blocks; - const Index num_kernel_notifications = num_threads * n_blocks; - MaxSizeVector kernel_notifications(num_kernel_notifications, - nullptr); - - for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { - const Index k_start = k_block_idx * kc; - // make sure we don't overshoot right edge of left matrix - const Index actual_kc = numext::mini(k_start + kc, k) - k_start; - - for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { - const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); - - for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { - const Index m_start = mt_block_idx * mc; - const Index actual_mc = numext::mini(m_start + mc, m) - m_start; - eigen_assert(actual_mc > 0); - - Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; - - for (int i = 0; i < n_blocks; ++i) { - Index notification_id = (blockAId * n_blocks + i); - // Wait for any current kernels using this slot to complete - // before using it. - if (kernel_notifications[notification_id]) { - wait_until_ready(kernel_notifications[notification_id]); - delete kernel_notifications[notification_id]; - } - kernel_notifications[notification_id] = new Notification(); - } - const packLArg arg = { - blockAs[blockAId], // blockA - lhs, // lhs - m_start, // m - k_start, // k - actual_mc, // mc - actual_kc, // kc - }; - - // Delete any existing notification since we may be - // replacing it. The algorithm should ensure that there are - // no existing waiters on this notification. - delete lhs_notifications[blockAId]; - lhs_notifications[blockAId] = - this->m_device.enqueue(&Self::packLhs, arg); - } - - // now start kernels. - const Index m_base_start = m_block_idx * mc; - const bool need_to_pack = m_block_idx == 0; - - for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { - const Index n_start = n_block_idx * nc; - const Index actual_nc = numext::mini(n_start + nc, n) - n_start; - - // first make sure the previous kernels are all done before overwriting rhs. Also wait if - // we're going to start new k. In both cases need_to_pack is true. - if (need_to_pack) { - for (Index i = num_blocks; i < num_threads; ++i) { - Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; - Index future_id = (blockAId * n_blocks + n_block_idx); - wait_until_ready(kernel_notifications[future_id]); - } - } - - packRKArg arg = { - &blockAs, // blockA - blockBs[n_block_idx], // blockB - rhs, // rhs - output, // output - m_base_start, // m - k_start, // k - n_start, // n - mc, // mc - actual_kc, // kc - actual_nc, // nc - num_threads, - numBlockAs, - m, - k_block_idx, - m_block_idx, - n_block_idx, // n_block_idx - m_blocks, // m_blocks - n_blocks, // n_blocks - &kernel_notifications, // kernel notifications - &lhs_notifications, // lhs notifications - need_to_pack, // need_to_pack - }; - - // We asynchronously kick off this function, which ends up - // notifying the appropriate kernel_notifications objects, - // which this thread waits on before exiting. 
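          // (Illustrative walk-through with hypothetical sizes: for
          // num_threads = 4 and n_blocks = 3, the kernel working on
          // blockAId = 2 and n_block_idx = 1 signals notification slot
          // 2 * 3 + 1 = 7, the same slot waited on earlier before that
          // blockA buffer is reused.)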
- this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); - } - } - } - - // Make sure all the kernels are done. - for (size_t i = 0; i < kernel_notifications.size(); ++i) { - wait_until_ready(kernel_notifications[i]); - delete kernel_notifications[i]; - } - - // No need to wait for lhs notifications since they should have - // already been waited on. Just clean them up. - for (size_t i = 0; i < lhs_notifications.size(); ++i) { - delete lhs_notifications[i]; - } - - // deallocate all of the memory for both A and B's - for (size_t i = 0; i < blockAs.size(); i++) { - this->m_device.deallocate(blockAs[i]); - } - for (size_t i = 0; i < blockBs.size(); i++) { - this->m_device.deallocate(blockBs[i]); - } - -#undef CEIL_DIV - } - - /* - * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing - * the LHS block, check that all of the kernels that worked on the same - * mt_block_idx in the previous m_block are done. - */ - template - static void packLhs(const packLArg arg) { - // perform actual packing - LhsPacker pack_lhs; - pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); - } - - /* - * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that - * all kernels in the previous block are done. - * Then for each LHS future, we wait on the future and then call GEBP - * on the area packed by the future (which starts at - * blockA + future_idx * mt * kc) on the LHS and with the full packed - * RHS block. - * The output of this GEBP is written to output(m + i * mt, n). - */ - template - static void packRhsAndKernel(packRKArg arg) { - if (arg.need_to_pack) { - RhsPacker pack_rhs; - pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); - } - - GebpKernel gebp; - for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { - const Index m_base_start = arg.m + arg.mc*mt_block_idx; - if (m_base_start < arg.max_m) { - Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; - wait_until_ready((*arg.lhs_notifications)[blockAId]); - const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; - gebp(arg.output.getSubMapper(m_base_start, arg.n), - (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); - - // Notify that the kernel is done. - const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; - (*arg.kernel_notifications)[set_idx]->Notify(); - } - } - } -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - - TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, - bool shard_by_col, bool prepacked) const { - const int packed_size = std::min(PacketType::size, - PacketType::size); - const int output_packet_size = internal::unpacket_traits::size; - const double kd = static_cast(bk); - // Peak VFMA bandwidth is 0.5. However if we have not enough data for - // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined - // experimentally. - double computeBandwidth = bk == 1 ? 4.0 : - (shard_by_col ? bn : bm) < Traits::nr || - (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; -#ifndef EIGEN_VECTORIZE_FMA - // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. - // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, - // so overall bandwidth is 1.0. - if (computeBandwidth == 0.5) computeBandwidth = 1.0; -#endif - // Computations. 
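    // (Worked example with hypothetical numbers: for bk = 256 at the peak
    // VFMA bandwidth of 0.5 cycles per element, the per-coefficient compute
    // cost below is kd * computeBandwidth = 256 * 0.5 = 128 cycle units,
    // vectorized over packed_size lanes.)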
- TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); - // Output stores. - cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); - if (prepacked) { - // Packing and kernels are executed in different tasks. When we calculate - // task grain size we look only at kernel cost assuming that kernel - // is more expensive than packing. - return cost; - } - // Lhs/rhs loads + computations. - TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); - TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); - // Lhs packing memory cost does not contribute considerably to overall - // execution time because lhs is prefetched early and accessed sequentially. - if (shard_by_col) - lhsCost.dropMemoryCost(); - else - rhsCost.dropMemoryCost(); - return cost + lhsCost + rhsCost; - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_THREADS -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h deleted file mode 100644 index 860a6949a9b..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ /dev/null @@ -1,279 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H - -namespace Eigen { - -/** \class TensorConversionOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor conversion class. This class makes it possible to vectorize - * type casting operations when the number of scalars per packet in the source - * and the destination type differ - */ -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. 
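  // (Typical origin of this expression type, as an informal example built on
  // the public Tensor API:
  //   Eigen::Tensor<int, 2> a(4, 4);
  //   auto b = a.cast<float>();  // yields a TensorConversionOp<float, ...>
  // )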
- typedef TargetType Scalar; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - enum { Flags = 0 }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConversionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConversionOp type; -}; - -} // end namespace internal - - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - return internal::pcast(m_impl.template packet(index)); - } - - private: - const TensorEvaluator& m_impl; -}; - - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - - SrcPacket src1 = m_impl.template packet(index); - SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); - TgtPacket result = internal::pcast(src1, src2); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - - SrcPacket src1 = m_impl.template packet(index); - SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); - SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); - SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); - TgtPacket result = internal::pcast(src1, src2, src3, src4); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - // Only call m_impl.packet() when we have direct access to the underlying data. This - // ensures that we don't compute the subexpression twice. We may however load some - // coefficients twice, but in practice this doesn't negatively impact performance. 
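  // (Concrete reading, with float -> double as a hypothetical example: each
  // target packet consumes only half of a source packet, so consecutive
  // calls overlap and load the same source coefficients twice, converting a
  // different half each time.)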
- if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { - // Force unaligned memory loads since we can't ensure alignment anymore - return internal::pcast(m_impl.template packet(index)); - } else { - const int TgtPacketSize = internal::unpacket_traits::size; - typedef typename internal::unpacket_traits::type SrcType; - typedef typename internal::unpacket_traits::type TgtType; - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; - for (int i = 0; i < TgtPacketSize; ++i) { - values[i] = converter(m_impl.coeff(index+i)); - } - TgtPacket rslt = internal::pload(values); - return rslt; - } - } - - private: - const TensorEvaluator& m_impl; - const typename TensorEvaluator::Index m_maxIndex; -}; - -template -class TensorConversionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef typename internal::nested::type Nested; - typedef Scalar CoeffReturnType; - typedef typename NumTraits::Real RealScalar; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) - : m_xpr(xpr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { - impl.evalSubExprsIfNeeded(NULL); - return true; - } -}; - -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { - return impl.evalSubExprsIfNeeded(data); - } -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorConversionOp XprType; - typedef typename XprType::Index Index; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef TargetType Scalar; - typedef TargetType CoeffReturnType; - typedef typename internal::remove_all::Scalar>::type SrcType; - typedef typename PacketType::type PacketReturnType; - typedef typename PacketType::type PacketSourceType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = true, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) - { - return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() - { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - internal::scalar_cast_op converter; - return converter(m_impl.coeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const bool Vectorizable = TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; - return PacketConv::run(m_impl, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double cast_cost = TensorOpCost::CastCost(); - if (vectorized) { - const double SrcCoeffRatio = - 
internal::type_casting_traits::SrcCoeffRatio; - const double TgtCoeffRatio = - internal::type_casting_traits::TgtCoeffRatio; - return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + - TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); - } else { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); - } - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - }; - - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; - PacketConverter, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet(index); - } - }; - - TensorEvaluator m_impl; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h deleted file mode 100644 index 4f5767bc7f7..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ /dev/null @@ -1,337 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - -namespace Eigen { - -static const int kCudaScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// CUDA streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const cudaStream_t& stream() const = 0; - virtual const cudaDeviceProp& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -static cudaDeviceProp* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. 
This would be trivial to
- // implement if we could use std::mutex, but unfortunately mutexes don't
- // compile with nvcc, so we resort to atomics and thread fences instead.
- // Note that if the caller uses a compiler that doesn't support C++11 we
- // can't ensure that the initialization is thread safe.
-#if __cplusplus >= 201103L
-    static std::atomic<bool> first(true);
-    if (first.exchange(false)) {
-#else
-    static bool first = true;
-    if (first) {
-      first = false;
-#endif
-      // We're the first thread to reach this point.
-      int num_devices;
-      cudaError_t status = cudaGetDeviceCount(&num_devices);
-      if (status != cudaSuccess) {
-        std::cerr << "Failed to get the number of CUDA devices: "
-                  << cudaGetErrorString(status)
-                  << std::endl;
-        assert(status == cudaSuccess);
-      }
-      m_deviceProperties = new cudaDeviceProp[num_devices];
-      for (int i = 0; i < num_devices; ++i) {
-        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
-        if (status != cudaSuccess) {
-          std::cerr << "Failed to initialize CUDA device #"
-                    << i
-                    << ": "
-                    << cudaGetErrorString(status)
-                    << std::endl;
-          assert(status == cudaSuccess);
-        }
-      }
-
-#if __cplusplus >= 201103L
-      std::atomic_thread_fence(std::memory_order_release);
-#endif
-      m_devicePropInitialized = true;
-    } else {
-      // Wait for the other thread to initialize the properties.
-      while (!m_devicePropInitialized) {
-#if __cplusplus >= 201103L
-        std::atomic_thread_fence(std::memory_order_acquire);
-#endif
-        sleep(1);
-      }
-    }
-  }
-}
-
-static const cudaStream_t default_stream = cudaStreamDefault;
-
-class CudaStreamDevice : public StreamInterface {
- public:
-  // Use the default stream on the current device.
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
-    cudaGetDevice(&device_);
-    initializeDeviceProp();
-  }
-  // Use the default stream on the specified device.
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    initializeDeviceProp();
-  }
-  // Use the specified stream. Note that it's the caller's responsibility to
-  // ensure that the stream can run on the specified device. If no device is
-  // specified, the code assumes that the stream is associated with the
-  // current GPU device.
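  // (Usage sketch with hypothetical handle names, using only the CUDA
  // runtime plus the classes defined in this file:
  //   cudaStream_t s;
  //   cudaStreamCreate(&s);
  //   Eigen::CudaStreamDevice stream_device(&s, /*device=*/0);
  //   Eigen::GpuDevice gpu_device(&stream_device);
  // )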
- CudaStreamDevice(const cudaStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - cudaGetDevice(&device_); - } else { - int num_devices; - cudaError_t err = cudaGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~CudaStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const cudaStream_t& stream() const { return *stream_; } - const cudaDeviceProp& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - void* result; - err = cudaMalloc(&result, num_bytes); - assert(err == cudaSuccess); - assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(buffer != NULL); - err = cudaFree(buffer); - assert(err == cudaSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kCudaScratchSize; - semaphore_ = reinterpret_cast(scratch); - cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - return semaphore_; - } - - private: - const cudaStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. 
- EIGEN_STRONG_INLINE const cudaStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. - return firstLevelCacheSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) - cudaError_t err = cudaStreamSynchronize(stream_->stream()); - if (err != cudaSuccess) { - std::cerr << "Error detected in CUDA stream: " - << cudaGetErrorString(err) - << std::endl; - assert(err == cudaSuccess); - } -#else - assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the CUDA runtime recorded an error for the - // underlying stream device. 
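  // (Per the CUDA runtime semantics: cudaStreamQuery returns
  // cudaErrorNotReady while work is still pending on the stream, which is
  // not a failure, so both cudaSuccess and cudaErrorNotReady count as "ok"
  // below.)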
- inline bool ok() const { -#ifdef __CUDACC__ - cudaError_t error = cudaStreamQuery(stream_->stream()); - return (error == cudaSuccess) || (error == cudaErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - - -// FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ -static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ - cudaError_t status = cudaDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h deleted file mode 100644 index 7c039890e2e..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ /dev/null @@ -1,122 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// Copyright (C) 2016 Benoit Steiner - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H - -namespace Eigen { -struct SyclDevice { - /// class members - /// sycl queue - mutable cl::sycl::queue m_queue; - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; - /// creating device by using selector - template SyclDevice(dev_Selector s) - : -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - } - } - })) -#else - m_queue(cl::sycl::queue(s)) -#endif - {} - // destructor - ~SyclDevice() { deallocate_all(); } - - template void deallocate(T *p) const { - auto it = buffer_map.find(p); - if (it != buffer_map.end()) { - buffer_map.erase(it); - internal::aligned_free(p); - } - } - void deallocate_all() const { - std::map>::iterator it=buffer_map.begin(); - while (it!=buffer_map.end()) { - auto p=it->first; - buffer_map.erase(it); - internal::aligned_free(const_cast(p)); - it=buffer_map.begin(); - } - buffer_map.clear(); - } - - /// creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - ///the function then adds an entry by creating a sycl buffer for that particular pointer. 
- template inline cl::sycl::accessor - get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { - return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); - } - - template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { - using Type = cl::sycl::buffer; - std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), - [](void *dataMem) { delete static_cast(dataMem); }))); - (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); - return ret; - } - - template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { - return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); - } - - /// allocating memory on the cpu - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { - return internal::aligned_malloc(8); - } - - // some runtime conditions that can be applied here - bool isDeviceSuitable() const { return true; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - ::memcpy(dst, src, n); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { - auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); - memcpy(host_acc.get_pointer(), src, n); - } - /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { - auto it = buffer_map.find(src); - if (it != buffer_map.end()) { - auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); - memcpy(dst,host_acc.get_pointer(), n); - } else{ - eigen_assert("no device memory found. The memory might be destroyed before creation"); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return 1; - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h deleted file mode 100644 index a5e084a2409..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ /dev/null @@ -1,282 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H - -namespace Eigen { - -// Use the SimpleThreadPool by default. We'll switch to the new non blocking -// thread pool later. 
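// (As the #ifndef below shows, the non-blocking pool is what is actually
// selected unless EIGEN_USE_SIMPLE_THREAD_POOL is explicitly defined.)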
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-
-
-// Barrier is an object that allows one or more threads to wait until
-// Notify has been called a specified number of times.
-class Barrier {
- public:
-  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
-    eigen_assert(((count << 1) >> 1) == count);
-  }
-  ~Barrier() {
-    eigen_plain_assert((state_ >> 1) == 0);
-  }
-
-  void Notify() {
-    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
-    if (v != 1) {
-      eigen_assert(((v + 2) & ~1) != 0);
-      return;  // either count has not dropped to 0, or waiter is not waiting
-    }
-    std::unique_lock<std::mutex> l(mu_);
-    eigen_assert(!notified_);
-    notified_ = true;
-    cv_.notify_all();
-  }
-
-  void Wait() {
-    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
-    if ((v >> 1) == 0) return;
-    std::unique_lock<std::mutex> l(mu_);
-    while (!notified_) {
-      cv_.wait(l);
-    }
-  }
-
- private:
-  std::mutex mu_;
-  std::condition_variable cv_;
-  std::atomic<unsigned int> state_;  // low bit is waiter flag
-  bool notified_;
-};
-
-
-// Notification is an object that allows a user to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object,
-// but only one caller must call Notify() on the object.
-struct Notification : Barrier {
-  Notification() : Barrier(1) {}
-};
-
-
-// Runs an arbitrary function and then calls Notify() on the passed-in
-// Notification.
-template <typename Function, typename... Args> struct FunctionWrapperWithNotification
-{
-  static void run(Notification* n, Function f, Args... args) {
-    f(args...);
-    if (n) {
-      n->Notify();
-    }
-  }
-};
-
-template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
-{
-  static void run(Barrier* b, Function f, Args... args) {
-    f(args...);
-    if (b) {
-      b->Notify();
-    }
-  }
-};
-
-template <typename SyncType>
-static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
-  if (n) {
-    n->Wait();
-  }
-}
-
-
-// Build a thread pool device on top of an existing pool of threads.
-struct ThreadPoolDevice {
-  // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
-
-  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return internal::aligned_malloc(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    internal::aligned_free(buffer);
-  }
-
-  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    ::memcpy(dst, src, n);
-  }
-  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-    memcpy(dst, src, n);
-  }
-  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-    memcpy(dst, src, n);
-  }
-
-  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-    ::memset(buffer, c, n);
-  }
-
-  EIGEN_STRONG_INLINE int numThreads() const {
-    return num_threads_;
-  }
-
-  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-    return l1CacheSize();
-  }
-
-  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-    // The L3 cache size is shared between all the cores.
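    // (So each thread is modeled as owning an equal share: e.g. a
    // hypothetical 16 MB L3 shared by 8 threads yields 2 MB per thread from
    // the expression below.)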
-    return l3CacheSize() / num_threads_;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-    // Should return an enum that encodes the ISA supported by the CPU.
-    return 1;
-  }
-
-  template <typename Function, typename... Args>
-  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
-    Notification* n = new Notification();
-    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
-    return n;
-  }
-
-  template <typename Function, typename... Args>
-  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
-                                                Function&& f,
-                                                Args&&... args) const {
-    pool_->Schedule(std::bind(
-        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
-  }
-
-  template <typename Function, typename... Args>
-  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
-    pool_->Schedule(std::bind(f, args...));
-  }
-
-  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
-  // called from one of the threads in pool_. Returns -1 otherwise.
-  EIGEN_STRONG_INLINE int currentThreadId() const {
-    return pool_->CurrentThreadId();
-  }
-
-  // parallelFor executes f with [0, n) arguments in parallel and waits for
-  // completion. f accepts a half-open interval [first, last).
-  // The block size is chosen based on the iteration cost and the resulting
-  // parallel efficiency. If block_align is not nullptr, it is called to
-  // round up the block size.
-  void parallelFor(Index n, const TensorOpCost& cost,
-                   std::function<Index(Index)> block_align,
-                   std::function<void(Index, Index)> f) const {
-    typedef TensorCostModel<ThreadPoolDevice> CostModel;
-    if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
-      f(0, n);
-      return;
-    }
-
-    // Calculate the block size based on (1) the iteration cost and
-    // (2) parallel efficiency. We want blocks to be not too small, to
-    // mitigate parallelization overheads, and not too large, to mitigate the
-    // tail effect and potential load imbalance; we also want the number of
-    // blocks to divide evenly across threads.
-
-    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
-    const Index max_oversharding_factor = 4;
-    Index block_size = numext::mini(
-        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
-                               block_size_f));
-    const Index max_block_size = numext::mini(n, 2 * block_size);
-    if (block_align) {
-      Index new_block_size = block_align(block_size);
-      eigen_assert(new_block_size >= block_size);
-      block_size = numext::mini(n, new_block_size);
-    }
-    Index block_count = divup(n, block_size);
-    // Calculate parallel efficiency as the fraction of total CPU time used
-    // for computations:
-    double max_efficiency =
-        static_cast<double>(block_count) /
-        (divup<int>(block_count, numThreads()) * numThreads());
-    // Now try to increase the block size up to max_block_size as long as it
-    // doesn't decrease parallel efficiency.
-    for (Index prev_block_count = block_count;
-         max_efficiency < 1.0 && prev_block_count > 1;) {
-      // This is the next block size that divides the size into a smaller
-      // number of blocks than the current block_size.
-      Index coarser_block_size = divup(n, prev_block_count - 1);
-      if (block_align) {
-        Index new_block_size = block_align(coarser_block_size);
-        eigen_assert(new_block_size >= coarser_block_size);
-        coarser_block_size = numext::mini(n, new_block_size);
-      }
-      if (coarser_block_size > max_block_size) {
-        break;  // Reached max block size. Stop.
-      }
-      // Recalculate parallel efficiency.
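      // (Worked example with hypothetical numbers: 11 blocks on 4 threads
      // give 11 / (divup(11, 4) * 4) = 11 / 12 ~ 0.92 efficiency; coarsening
      // to 8 blocks gives 8 / (2 * 4) = 1.0, so the coarser block size would
      // be committed.)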
- const Index coarser_block_count = divup(n, coarser_block_size); - eigen_assert(coarser_block_count < prev_block_count); - prev_block_count = coarser_block_count; - const double coarser_efficiency = - static_cast(coarser_block_count) / - (divup(coarser_block_count, numThreads()) * numThreads()); - if (coarser_efficiency + 0.01 >= max_efficiency) { - // Taking it. - block_size = coarser_block_size; - block_count = coarser_block_count; - if (max_efficiency < coarser_efficiency) { - max_efficiency = coarser_efficiency; - } - } - } - - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast(block_count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { - if (last - first <= block_size) { - // Single block or less, execute directly. - f(first, last); - barrier.Notify(); - return; - } - // Split into halves and submit to the pool. - Index mid = first + divup((last - first) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); - }; - handleRange(0, n); - barrier.Wait(); - } - - // Convenience wrapper for parallelFor that does not align blocks. - void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, nullptr, std::move(f)); - } - - private: - ThreadPoolInterface* pool_; - int num_threads_; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h deleted file mode 100644 index 834ce07df55..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ /dev/null @@ -1,633 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H - -namespace Eigen { - -/** \class TensorEvaluator - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor evaluator classes. - * - * These classes are responsible for the evaluation of the tensor expression. - * - * TODO: add support for more types of expressions, in particular expressions - * leading to lvalues (slicing, reshaping, etc...) - */ - -// Generic evaluator -template -struct TensorEvaluator -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits::NumDimensions > 0 ? 
- internal::traits::NumDimensions : 0; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) - { } - - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { - if (dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); - return false; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); - return m_data[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - eigen_assert(m_data); - return m_data[index]; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt(m_data + index); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - return internal::pstoret(m_data + index, x); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); - if (static_cast(Layout) == static_cast(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { - eigen_assert(m_data); - if (static_cast(Layout) == static_cast(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} - - protected: - typename internal::traits::template MakePointer::Type m_data; - Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; -}; - -namespace { -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T loadConstant(const T* address) { - return *address; -} -// Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float loadConstant(const float* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double loadConstant(const double* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -Eigen::half loadConstant(const Eigen::half* address) { - return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); -} -#endif -} - - -// Default evaluator for rvalues -template -struct TensorEvaluator -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - 
typedef typename PacketType::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits::NumDimensions > 0 ? - internal::traits::NumDimensions : 0; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - if (!NumTraits::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); - return false; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); - return loadConstant(m_data+index); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt_ro(m_data + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); - const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) - : m_dims.IndexOfRowMajor(coords); - return loadConstant(m_data+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - - protected: - typename internal::traits::template MakePointer::Type m_data; - Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; -}; - - - - -// -------------------- CwiseNullaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseNullaryOp XprType; - - enum { - IsAligned = true, - PacketAccess = internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC - TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_wrapper(m_functor, index); 
- } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_wrapper.template packetOp(m_functor, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } - - - private: - const NullaryOp m_functor; - TensorEvaluator m_argImpl; - const internal::nullary_wrapper m_wrapper; -}; - - - -// -------------------- CwiseUnaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseUnaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_argImpl(op.nestedExpression(), device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_argImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_argImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_argImpl.coeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_argImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_argImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return m_functor; } - - - private: - const UnaryOp m_functor; - TensorEvaluator m_argImpl; -}; - - -// -------------------- CwiseBinaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseBinaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_leftImpl(op.lhsExpression(), device), - m_rightImpl(op.rhsExpression(), 
device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use right impl instead if right impl dimensions are known at compile time. - return m_leftImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_leftImpl.costPerCoeff(vectorized) + - m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } - - private: - const BinaryOp m_functor; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; -}; - -// -------------------- CwiseTernaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseTernaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_arg1Impl(op.arg1Expression(), device), - m_arg2Impl(op.arg2Expression(), device), - m_arg3Impl(op.arg3Expression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - - EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::Index, - typename 
internal::traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::Index, - typename internal::traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) - - eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use arg2 or arg3 dimensions if they are known at compile time. - return m_arg1Impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_arg1Impl.evalSubExprsIfNeeded(NULL); - m_arg2Impl.evalSubExprsIfNeeded(NULL); - m_arg3Impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_arg1Impl.cleanup(); - m_arg2Impl.cleanup(); - m_arg3Impl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_arg1Impl.template packet(index), - m_arg2Impl.template packet(index), - m_arg3Impl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_arg1Impl.costPerCoeff(vectorized) + - m_arg2Impl.costPerCoeff(vectorized) + - m_arg3Impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } - - private: - const TernaryOp m_functor; - TensorEvaluator m_arg1Impl; - TensorEvaluator m_arg2Impl; - TensorEvaluator m_arg3Impl; -}; - - -// -------------------- SelectOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorSelectOp XprType; - typedef typename XprType::Scalar Scalar; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::packet_traits::HasBlend, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_condImpl(op.ifExpression(), device), - m_thenImpl(op.thenExpression(), device), - m_elseImpl(op.elseExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - 
eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); - eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use then or else impl instead if they happen to be known at compile time. - return m_condImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_condImpl.evalSubExprsIfNeeded(NULL); - m_thenImpl.evalSubExprsIfNeeded(NULL); - m_elseImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_condImpl.cleanup(); - m_thenImpl.cleanup(); - m_elseImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); - } - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - internal::Selector select; - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet(index), - m_elseImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return m_condImpl.costPerCoeff(vectorized) + - m_thenImpl.costPerCoeff(vectorized) - .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& else_impl() const { return m_elseImpl; } - - private: - TensorEvaluator m_condImpl; - TensorEvaluator m_thenImpl; - TensorEvaluator m_elseImpl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h deleted file mode 100644 index f01d77c0a06..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ /dev/null @@ -1,288 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H - -namespace Eigen { - -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ -namespace internal { - -// Default strategy: the expression is evaluated with a single cpu thread. 
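A stand-alone sketch of the loop this default strategy amounts to may help; `run_single_threaded` and `Evaluator` are illustrative stand-ins, and only the three evaluator calls used below are assumed:

```cpp
// Minimal model of the default executor below: materialize subexpressions,
// then evaluate every coefficient of the flat index space in order.
template <typename Evaluator, typename Index>
void run_single_threaded(Evaluator& evaluator, Index size) {
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    for (Index i = 0; i < size; ++i) {
      evaluator.evalScalar(i);  // compute and store coefficient i
    }
  }
  evaluator.cleanup();  // release any temporaries
}
```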
-template -class TensorExecutor -{ - public: - typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - - -template -class TensorExecutor -{ - public: - typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not - // unroll the loop at the expense of inlining. - const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { - evaluator.evalPacket(i); - } - for (Index i = VectorizedSize; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - - - -// Multicore strategy: the index space is partitioned and each partition is executed on a single core -#ifdef EIGEN_USE_THREADS -template -struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { - Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { - evaluator.evalScalar(i); - } - } - - static Index alignBlockSize(Index size) { - return size; - } -}; - -template -struct EvalRange { - static const int PacketSize = unpacket_traits::size; - - static void run(Evaluator* evaluator_in, const Index first, const Index last) { - Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - Index i = first; - if (last - first >= PacketSize) { - eigen_assert(first % PacketSize == 0); - Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not - // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - last_chunk_offset = last - PacketSize; - for (; i <= last_chunk_offset; i += PacketSize) { - evaluator.evalPacket(i); - } - } - for (; i < last; ++i) { - evaluator.evalScalar(i); - } - } - - static Index alignBlockSize(Index size) { - // Align block size to packet size and account for unrolling in run above. - if (size >= 16 * PacketSize) { - return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); - } - // Aligning to 4 * PacketSize would increase block size by more than 25%. 
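The return expressions in alignBlockSize rely on a standard round-up-to-a-multiple bit trick; a minimal stand-alone version (nothing Eigen-specific assumed):

```cpp
#include <cassert>
#include <cstddef>

// Round size up to the next multiple of align, where align is a power of
// two: adding align-1 carries over the boundary, the mask clears the rest.
std::size_t round_up(std::size_t size, std::size_t align) {
  assert(align != 0 && (align & (align - 1)) == 0);  // power of two only
  return (size + align - 1) & ~(align - 1);
}
// e.g. round_up(13, 4) == 16 and round_up(16, 4) == 16.
```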
- return (size + PacketSize - 1) & ~(PacketSize - 1); - } -}; - -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { - typedef TensorEvaluator Evaluator; - Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); -#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) - device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { - EvalRange::run(&evaluator, first, last); - }); -#else - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } -#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) - } - evaluator.cleanup(); - } -}; -#endif // EIGEN_USE_THREADS - - -// GPU: the evaluation of the expression is offloaded to a GPU. -#if defined(EIGEN_USE_GPU) - -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); -}; - - -#if defined(__CUDACC__) -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits::size; - const Index vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; - - // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first; i < last; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template -__global__ void -__launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { - - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; - - const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval::run(eval, first_index, size, step_size); -} - -/*static*/ -template -inline void TensorExecutor::run( - const Expression& expr, const GpuDevice& device) { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * - device.maxCudaThreadsPerMultiProcessor() / block_size; - const Index 
size = array_prod(evaluator.dimensions()); - // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0. - const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - - LAUNCH_CUDA_KERNEL( - (EigenMetaKernel, Index>), - num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); -} - -#endif // __CUDACC__ -#endif // EIGEN_USE_GPU - -// SYCL Executor policy -#ifdef EIGEN_USE_SYCL - -template -class TensorExecutor { -public: - static inline void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); - } -}; - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h deleted file mode 100644 index 8bece4e6515..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ /dev/null @@ -1,169 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H -#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H - -namespace Eigen { - -namespace internal { -template class MakePointer_> -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. - typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template class MakePointer_> -struct eval, Eigen::Dense> -{ - typedef const TensorForcedEvalOp& type; -}; - -template class MakePointer_> -struct nested, 1, typename eval >::type> -{ - typedef TensorForcedEvalOp type; -}; - -} // end namespace internal - - - -// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) - -/** \class TensorForcedEvalOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor forced evaluation class. - * - * - */ -/// `template class MakePointer_` is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler `T*` is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. -/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is `T*`.
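The MakePointer_ mechanism described above can be sketched in isolation; the names below are illustrative, not Eigen's actual definitions:

```cpp
// A template template parameter maps a scalar type T to whatever
// pointer-like type a backend needs; the default is a plain T*.
template <typename T> struct MakeRawPointer { typedef T* Type; };

template <typename T, template <class> class MakePointer_ = MakeRawPointer>
struct DeviceBuffer {
  typename MakePointer_<T>::Type data;  // T* unless a backend overrides it
};
```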
-template class MakePointer_> -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - - -template class MakePointer_> -struct TensorEvaluator, Device> -{ - typedef TensorForcedEvalOp XprType; - typedef typename ArgType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = (PacketSize > 1), - Layout = TensorEvaluator::Layout, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) - { } - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); - // Should initialize the memory in case we're dealing with non POD types. 
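The initialization step performed just below can be modelled in isolation; std::is_trivially_default_constructible stands in here for Eigen's NumTraits-based RequireInitialization test:

```cpp
#include <cstddef>
#include <new>
#include <type_traits>

// Construct objects in raw storage only when T actually needs it;
// trivially constructible types can be left as uninitialized bytes.
template <typename T>
void construct_if_needed(T* buffer, std::size_t n) {
  if (!std::is_trivially_default_constructible<T>::value) {
    for (std::size_t i = 0; i < n; ++i) {
      new (buffer + i) T();  // placement-new into pre-allocated memory
    }
  }
}
```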
- if (NumTraits::RequireInitialization) { - for (Index i = 0; i < numValues; ++i) { - new(m_buffer+i) CoeffReturnType(); - } - } - typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; - EvalTo evalToTmp(m_buffer, m_op); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate(m_buffer); - m_buffer = NULL; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_buffer[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return internal::ploadt(m_buffer + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } - - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() { return m_impl; } - /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} - private: - TensorEvaluator m_impl; - const ArgType m_op; - const Device& m_device; - typename MakePointer::Type m_buffer; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h deleted file mode 100644 index ee0078bbcc4..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ /dev/null @@ -1,54 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H -#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H - - -/** use this macro in sfinae selection in templated functions - * - * template::value , int >::type = 0 - * > - * void foo(){} - * - * becomes => - * - * template::value ) - * > - * void foo(){} - */ - -// SFINAE requires variadic templates -#ifndef __CUDACC__ -#if EIGEN_HAS_VARIADIC_TEMPLATES - // SFINAE doesn't work for gcc <= 4.7 - #ifdef EIGEN_COMP_GNUC - #if EIGEN_GNUC_AT_LEAST(4,8) - #define EIGEN_HAS_SFINAE - #endif - #else - #define EIGEN_HAS_SFINAE - #endif -#endif -#endif - -#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ - typename internal::enable_if< ( __condition__ ) , int >::type = 0 - - -#if EIGEN_HAS_CONSTEXPR -#define EIGEN_CONSTEXPR constexpr -#else -#define EIGEN_CONSTEXPR -#endif - - -#endif diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h deleted file mode 100644 index 647bcf10887..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ /dev/null @@ -1,397 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H -#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H - -namespace Eigen { - -/** \class TensorPadding - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor padding class. - * At the moment only padding with a constant value is supported. - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorPaddingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorPaddingOp type; -}; - -} // end namespace internal - - - -template -class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) - : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC - const PaddingDimensions& padding() const { return m_padding_dims; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const PaddingDimensions m_padding_dims; - const Scalar m_padding_value; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorPaddingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = true, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) - { - // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead - // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector - // of 1 element first and then pad. 
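For orientation, a small usage sketch of the expression this evaluator implements, assuming the unsupported Tensor module header and its pad() overload that takes an explicit padding value:

```cpp
#include <utility>
#include <unsupported/Eigen/CXX11/Tensor>

// Pad a 2x3 tensor with one element of zero padding on each side of both
// dimensions, producing a 4x5 result.
void pad_example() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setConstant(1.0f);
  Eigen::array<std::pair<int, int>, 2> paddings;
  paddings[0] = std::make_pair(1, 1);  // dim 0: pad one before, one after
  paddings[1] = std::make_pair(1, 1);  // dim 1: pad one before, one after
  Eigen::Tensor<float, 2> padded = t.pad(paddings, 0.0f);
}
```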
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Compute dimensions - m_dimensions = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] += m_padding[i].first + m_padding[i].second; - } - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; - } else { - m_inputStrides[NumDims - 1] = 1; - m_outputStrides[NumDims] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; - } - m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(index < dimensions().TotalSize()); - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (isPaddingAtIndexForDim(index, 0)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[0].first); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i+1]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - if (isPaddingAtIndexForDim(index, NumDims-1)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[NumDims-1].first); - } - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); - } - return packetRowMajor(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - TensorOpCost cost = m_impl.costPerCoeff(vectorized); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) - updateCostPerDimension(cost, i, i == 0); - } else { - for (int i = NumDims - 1; i >= 0; --i) - updateCostPerDimension(cost, i, i == NumDims - 1); - } - return cost; - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( - Index index, int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return (!internal::index_pair_first_statically_eq(dim_index, 0) && - index < m_padding[dim_index].first) || - (!internal::index_pair_second_statically_eq(dim_index, 0) && - index >= m_dimensions[dim_index] - m_padding[dim_index].second); -#else - return (index < m_padding[dim_index].first) || - (index >= m_dimensions[dim_index] - 
m_padding[dim_index].second); -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( - int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_first_statically_eq(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( - int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_second_statically_eq(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - - void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { - const double in = static_cast(m_impl.dimensions()[i]); - const double out = in + m_padding[i].first + m_padding[i].second; - if (out == 0) - return; - const double reduction = in / out; - cost *= reduction; - if (first) { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + - reduction * (1 * TensorOpCost::AddCost())); - } else { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - reduction * (2 * TensorOpCost::MulCost() + - 1 * TensorOpCost::DivCost())); - } - } - - protected: - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index first = index; - const Index last = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; - const Index lastPaddedRight = m_outputStrides[i+1]; - - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficients are between the 2 padding zones. - const Index idx = index / m_outputStrides[i]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index last = index + PacketSize - 1; - const Index first = index; - const Index lastPaddedLeft = m_padding[0].first; - const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); - const Index lastPaddedRight = m_outputStrides[1]; - - if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficients are between the 2 padding zones.
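The per-dimension case analysis driving this function (and packetRowMajor below) can be summarized by a stand-alone sketch with illustrative names; the original code then continues:

```cpp
// Classify a contiguous packet [first, last] within one padded dimension:
// entirely in the left padding, entirely in the right padding, entirely
// inside the input, or straddling a boundary (scalar fallback).
enum class PacketRegion { AllLeftPad, AllRightPad, AllInterior, Mixed };

PacketRegion classify(long first, long last,
                      long pad_left_end,      // first non-padded index
                      long pad_right_begin,   // first right-padded index
                      long dim_end) {         // one past the last index
  if (last < pad_left_end) return PacketRegion::AllLeftPad;
  if (first >= pad_right_begin && last < dim_end) return PacketRegion::AllRightPad;
  if (first >= pad_left_end && last < pad_right_begin) return PacketRegion::AllInterior;
  return PacketRegion::Mixed;
}
```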
- inputIndex += (index - m_padding[0].first); - return m_impl.template packet(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - - for (int i = 0; i < NumDims - 1; ++i) { - const Index first = index; - const Index last = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; - const Index lastPaddedRight = m_outputStrides[i]; - - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficients are between the 2 padding zones. - const Index idx = index / m_outputStrides[i+1]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index last = index + PacketSize - 1; - const Index first = index; - const Index lastPaddedLeft = m_padding[NumDims-1].first; - const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); - const Index lastPaddedRight = m_outputStrides[NumDims-1]; - - if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficients are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficients are between the 2 padding zones.
- inputIndex += (index - m_padding[NumDims-1].first); - return m_impl.template packet(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - PaddingDimensions m_padding; - - Scalar m_paddingValue; -}; - - - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h deleted file mode 100644 index 1655a813e4f..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ /dev/null @@ -1,276 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H -#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H - -namespace Eigen { -namespace internal { - -namespace { - -EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ - // We don't support 3d kernels since we currently only use 1 and - // 2d kernels. - assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. - SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. - // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - -#else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; -#endif -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { - // TODO: Unify with the implementation in the non blocking thread pool. 
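The POSIX branch of get_random_seed above mixes a high-resolution clock with the C library PRNG so that two calls within one clock tick still differ; a stand-alone sketch of that idea (POSIX-only: clock_gettime and ::random):

```cpp
#include <cstdint>
#include <cstdlib>
#include <ctime>

// XOR a nanosecond clock reading into a PRNG draw to build a seed that
// changes even when the clock has not advanced between calls.
std::uint64_t mixed_seed() {
  timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  return static_cast<std::uint64_t>(::random()) ^
         static_cast<std::uint64_t>(ts.tv_nsec);
}
```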
- uint64_t current = *state; - // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; - // Generate the random output (using the PCG-XSH-RS scheme) - return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { - seed = seed ? seed : get_random_seed(); - return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; -} - -} // namespace - - -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state) { - unsigned rnd = PCG_XSH_RS_generator(state); - return static_cast(rnd); -} - - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform(uint64_t* state) { - Eigen::half result; - // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state); - result.x = static_cast(rnd & 0x3ffu); - // Set the exponent - result.x |= (static_cast(15) << 10); - // Return the final result - return result - Eigen::half(1.0f); -} - - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform(uint64_t* state) { - typedef union { - uint32_t raw; - float fp; - } internal; - internal result; - // Generate 23 random bits for the mantissa - const unsigned rnd = PCG_XSH_RS_generator(state); - result.raw = rnd & 0x7fffffu; - // Set the exponent - result.raw |= (static_cast(127) << 23); - // Return the final result - return result.fp - 1.0f; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform(uint64_t* state) { - typedef union { - uint64_t raw; - double dp; - } internal; - internal result; - result.raw = 0; - // Generate 52 random bits for the mantissa - // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; - // Then generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state); - result.raw = (static_cast(rnd1) << 32) | rnd2; - // Set the exponent - result.raw |= (static_cast(1023) << 52); - // Return the final result - return result.dp - 1.0; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); -} - -template class UniformRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed.
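The float specialization above manufactures a uniform deviate in [0, 1) from raw bits; a self-contained sketch of the same trick, with std::memcpy replacing the union to avoid type punning:

```cpp
#include <cstdint>
#include <cstring>

// Put 23 random bits in the mantissa, force the exponent to 127 so the
// value lies in [1, 2), then subtract 1 to land in [0, 1).
float bits_to_unit_float(std::uint32_t rnd) {
  std::uint32_t raw = (rnd & 0x7fffffu)   // 23 mantissa bits
                    | (127u << 23);       // biased exponent for [1, 2)
  float fp;
  std::memcpy(&fp, &raw, sizeof fp);
  return fp - 1.0f;
}
```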
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - const UniformRandomGenerator& other) { - m_state = other.m_state; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeUniform(&local_state); - m_state = local_state; - return result; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform(&local_state); - } - m_state = local_state; - return internal::pload(values); - } - - private: - mutable uint64_t m_state; -}; - -template -struct functor_traits > { - enum { - // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). - Cost = 12 * NumTraits::AddCost * - ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), - PacketAccess = UniformRandomGenerator::PacketAccess - }; -}; - - - -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state) { - // Use the ratio of uniform method to generate numbers following a normal - // distribution. See for example Numerical Recipes chapter 7.3.9 for the - // details. - T u, v, q; - do { - u = RandomToTypeUniform(state); - v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); - const T x = u - T(0.449871); - const T y = numext::abs(v) + T(0.386595); - q = x*x + y * (T(0.196)*y - T(0.25472)*x); - } while (q > T(0.27597) && - (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); - - return v/u; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); -} - - -template class NormalRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed. 
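The rejection loop in RandomToTypeNormal above is the ratio-of-uniforms method with Leva's bounds; a self-contained mirror of it, using std::mt19937_64 as the uniform source instead of the PCG state:

```cpp
#include <cmath>
#include <random>

// Draw a standard normal deviate by accepting v/u only when (u, v) falls
// inside the acceptance region; the constants match the code above.
double normal_via_ratio_of_uniforms(std::mt19937_64& gen) {
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  double u, v, q;
  do {
    u = unif(gen);
    v = 1.7156 * (unif(gen) - 0.5);
    const double x = u - 0.449871;
    const double y = std::abs(v) + 0.386595;
    q = x * x + y * (0.196 * y - 0.25472 * x);
  } while (q > 0.27597 &&
           (q > 0.27846 || v * v > -4.0 * std::log(u) * u * u));
  return v / u;
}
```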
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( - const NormalRandomGenerator& other) { - m_state = other.m_state; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeNormal(&local_state); - m_state = local_state; - return result; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal(&local_state); - } - m_state = local_state; - return internal::pload(values); - } - - private: - mutable uint64_t m_state; -}; - - -template -struct functor_traits > { - enum { - // On average, we need to generate about 3 random numbers - // 15 mul, 8 add, 1.5 logs - Cost = 3 * functor_traits >::Cost + - 15 * NumTraits::AddCost + 8 * NumTraits::AddCost + - 3 * functor_traits >::Cost / 2, - PacketAccess = NormalRandomGenerator::PacketAccess - }; -}; - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h deleted file mode 100644 index 3daecb04540..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ /dev/null @@ -1,242 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclPlaceHolderExpr.h - * - * \brief: - * This is the specialisation of the placeholder expression based on the - * operation type - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP - -namespace Eigen { -namespace internal { - -template struct syclGenericBufferReducer{ -template -static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - do { - auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable { - cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, - cl::sycl::range<1>{std::min(length, local)}}; - /* Two accessors are used: one to the buffer that is being reduced, - * and a second to local memory, used to store intermediate data. */ - auto aI = - bufI.template get_access(h); - auto aOut = - bufOut->template get_access(h); - cl::sycl::accessor - scratch(cl::sycl::range<1>(local), h); - - /* The parallel_for invocation chosen is the variant with an nd_item - * parameter, since the code requires barriers for correctness. 
*/ - h.parallel_for( - r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) { - size_t globalid = id.get_global(0); - size_t localid = id.get_local(0); - /* All threads collectively read from global memory into local. - * The barrier ensures all threads' IO is resolved before - * execution continues (strictly speaking, all threads within - * a single work-group - there is no co-ordination between - * work-groups, only work-items). */ - if (globalid < length) { - scratch[localid] = aI[globalid]; - } - id.barrier(cl::sycl::access::fence_space::local_space); - - /* Apply the reduction operation between the current local - * id and the one on the other half of the vector. */ - if (globalid < length) { - int min = (length < local) ? length : local; - for (size_t offset = min / 2; offset > 0; offset /= 2) { - if (localid < offset) { - scratch[localid] += scratch[localid + offset]; - } - id.barrier(cl::sycl::access::fence_space::local_space); - } - /* The final result will be stored in local id 0. */ - if (localid == 0) { - aI[id.get_group(0)] = scratch[localid]; - if((length<=local) && globalid ==0){ - aOut[globalid]=scratch[localid]; - } - } - } - }); - }; - dev.m_queue.submit(f); - dev.m_queue.throw_asynchronous(); - - /* At this point, you could queue::wait_and_throw() to ensure that - * errors are caught quickly. However, this would likely impact - * performance negatively. */ - length = length / local; - - } while (length > 1); - - - -} - -}; - -/// For now let's start with a full reducer -/// Self is useless here because in expression construction we are going to treat reduction as a leafnode. -/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the -/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as -// a leafNode. -template -struct FullReducer { - - typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; - - static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { - typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - auto functors = TensorSycl::internal::extractFunctors(self.impl()); - int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. - size_t inputSize =self.impl().dimensions().TotalSize(); - size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input - size_t remaining = inputSize% red_factor; - if(rng ==0) { - red_factor=1; - }; - size_t tileSize =dev.m_queue.get_device(). template get_info()/2; - size_t GRange=std::max((size_t )1, rng); - - // convert global range to power of 2 for redecution - GRange--; - GRange |= GRange >> 1; - GRange |= GRange >> 2; - GRange |= GRange >> 4; - GRange |= GRange >> 8; - GRange |= GRange >> 16; -#if __x86_64__ || __ppc64__ || _WIN64 - GRange |= GRange >> 32; -#endif - GRange++; - size_t outTileSize = tileSize; - /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. 
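/// [Editorial note, not in the original source: the bit-smearing sequence
/// above (GRange |= GRange >> 1, >> 2, ..., and >> 32 on 64-bit targets)
/// rounds GRange up to the next power of two. After the ORs, every bit below
/// the highest set bit is 1, so the final GRange++ carries into a single
/// leading bit. For example, GRange = 1000 becomes 999 after the initial
/// decrement, 1023 after the ORs, and 1024 after the increment; the initial
/// GRange-- is what keeps an exact power of two unchanged.]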
- if (GRange < outTileSize) outTileSize=GRange; - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.template get_sycl_buffer::type>(self.dimensions().TotalSize(), output); - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - auto temp_global_buffer =cl::sycl::buffer(cl::sycl::range<1>(GRange)); - typedef typename Eigen::internal::remove_all::type Dims; - Dims dims= self.xprDims(); - Op functor = reducer; - dev.m_queue.submit([&](cl::sycl::handler &cgh) { - // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto tmp_global_accessor = temp_global_buffer. template get_access(cgh); - - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= TensorReductionOp(device_expr.expr, dims, functor); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=itemID.get_global_linear_id(); - - if(globalid::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast(functor)); - else - tmp_global_accessor.get_pointer()[globalid]=static_cast(0); - - if(remaining!=0 && globalid==0 ) - // this will add the rest of input buffer when the input size is not devidable to red_factor. - tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast(functor)); - }); - }); - dev.m_queue.throw_asynchronous(); - -/// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); - } - -}; - -template -struct InnerReducer { - - typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; - - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { - typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - auto functors = TensorSycl::internal::extractFunctors(self.impl()); - - size_t tileSize =dev.m_queue.get_device(). 
template get_info()/2; - - size_t GRange=num_coeffs_to_preserve; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - size_t xMode = GRange % tileSize; - if (xMode != 0) GRange += (tileSize - xMode); - } - // getting final out buffer at the moment the created buffer is true because there is no need for assign - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - typedef typename Eigen::internal::remove_all::type Dims; - Dims dims= self.xprDims(); - Op functor = reducer; - - dev.m_queue.submit([&](cl::sycl::handler &cgh) { - // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor(num_coeffs_to_preserve,cgh, output); - - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= TensorReductionOp(device_expr.expr, dims, functor); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. - typedef Eigen::TensorEvaluator DeiceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=itemID.get_global_linear_id(); - if (globalid< static_cast(num_coeffs_to_preserve)) { - typename DeiceSelf::CoeffReturnType accum = functor.initialize(); - GenericDimReducer::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast(functor), &accum); - functor.finalize(accum); - output_accessor.get_pointer()[globalid]= accum; - } - }); - }); - dev.m_queue.throw_asynchronous(); - return false; - } -}; - -} // end namespace internal -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h deleted file mode 100644 index 14e392e365e..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ /dev/null @@ -1,288 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Navdeep Jaitly -// Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
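[Editor's aside before TensorReverse.h: the two-stage reduction that syclGenericBufferReducer drives above is easier to follow in serial form. The sketch below is a host-side emulation under stated assumptions, not Eigen code: tree_reduce, group_count and scratch are invented names, `local` is assumed to be a power of two (which the launcher's rounding arranges), and barrier synchronisation between work-items is replaced by plain loops.]

#include <algorithm>
#include <cstddef>
#include <vector>

// Each round partitions the data into groups of `local` elements, tree-reduces
// every group in scratch memory, and keeps one partial sum per group; rounds
// repeat until a single value remains, mirroring the do/while in the kernel.
double tree_reduce(std::vector<double> data, std::size_t local) {
  std::size_t length = data.size();
  while (length > 1) {
    const std::size_t group_count = (length + local - 1) / local;
    for (std::size_t g = 0; g < group_count; ++g) {
      // Stage one work-group's slice into "local" scratch memory, zero-padded
      // so that repeated halving is well defined.
      const std::size_t begin = g * local;
      const std::size_t end = std::min(begin + local, length);
      std::vector<double> scratch(data.begin() + begin, data.begin() + end);
      scratch.resize(local, 0.0);
      // Halve the active range each step, as the barrier-synchronised
      // work-items do on the device.
      for (std::size_t offset = local / 2; offset > 0; offset /= 2) {
        for (std::size_t i = 0; i < offset; ++i) {
          scratch[i] += scratch[i + offset];
        }
      }
      data[g] = scratch[0];  // one partial result per work-group
    }
    length = group_count;
  }
  return data.empty() ? 0.0 : data[0];
}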
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H -#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H -namespace Eigen { - -/** \class TensorReverse - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reverse elements class. - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorReverseOp& type; -}; - -template -struct nested, 1, - typename eval >::type> -{ - typedef TensorReverseOp type; -}; - -} // end namespace internal - -template -class TensorReverseOp : public TensorBase, WriteAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind - StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp( - const XprType& expr, const ReverseDimensions& reverse_dims) - : m_xpr(expr), m_reverse_dims(reverse_dims) { } - - EIGEN_DEVICE_FUNC - const ReverseDimensions& reverse() const { return m_reverse_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; - const ReverseDimensions m_reverse_dims; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorReverseOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) - : m_impl(op.expression(), device), m_reverse(op.reverse()) - { - // Reversing a scalar isn't supported yet. It would be a no-op anyway. 
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Compute strides - m_dimensions = m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; - } - } else { - m_strides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex( - Index index) const { - eigen_assert(index < dimensions().TotalSize()); - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - Index idx = index / m_strides[i]; - index -= idx * m_strides[i]; - if (m_reverse[i]) { - idx = m_dimensions[i] - idx - 1; - } - inputIndex += idx * m_strides[i] ; - } - if (m_reverse[0]) { - inputIndex += (m_dimensions[0] - index - 1); - } else { - inputIndex += index; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - Index idx = index / m_strides[i]; - index -= idx * m_strides[i]; - if (m_reverse[i]) { - idx = m_dimensions[i] - idx - 1; - } - inputIndex += idx * m_strides[i] ; - } - if (m_reverse[NumDims-1]) { - inputIndex += (m_dimensions[NumDims-1] - index - 1); - } else { - inputIndex += index; - } - } - return inputIndex; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( - Index index) const { - return m_impl.coeff(reverseIndex(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - // TODO(ndjaitly): write a better packing routine that uses - // local structure. 
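// [Editorial note: a reversed dimension is walked backwards, so consecutive
// output coefficients generally do not map to consecutive input addresses.
// The loop below therefore gathers PacketSize coefficients one scalar
// coeff() call at a time and then assembles the packet with a single
// aligned load from the local buffer.]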
- EIGEN_ALIGN_MAX typename internal::remove_const::type - values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - for (int i = 0; i < NumDims; ++i) { - if (m_reverse[i]) { - compute_cost += 2 * TensorOpCost::AddCost(); - } - } - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - Dimensions m_dimensions; - array m_strides; - TensorEvaluator m_impl; - ReverseDimensions m_reverse; -}; - -// Eval as lvalue - -template -struct TensorEvaluator, Device> - : public TensorEvaluator, - Device> { - typedef TensorEvaluator, - Device> Base; - typedef TensorReverseOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::value; - typedef DSizes Dimensions; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) - : Base(op, device) {} - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dimensions& dimensions() const { return this->m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return this->m_impl.coeffRef(this->reverseIndex(index)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - // This code is pilfered from TensorMorphing.h - EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; - internal::pstore(values, x); - for (int i = 0; i < PacketSize; ++i) { - this->coeffRef(index+i) = values[i]; - } - } - -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h deleted file mode 100644 index 8501466ce60..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ /dev/null @@ -1,287 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Igor Babuschkin -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
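[Editor's aside before TensorScan.h: the index arithmetic in reverseIndex above is the heart of TensorReverse, so here is a standalone rendering of its column-major branch, with the rank fixed at 3 for brevity. reverse_index and kDims are illustrative names, not Eigen API.]

#include <array>
#include <cstddef>

constexpr int kDims = 3;

// Peel one coordinate off the linear index per dimension (outermost first),
// flip the coordinate when that dimension is reversed, and re-accumulate it
// against the same strides; dimension 0 has stride 1 in column-major order.
std::size_t reverse_index(std::size_t index,
                          const std::array<std::size_t, kDims>& dims,
                          const std::array<std::size_t, kDims>& strides,
                          const std::array<bool, kDims>& reverse) {
  std::size_t input_index = 0;
  for (int i = kDims - 1; i > 0; --i) {
    std::size_t idx = index / strides[i];
    index -= idx * strides[i];
    if (reverse[i]) idx = dims[i] - idx - 1;  // flip along a reversed axis
    input_index += idx * strides[i];
  }
  // What is left of `index` is the coordinate along dimension 0.
  return input_index + (reverse[0] ? dims[0] - index - 1 : index);
}

// E.g. dims {2, 3, 4} give column-major strides {1, 2, 6}; reversing only
// dimension 1 maps output index 0, i.e. coords (0, 0, 0), to input coords
// (0, 2, 0), i.e. input index 4.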
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H -#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H - -namespace Eigen { - -namespace internal { - -template -struct traits > - : public traits { - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorScanOp& type; -}; - -template -struct nested, 1, - typename eval >::type> -{ - typedef TensorScanOp type; -}; -} // end namespace internal - -/** \class TensorScan - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor scan class. - */ -template -class TensorScanOp - : public TensorBase, ReadOnlyAccessors> { -public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp( - const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op()) - : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Index axis() const { return m_axis; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const XprType& expression() const { return m_expr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Op accumulator() const { return m_accumulator; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - bool exclusive() const { return m_exclusive; } - -protected: - typename XprType::Nested m_expr; - const Index m_axis; - const Op m_accumulator; - const bool m_exclusive; -}; - -template -struct ScanLauncher; - -// Eval as rvalue -template -struct TensorEvaluator, Device> { - - typedef TensorScanOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef TensorEvaluator, Device> Self; - - enum { - IsAligned = false, - PacketAccess = (internal::unpacket_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) - : m_impl(op.expression(), device), - m_device(device), - m_exclusive(op.exclusive()), - m_accumulator(op.accumulator()), - m_size(m_impl.dimensions()[op.axis()]), - m_stride(1), - m_output(NULL) { - - // Accumulating a scalar isn't supported. 
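// [Editorial note: m_stride, computed below, is the distance in the flat
// buffer between consecutive entries along the scan axis. For a column-major
// tensor with dimensions (4, 5, 6) and axis == 1, the per-axis strides are
// (1, 4, 20), so m_stride == 4 and m_size == 5.]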
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(op.axis() >= 0 && op.axis() < NumDims); - - // Compute stride of scan axis - const Dimensions& dims = m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < op.axis(); ++i) { - m_stride = m_stride * dims[i]; - } - } else { - for (int i = NumDims - 1; i > op.axis(); --i) { - m_stride = m_stride * dims[i]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { - return m_stride; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { - return m_size; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { - return m_accumulator; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { - return m_exclusive; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& inner() const { - return m_impl; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { - return m_device; - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - m_impl.evalSubExprsIfNeeded(NULL); - ScanLauncher launcher; - if (data) { - launcher(*this, data); - return false; - } - - const Index total_size = internal::array_prod(dimensions()); - m_output = static_cast(m_device.allocate(total_size * sizeof(Scalar))); - launcher(*this, m_output); - return true; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt(m_output + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const - { - return m_output; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_output[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_output != NULL) { - m_device.deallocate(m_output); - m_output = NULL; - } - m_impl.cleanup(); - } - -protected: - TensorEvaluator m_impl; - const Device& m_device; - const bool m_exclusive; - Op m_accumulator; - const Index m_size; - Index m_stride; - CoeffReturnType* m_output; -}; - -// CPU implementation of scan -// TODO(ibab) This single-threaded implementation should be parallelized, -// at least by running multiple scans at the same time. -template -struct ScanLauncher { - void operator()(Self& self, typename Self::CoeffReturnType *data) { - Index total_size = internal::array_prod(self.dimensions()); - - // We fix the index along the scan axis to 0 and perform a - // scan per remaining entry. The iteration is split into two nested - // loops to avoid an integer division by keeping track of each idx1 and idx2. 
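// [Editorial note: the exclusive/inclusive branch inside the loops below only
// changes whether an element is written before or after being folded into the
// accumulator. For a running sum over (1, 2, 3), the inclusive scan yields
// (1, 3, 6) while the exclusive scan yields (0, 1, 3): each exclusive entry
// covers strictly earlier elements.]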
- for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { - for (Index idx2 = 0; idx2 < self.stride(); idx2++) { - // Calculate the starting offset for the scan - Index offset = idx1 + idx2; - - // Compute the scan along the axis, starting at the calculated offset - typename Self::CoeffReturnType accum = self.accumulator().initialize(); - for (Index idx3 = 0; idx3 < self.size(); idx3++) { - Index curr = offset + idx3 * self.stride(); - - if (self.exclusive()) { - data[curr] = self.accumulator().finalize(accum); - self.accumulator().reduce(self.inner().coeff(curr), &accum); - } else { - self.accumulator().reduce(self.inner().coeff(curr), &accum); - data[curr] = self.accumulator().finalize(accum); - } - } - } - } - } -}; - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - -// GPU implementation of scan -// TODO(ibab) This placeholder implementation performs multiple scans in -// parallel, but it would be better to use a parallel scan algorithm and -// optimize memory access. -template -__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { - // Compute offset as in the CPU version - Index val = threadIdx.x + blockIdx.x * blockDim.x; - Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); - - if (offset + (self.size() - 1) * self.stride() < total_size) { - // Compute the scan along the axis, starting at the calculated offset - typename Self::CoeffReturnType accum = self.accumulator().initialize(); - for (Index idx = 0; idx < self.size(); idx++) { - Index curr = offset + idx * self.stride(); - if (self.exclusive()) { - data[curr] = self.accumulator().finalize(accum); - self.accumulator().reduce(self.inner().coeff(curr), &accum); - } else { - self.accumulator().reduce(self.inner().coeff(curr), &accum); - data[curr] = self.accumulator().finalize(accum); - } - } - } - __syncthreads(); - -} - -template -struct ScanLauncher { - void operator()(const Self& self, typename Self::CoeffReturnType* data) { - Index total_size = internal::array_prod(self.dimensions()); - Index num_blocks = (total_size / self.size() + 63) / 64; - Index block_size = 64; - LAUNCH_CUDA_KERNEL((ScanKernel), num_blocks, block_size, 0, self.device(), self, total_size, data); - } -}; -#endif // EIGEN_USE_GPU && __CUDACC__ - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h deleted file mode 100644 index 113c060e3f1..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ /dev/null @@ -1,264 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H -#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H - -namespace Eigen { - -/** \class TensorShuffling - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor shuffling class. 
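 *
 * [Editorial example: the shuffle argument is a permutation of the dimension
 * indices, and output dimension i takes its extent from input dimension
 * shuffle[i]. Shuffling a tensor of dimensions (30, 20, 10) with the
 * permutation {2, 0, 1} therefore yields dimensions (10, 30, 20).]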
- * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorShufflingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorShufflingOp type; -}; - -} // end namespace internal - - - -template -class TensorShufflingOp : public TensorBase > -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle) - : m_xpr(expr), m_shuffle(shuffle) {} - - EIGEN_DEVICE_FUNC - const Shuffle& shufflePermutation() const { return m_shuffle; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; - const Shuffle m_shuffle; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorShufflingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - const Shuffle& shuffle = op.shufflePermutation(); - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] = input_dims[shuffle[i]]; - } - - array inputStrides; - - if (static_cast(Layout) == static_cast(ColMajor)) { - inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - } - } else { - inputStrides[NumDims - 1] = 1; - m_outputStrides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; - 
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - } - } - - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[shuffle[i]]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - return inputIndex + index * m_inputStrides[0]; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - return inputIndex + index * m_inputStrides[NumDims - 1]; - } - } - - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; -}; - - -// Eval as lvalue -template -struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - - typedef TensorShufflingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - internal::pstore(values, x); - for (int i = 0; i < PacketSize; ++i) { - this->coeffRef(index+i) = values[i]; - } - } -}; - - -} // end 
namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h deleted file mode 100644 index bb8800d458c..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ /dev/null @@ -1,82 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: eigen@codeplay.com -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// General include header of SYCL target for Tensor Module -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H - -#ifdef EIGEN_USE_SYCL - -// global pointer to set different attribute state for a class -template -struct MakeGlobalPointer { - typedef typename cl::sycl::global_ptr::pointer_t Type; -}; - -// global pointer to set different attribute state for a class -template -struct MakeLocalPointer { - typedef typename cl::sycl::local_ptr::pointer_t Type; -}; - - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -/// This struct is used for special expression nodes with no operations (for example assign and selectOP). - struct NoOP; - -template struct GetType{ - typedef const T Type; -}; -template struct GetType{ - typedef T Type; -}; - -} -} -} - -// tuple construction -#include "TensorSyclTuple.h" - -// counting number of leaf at compile time -#include "TensorSyclLeafCount.h" - -// The index PlaceHolder takes the actual expression and replaces the actual -// data on it with the place holder. It uses the same pre-order expression tree -// traverse as the leaf count in order to give the right access number to each -// node in the expression -#include "TensorSyclPlaceHolderExpr.h" - -// creation of an accessor tuple from a tuple of SYCL buffers -#include "TensorSyclExtractAccessor.h" - -// this is used to change the address space type in tensor map for GPU -#include "TensorSyclConvertToDeviceExpression.h" - -// this is used to extract the functors -#include "TensorSyclExtractFunctors.h" - -// this is used to create tensormap on the device -// this is used to construct the expression on the device -#include "TensorSyclExprConstructor.h" - -/// this is used for extracting tensor reduction -#include "TensorReductionSycl.h" - -// kernel execution using fusion -#include "TensorSyclRun.h" - -#endif // end of EIGEN_USE_SYCL -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h deleted file mode 100644 index 8729c86ee8a..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ /dev/null @@ -1,121 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclConvertToDeviceExpression.h - * - * \brief: - * Conversion from host pointer to device pointer - * inside leaf nodes of the expression. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -/// \struct ConvertToDeviceExpression -/// \brief This struct is used to convert the MakePointer in the host expression -/// to the MakeGlobalPointer for the device expression. For the leafNodes -/// containing the pointer. This is due to the fact that the address space of -/// the pointer T* is different on the host and the device. -template -struct ConvertToDeviceExpression; - -template class NonOpCategory, bool IsConst, typename... Args> -struct NonOpConversion{ - typedef typename GetType::Type...> >::Type Type; -}; - - -template class > class NonOpCategory, bool IsConst, typename Args> -struct DeviceConvertor{ - typedef typename GetType::Type, MakeGlobalPointer> >::Type Type; -}; - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorMap -#define TENSORMAPCONVERT(CVQual)\ -template class MakePointer_>\ -struct ConvertToDeviceExpression, Options2_, MakePointer_> > {\ - typedef CVQual TensorMap, Options2_, MakeGlobalPointer> Type;\ -}; - -TENSORMAPCONVERT(const) -TENSORMAPCONVERT() -#undef TENSORMAPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp -#define CATEGORYCONVERT(CVQual)\ -template class Category, typename OP, typename... subExprs>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual Category::Type... 
> Type;\ -}; -CATEGORYCONVERT(const) -CATEGORYCONVERT() -#undef CATEGORYCONVERT - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseSelectOp -#define SELECTOPCONVERT(CVQual, Res)\ -template \ -struct ConvertToDeviceExpression >\ -: NonOpConversion {}; -SELECTOPCONVERT(const, true) -SELECTOPCONVERT(, false) -#undef SELECTOPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is const AssingOP -#define ASSIGNCONVERT(CVQual, Res)\ -template \ -struct ConvertToDeviceExpression >\ -: NonOpConversion{}; - -ASSIGNCONVERT(const, true) -ASSIGNCONVERT(, false) -#undef ASSIGNCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is either TensorForcedEvalOp or TensorEvalToOp -#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ -template \ -struct ConvertToDeviceExpression > \ -: DeviceConvertor{}; - -KERNELBROKERCONVERT(const, true, TensorForcedEvalOp) -KERNELBROKERCONVERT(, false, TensorForcedEvalOp) -KERNELBROKERCONVERT(const, true, TensorEvalToOp) -KERNELBROKERCONVERT(, false, TensorEvalToOp) -#undef KERNELBROKERCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp -#define KERNELBROKERCONVERTREDUCTION(CVQual)\ -template class MakePointer_>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorReductionOp::Type, MakeGlobalPointer> Type;\ -}; - -KERNELBROKERCONVERTREDUCTION(const) -KERNELBROKERCONVERTREDUCTION() -#undef KERNELBROKERCONVERTREDUCTION - -} // namespace internal -} // namespace TensorSycl -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX1 diff --git a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h deleted file mode 100644 index 983f6318085..00000000000 --- a/lib/eigen_3.3.9/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ /dev/null @@ -1,239 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclExprConstructor.h - * - * \brief: - * This file re-create an expression on the SYCL device in order - * to use the original tensor evaluator. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { -/// this class is used by EvalToOp in order to create an lhs expression which is -/// a pointer from an accessor on device-only buffer -template -struct EvalToLHSConstructor { - PtrType expr; - EvalToLHSConstructor(const utility::tuple::Tuple &t): expr((&(*(utility::tuple::get(t).get_pointer())))) {} -}; - -/// struct ExprConstructor is used to reconstruct the expression on the device and -/// recreate the expression with MakeGlobalPointer containing the device address -/// space for the TensorMap pointers used in eval function. 
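/// [Editorial note: the rebuild is necessary because host-side TensorMap
/// nodes hold raw host pointers, which are meaningless inside a SYCL kernel;
/// the reconstructed tree replaces each of them with a pointer obtained from
/// the corresponding device accessor, as the TENSORMAP specialisation below
/// does via get_pointer().]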
-/// It receives the original expression type, the functor of the node, the tuple -/// of accessors, and the device expression type to re-instantiate the -/// expression tree for the device -template -struct ExprConstructor; - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAP(CVQual)\ -template class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap, Options2_, MakeGlobalPointer>,\ -CVQual PlaceHolder, Options3_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap, Options2_, MakeGlobalPointer> Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ - : expr(Type((&(*(utility::tuple::get(t).get_pointer()))), fd.dimensions())) {}\ -}; - -TENSORMAP(const) -TENSORMAP() -#undef TENSORMAP - -#define UNARYCATEGORY(CVQual)\ -template class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\ -struct ExprConstructor, CVQual UnaryCategory, Params...> {\ - typedef ExprConstructor my_type;\ - my_type rhsExpr;\ - typedef CVQual UnaryCategory Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\ -}; - -UNARYCATEGORY(const) -UNARYCATEGORY() -#undef UNARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorBinaryOp -#define BINARYCATEGORY(CVQual)\ -template class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\ -typename RHSExpr, typename... Params>\ -struct ExprConstructor, CVQual BinaryCategory, Params...> {\ - typedef ExprConstructor my_left_type;\ - typedef ExprConstructor my_right_type;\ - typedef CVQual BinaryCategory Type;\ - my_left_type lhsExpr;\ - my_right_type rhsExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\ -}; - -BINARYCATEGORY(const) -BINARYCATEGORY() -#undef BINARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorCwiseTernaryOp -#define TERNARYCATEGORY(CVQual)\ -template