Skip to content

Commit 3af6a34

Browse files
committed
AVX / AVX2 optimizations
1 parent ef45bb7 commit 3af6a34

File tree

4 files changed

+592
-0
lines changed

4 files changed

+592
-0
lines changed

Inc/DirectXMath.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,30 @@
3838
#define XM_CTOR_DEFAULT =default;
3939
#endif
4040

41+
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
42+
#define _XM_F16C_INTRINSICS_
43+
#endif
44+
45+
#ifdef _XM_F16C_INTRINSICS_
46+
#if defined(_MSC_VER) && (_MSC_VER < 1700)
47+
#error DirectX Math use of F16C intrinsics requires Visual C++ 2012 or later.
48+
#endif
49+
#ifndef _XM_AVX_INTRINSICS_
50+
#define _XM_AVX_INTRINSICS_
51+
#endif
52+
#endif // _XM_F16C_INTRINSICS_
53+
54+
#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
55+
#define _XM_AVX_INTRINSICS_
56+
#endif
57+
58+
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
59+
#define _XM_SSE4_INTRINSICS_
60+
#endif
4161

62+
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
63+
#define _XM_SSE_INTRINSICS_
64+
#endif
4265

4366
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
4467
#if defined(_M_IX86) || defined(_M_X64)
@@ -77,7 +100,17 @@
77100
#endif
78101
#endif
79102

103+
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
104+
#pragma warning(push)
105+
#pragma warning(disable : 4987)
106+
#include <intrin.h>
107+
#pragma warning(pop)
108+
#include <smmintrin.h>
109+
#endif
80110

111+
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
112+
#include <immintrin.h>
113+
#endif
81114

82115
#include <sal.h>
83116
#include <assert.h>
@@ -129,7 +162,11 @@
129162
#define XM_SFENCE() _mm_sfence()
130163
#endif
131164

165+
#if defined(_XM_AVX_INTRINSICS_)
166+
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
167+
#else
132168
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
169+
#endif
133170

134171
#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
135172

@@ -1506,6 +1543,22 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
15061543
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
15071544
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
15081545

1546+
// SSE4.1 specializations of XMVectorPermute: when each output lane keeps its own
// position and only the source vector varies (indices 0-3 select from V1, 4-7
// select the same lane from V2), the permute reduces to a single _mm_blend_ps.
// The immediate mask has bit i set when lane i is taken from V2; the table below
// enumerates masks 0x1 through 0xE. Masks 0x0 (<0,1,2,3>) and 0xF (<4,5,6,7>)
// are the pass-through specializations defined earlier, which return V1 / V2.
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1547+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
1548+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
1549+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
1550+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
1551+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
1552+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
1553+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
1554+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
1555+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
1556+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
1557+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
1558+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
1559+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
1560+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
1561+
#endif
15091562

15101563
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
15111564

@@ -1570,6 +1623,10 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
15701623
// Specialized swizzles
15711624
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
15721625

1626+
// SSE3 specializations of XMVectorSwizzle (guarded by the SSE4 define, which
// implies SSE3 support is also verified at runtime):
//   <0,0,2,2> duplicates the even lanes  -> single MOVSLDUP instruction
//   <1,1,3,3> duplicates the odd lanes   -> single MOVSHDUP instruction
// Both replace the generic shuffle-based path with one dedicated instruction.
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1627+
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
1628+
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
1629+
#endif
15731630

15741631
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
15751632

Inc/DirectXMathMisc.inl

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,10 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst
10921092
XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
10931093
return XMVectorMultiply(P, Result);
10941094

1095+
#elif defined(_XM_SSE4_INTRINSICS_)
1096+
XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f );
1097+
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
1098+
return _mm_mul_ps(vResult, P);
10951099
#elif defined(_XM_SSE_INTRINSICS_)
10961100
// Perform the dot product
10971101
XMVECTOR vDot = _mm_mul_ps(P,P);
@@ -1138,6 +1142,18 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalize
11381142
#elif defined(_XM_ARM_NEON_INTRINSICS_)
11391143
XMVECTOR vLength = XMVector3ReciprocalLength(P);
11401144
return XMVectorMultiply( P, vLength );
1145+
#elif defined(_XM_SSE4_INTRINSICS_)
1146+
XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f );
1147+
// Prepare for the division
1148+
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
1149+
// Failsafe on zero (Or epsilon) length planes
1150+
// If the length is infinity, set the elements to zero
1151+
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
1152+
// Reciprocal mul to perform the normalization
1153+
vResult = _mm_div_ps(P,vResult);
1154+
// Any that are infinity, set to zero
1155+
vResult = _mm_and_ps(vResult,vLengthSq);
1156+
return vResult;
11411157
#elif defined(_XM_SSE_INTRINSICS_)
11421158
// Perform the dot product on x,y and z only
11431159
XMVECTOR vLengthSq = _mm_mul_ps(P,P);
@@ -1967,6 +1983,35 @@ inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb )
19671983
inline bool XMVerifyCPUSupport()
19681984
{
19691985
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1986+
#if defined(_XM_F16C_INTRINSICS_) || defined(_XM_AVX_INTRINSICS_)
1987+
int avxCPUInfo[4] = {-1};
1988+
__cpuid( avxCPUInfo, 0 );
1989+
1990+
if ( avxCPUInfo[0] < 1 )
1991+
return false;
1992+
1993+
__cpuid(avxCPUInfo, 1 );
1994+
1995+
#ifdef _XM_F16C_INTRINSICS_
1996+
if ( (avxCPUInfo[2] & 0x38000000 ) != 0x38000000 )
1997+
return false; // No F16C/AVX/OSXSAVE support
1998+
#else
1999+
if ( (avxCPUInfo[2] & 0x18000000 ) != 0x18000000 )
2000+
return false; // No AVX/OSXSAVE support
2001+
#endif
2002+
#endif
2003+
#ifdef _XM_SSE4_INTRINSICS_
2004+
int CPUInfo[4] = {-1};
2005+
__cpuid( CPUInfo, 0 );
2006+
2007+
if ( CPUInfo[0] < 1 )
2008+
return false;
2009+
2010+
__cpuid(CPUInfo, 1 );
2011+
2012+
if ( (CPUInfo[2] & 0x80001) != 0x80001 )
2013+
return false; // Missing SSE3 or SSE 4.1 support
2014+
#endif
19702015
#if defined(_M_X64)
19712016
// The X64 processor model requires SSE2 support
19722017
return true;

0 commit comments

Comments
 (0)