Skip to content

Commit 3af6a34

Browse files
committed
AVX / AVX2 optimizations
1 parent ef45bb7 commit 3af6a34

File tree

4 files changed

+592
-0
lines changed

4 files changed

+592
-0
lines changed

Inc/DirectXMath.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,30 @@
3838
#define XM_CTOR_DEFAULT =default;
3939
#endif
4040

41+
#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
42+
#define _XM_F16C_INTRINSICS_
43+
#endif
44+
45+
#ifdef _XM_F16C_INTRINSICS_
46+
#if defined(_MSC_VER) && (_MSC_VER < 1700)
47+
#error DirectX Math use of F16C intrinsics requires Visual C++ 2012 or later.
48+
#endif
49+
#ifndef _XM_AVX_INTRINSICS_
50+
#define _XM_AVX_INTRINSICS_
51+
#endif
52+
#endif // _XM_F16C_INTRINSICS_
53+
54+
#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
55+
#define _XM_AVX_INTRINSICS_
56+
#endif
57+
58+
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
59+
#define _XM_SSE4_INTRINSICS_
60+
#endif
4161

62+
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
63+
#define _XM_SSE_INTRINSICS_
64+
#endif
4265

4366
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
4467
#if defined(_M_IX86) || defined(_M_X64)
@@ -77,7 +100,17 @@
77100
#endif
78101
#endif
79102

103+
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
104+
#pragma warning(push)
105+
#pragma warning(disable : 4987)
106+
#include <intrin.h>
107+
#pragma warning(pop)
108+
#include <smmintrin.h>
109+
#endif
80110

111+
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
112+
#include <immintrin.h>
113+
#endif
81114

82115
#include <sal.h>
83116
#include <assert.h>
@@ -129,7 +162,11 @@
129162
#define XM_SFENCE() _mm_sfence()
130163
#endif
131164

165+
#if defined(_XM_AVX_INTRINSICS_)
166+
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
167+
#else
132168
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
169+
#endif
133170

134171
#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
135172

@@ -1506,6 +1543,22 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
15061543
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
15071544
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
15081545

1546+
// SSE4.1 specializations of XMVectorPermute: when each output lane keeps its own
// position and only the source vector varies (indices 0-3 select from V1, 4-7
// select the same lane from V2), the permute reduces to a single _mm_blend_ps.
// The immediate mask has bit i set when lane i is taken from V2; the table below
// enumerates masks 0x1 through 0xE. Masks 0x0 (<0,1,2,3>) and 0xF (<4,5,6,7>)
// are the pass-through specializations defined earlier, which return V1 / V2.
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1547+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
1548+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
1549+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
1550+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
1551+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
1552+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
1553+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
1554+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
1555+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
1556+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
1557+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
1558+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
1559+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
1560+
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
1561+
#endif
15091562

15101563
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
15111564

@@ -1570,6 +1623,10 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
15701623
// Specialized swizzles
15711624
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
15721625

1626+
// SSE3 specializations of XMVectorSwizzle (guarded by the SSE4 define, which
// implies SSE3 support is also verified at runtime):
//   <0,0,2,2> duplicates the even lanes  -> single MOVSLDUP instruction
//   <1,1,3,3> duplicates the odd lanes   -> single MOVSHDUP instruction
// Both replace the generic shuffle-based path with one dedicated instruction.
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1627+
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
1628+
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
1629+
#endif
15731630

15741631
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
15751632

Inc/DirectXMathMisc.inl

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,10 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst
10921092
XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
10931093
return XMVectorMultiply(P, Result);
10941094

1095+
#elif defined(_XM_SSE4_INTRINSICS_)
1096+
XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f );
1097+
XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
1098+
return _mm_mul_ps(vResult, P);
10951099
#elif defined(_XM_SSE_INTRINSICS_)
10961100
// Perform the dot product
10971101
XMVECTOR vDot = _mm_mul_ps(P,P);
@@ -1138,6 +1142,18 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalize
11381142
#elif defined(_XM_ARM_NEON_INTRINSICS_)
11391143
XMVECTOR vLength = XMVector3ReciprocalLength(P);
11401144
return XMVectorMultiply( P, vLength );
1145+
#elif defined(_XM_SSE4_INTRINSICS_)
1146+
XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f );
1147+
// Prepare for the division
1148+
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
1149+
// Failsafe on zero (Or epsilon) length planes
1150+
// If the length is infinity, set the elements to zero
1151+
vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
1152+
// Reciprocal mul to perform the normalization
1153+
vResult = _mm_div_ps(P,vResult);
1154+
// Any that are infinity, set to zero
1155+
vResult = _mm_and_ps(vResult,vLengthSq);
1156+
return vResult;
11411157
#elif defined(_XM_SSE_INTRINSICS_)
11421158
// Perform the dot product on x,y and z only
11431159
XMVECTOR vLengthSq = _mm_mul_ps(P,P);
@@ -1967,6 +1983,35 @@ inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb )
19671983
inline bool XMVerifyCPUSupport()
19681984
{
19691985
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1986+
#if defined(_XM_F16C_INTRINSICS_) || defined(_XM_AVX_INTRINSICS_)
1987+
int avxCPUInfo[4] = {-1};
1988+
__cpuid( avxCPUInfo, 0 );
1989+
1990+
if ( avxCPUInfo[0] < 1 )
1991+
return false;
1992+
1993+
__cpuid(avxCPUInfo, 1 );
1994+
1995+
#ifdef _XM_F16C_INTRINSICS_
1996+
if ( (avxCPUInfo[2] & 0x38000000 ) != 0x38000000 )
1997+
return false; // No F16C/AVX/OSXSAVE support
1998+
#else
1999+
if ( (avxCPUInfo[2] & 0x18000000 ) != 0x18000000 )
2000+
return false; // No AVX/OSXSAVE support
2001+
#endif
2002+
#endif
2003+
#ifdef _XM_SSE4_INTRINSICS_
2004+
int CPUInfo[4] = {-1};
2005+
__cpuid( CPUInfo, 0 );
2006+
2007+
if ( CPUInfo[0] < 1 )
2008+
return false;
2009+
2010+
__cpuid(CPUInfo, 1 );
2011+
2012+
if ( (CPUInfo[2] & 0x80001) != 0x80001 )
2013+
return false; // Missing SSE3 or SSE 4.1 support
2014+
#endif
19702015
#if defined(_M_X64)
19712016
// The X64 processor model requires SSE2 support
19722017
return true;

0 commit comments

Comments
 (0)